/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2014  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */

/*   new routines: more unaligned loads, less shuffle
 *   old routines: less unaligned loads, more shuffle
 *   these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */
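
/* All of the routines below compute the (unnormalized) autocorrelation
 *     autoc[k] = sum over i of data[i] * data[i+k],   k = 0..lag-1,
 * with each __m128 accumulator holding four consecutive lags.  A minimal
 * scalar sketch of the same computation follows; the helper name is
 * illustrative only (it is not a libFLAC function) and is kept out of the
 * build.
 */
#if 0
static void lpc_autocorrelation_scalar_sketch(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	unsigned i, k;
	for(k = 0; k < lag; k++) {
		FLAC__real sum = 0.0f;
		/* products that would index past the end of data[] simply do not occur */
		for(i = 0; i + k < data_len; i++)
			sum += data[i] * data[i+k];
		autoc[k] = sum;
	}
}
#endif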

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

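	/* main loop: while a full 4-float load at data+i stays in bounds,
	 * broadcast data[i] and accumulate data[i]*data[i+k] into sum0[k], k = 0..3 */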
	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

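	/* tail: the last few samples have no full 4-float window after them, so keep
	 * a zero-padded sliding window of data[i..i+3] in d0 and accumulate the
	 * remaining data[i]*data[i+k] products */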
	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 8;
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

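		/* d0 and d1 together form a zero-padded 8-float window over data[i..i+7];
		 * the rotate/move_ss pairs in each iteration shift the window down by one
		 * sample, carrying the value pushed out of d0 into d1 */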
		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}

/* old routines: faster on older Intel CPUs (up to Core 2) */

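/* The *_old routines load one sample per iteration and keep the whole lag-sized
 * window in registers, rotating it by one float each step; this is the
 * "less unaligned loads, more shuffle" variant noted at the top of the file. */
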
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

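	/* xmm2 is a sliding window of the last 4 samples (newest in element 0);
	 * each iteration shifts it by one float, inserts the new sample and
	 * accumulates xmm5 += new_sample * xmm2 */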
	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12,xmm9);
}

#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */