Add FLAC__SSE_SUPPORTED and FLAC__SSE2_SUPPORTED flags.
src/libFLAC/lpc_intrin_sse2.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED
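/* FLAC__SSE2_SUPPORTED is one of the flags this commit introduces; it is
   expected to be defined (presumably by the build headers) only when the
   compiler can consume SSE2 intrinsics, so this whole translation unit
   compiles to nothing otherwise. */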

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

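/* Both functions below compute residual[i] = data[i] - (sum >> lp_quantization),
 * where sum = qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order]
 * (see the scalar fallback at the bottom of the first function).  This "_16"
 * variant packs coefficients and history samples into int16 lanes with the
 * saturating _mm_packs_epi32, so it is only valid when both fit in 16 bits;
 * eight products at a time then come from one _mm_madd_epi16 (PMADDWD). */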
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(data_len > 0);

	if(order <= 12) {
		FLAC__int32 curr;
		if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
			/* can be modified to work with order <= 15 but the subset limit is 12 */
			int r;
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
			xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
			xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
			switch(order)                                          /* ...and zero them out */
			{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
			}
			xmm2 = _mm_setzero_si128();
			xmm0 = _mm_packs_epi32(xmm0, xmm6);
			xmm1 = _mm_packs_epi32(xmm1, xmm2);

			xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
			xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
			xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
			xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
			xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
			xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
			xmm4 = _mm_packs_epi32(xmm4, xmm2);
			xmm3 = _mm_packs_epi32(xmm3, xmm5);

			xmm7 = _mm_slli_si128(xmm1, 2);
			xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
			xmm2 = _mm_slli_si128(xmm0, 2);

			/* xmm0, xmm1: qlp_coeff
			   xmm2, xmm7: qlp_coeff << 16 bit
			   xmm3, xmm4: data */

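			/* The two madds produce eight 32-bit pair-sums in two registers;
			   one vector add and two shift+add folds then reduce them to a
			   single prediction sum in the low dword. */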
			xmm6 = xmm4;
			xmm6 = _mm_madd_epi16(xmm6, xmm1);
			xmm5 = xmm3;
			xmm5 = _mm_madd_epi16(xmm5, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			curr = *data++;
			*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

			data_len--;
			r = data_len % 2;

			if(r) {
				xmm4 = _mm_slli_si128(xmm4, 2);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 2);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				data_len--;
			}

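			/* Steady state: two residuals per iteration.  The history is
			   shifted once by two int16 lanes; the older new sample goes to
			   lane 1 and is matched against the one-lane-shifted coefficient
			   copy (xmm2:xmm7), the newer one to lane 0 with the unshifted
			   set. */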
			while(data_len) { /* data_len is a multiple of 2 */
				/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
				xmm4 = _mm_slli_si128(xmm4, 4);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 4);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
				xmm3 = _mm_insert_epi16(xmm3, curr, 1);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm7);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm2);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				data_len-=2;
			}
#else /* 16 XMM registers available */
			int r;
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
			xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
			xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
			xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
			switch(order)                                          /* ...and zero them out */
			{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
			}
			xmm2 = _mm_setzero_si128();
			xmm0 = _mm_packs_epi32(xmm0, xmm6);
			xmm1 = _mm_packs_epi32(xmm1, xmm2);

			xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
			xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
			xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
			xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
			xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
			xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
			xmm4 = _mm_packs_epi32(xmm4, xmm2);
			xmm3 = _mm_packs_epi32(xmm3, xmm5);

			xmm7 = _mm_slli_si128(xmm1, 2);
			xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
			xmm2 = _mm_slli_si128(xmm0, 2);

			xmm9 = _mm_slli_si128(xmm1, 4);
			xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
			xmm8 = _mm_slli_si128(xmm0, 4);

			xmmB = _mm_slli_si128(xmm1, 6);
			xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
			xmmA = _mm_slli_si128(xmm0, 6);

			/* xmm0, xmm1: qlp_coeff
			   xmm2, xmm7: qlp_coeff << 16 bit
			   xmm8, xmm9: qlp_coeff << 2*16 bit
			   xmmA, xmmB: qlp_coeff << 3*16 bit
			   xmm3, xmm4: data */

			xmm6 = xmm4;
			xmm6 = _mm_madd_epi16(xmm6, xmm1);
			xmm5 = xmm3;
			xmm5 = _mm_madd_epi16(xmm5, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			curr = *data++;
			*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

			data_len--;
			r = data_len % 4;

			while(r) {
				xmm4 = _mm_slli_si128(xmm4, 2);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 2);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				data_len--; r--;
			}

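			/* Steady state for x86-64: four residuals per iteration.  The
			   extra registers hold coefficient copies pre-shifted by one,
			   two and three lanes (xmm2:xmm7, xmm8:xmm9, xmmA:xmmB), so the
			   history only needs one 8-byte shift per four samples. */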
			while(data_len) { /* data_len is a multiple of 4 */
				xmm4 = _mm_slli_si128(xmm4, 8);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 8);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));

				xmm3 = _mm_insert_epi16(xmm3, curr, 3);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmmB);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmmA);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				xmm3 = _mm_insert_epi16(xmm3, curr, 2);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm9);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm8);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				xmm3 = _mm_insert_epi16(xmm3, curr, 1);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm7);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm2);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				curr = *data++;
				*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

				data_len-=4;
			}
#endif
		} /* endif(order > 8) */
		else if(order > 4) { /* order == 5, 6, 7, 8 */
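			/* For order <= 8 all coefficients and the whole history fit in
			   the eight int16 lanes of a single register, so each residual
			   needs only one madd plus the horizontal fold. */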
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm3, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);

					/* xmm0: qlp_coeff
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--;
					}
				}
				else { /* order == 7 */
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;
					r = data_len % 2;

					if(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--;
					}

					while(data_len) { /* data_len is a multiple of 2 */
						xmm3 = _mm_slli_si128(xmm3, 4);
						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);
						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len-=2;
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);
					xmm4 = _mm_slli_si128(xmm0, 4);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm4: qlp_coeff << 2*16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;
					r = data_len % 3;

					while(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--; r--;
					}

					while(data_len) { /* data_len is a multiple of 3 */
						xmm3 = _mm_slli_si128(xmm3, 6);
						xmm3 = _mm_insert_epi16(xmm3, curr, 2);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm4);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len-=3;
					}
				}
				else { /* order == 5 */
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);
					xmm4 = _mm_slli_si128(xmm0, 4);
					xmm5 = _mm_slli_si128(xmm0, 6);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm4: qlp_coeff << 2*16 bit
					   xmm5: qlp_coeff << 3*16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;
					r = data_len % 4;

					while(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--; r--;
					}

					while(data_len) { /* data_len is a multiple of 4 */
						xmm3 = _mm_slli_si128(xmm3, 8);
						xmm3 = _mm_insert_epi16(xmm3, curr, 3);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm5);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 2);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm4);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len-=4;
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
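			/* For order <= 4 everything lives in the low four int16 lanes;
			   the madd leaves at most two partial sums, so a single 4-byte
			   shift+add (none at all for order <= 2) finishes the
			   reduction. */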
			if(order > 2) {
				if(order == 4) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);

					/* xmm0: qlp_coeff
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--;
					}
				}
				else { /* order == 3 */
					int r;
					__m128i xmm0, xmm1, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);
					xmm1 = _mm_slli_si128(xmm0, 2);

					/* xmm0: qlp_coeff
					   xmm1: qlp_coeff << 16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;
					r = data_len % 2;

					if(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--;
					}

					while(data_len) { /* data_len is a multiple of 2 */
						xmm3 = _mm_slli_si128(xmm3, 4);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm1);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len-=2;
					}
				}
			}
			else {
				if(order == 2) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);

					/* xmm0: qlp_coeff
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);

					curr = *data++;
					*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);

						curr = *data++;
						*residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);

						data_len--;
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
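		/* Plain C fallback for order > 12; every case deliberately falls
		   through to the next so that exactly `order' taps are summed. */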
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

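/* Emits residual[i] from a prediction sum held in the low dword of xmmN. */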
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
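						/* SSE2 has no packed 32x32->32 multiply (PMULLD is
						   SSE4.1).  PMULUDQ forms 64-bit products of the even
						   dword lanes; their low 32 bits equal the truncated
						   signed product, the high halves land in the odd
						   lanes and never reach the final low dword. */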
826
827                                                 //sum += qlp_coeff[9] * data[i-10];
828                                                 //sum += qlp_coeff[8] * data[i-9];
829                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
830                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
831                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
832                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
833
834                                                 //sum += qlp_coeff[7] * data[i-8];
835                                                 //sum += qlp_coeff[6] * data[i-7];
836                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
837                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
838                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
839                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
840
841                                                 //sum += qlp_coeff[5] * data[i-6];
842                                                 //sum += qlp_coeff[4] * data[i-5];
843                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
844                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
845                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
846                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
847
848                                                 //sum += qlp_coeff[3] * data[i-4];
849                                                 //sum += qlp_coeff[2] * data[i-3];
850                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
851                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
852                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
853                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
854
855                                                 //sum += qlp_coeff[1] * data[i-2];
856                                                 //sum += qlp_coeff[0] * data[i-1];
857                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
858                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
859                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
860                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
861
862                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
863                                                 RESIDUAL_RESULT(xmm7);
864                                         }
865                                 }
866                                 else { /* order == 11 */
867                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
868                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
869                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
870                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
871                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
872                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
873                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
874
875                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
876                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
877                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
878                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
879                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
880
881                                         for(i = 0; i < (int)data_len; i++) {
882                                                 //sum = 0;
883                                                 //sum  = qlp_coeff[10] * data[i-11];
884                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
885                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
886
887                                                 //sum += qlp_coeff[9] * data[i-10];
888                                                 //sum += qlp_coeff[8] * data[i-9];
889                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
890                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
891                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
892                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
893
894                                                 //sum += qlp_coeff[7] * data[i-8];
895                                                 //sum += qlp_coeff[6] * data[i-7];
896                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
897                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
898                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
899                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
900
901                                                 //sum += qlp_coeff[5] * data[i-6];
902                                                 //sum += qlp_coeff[4] * data[i-5];
903                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
904                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
905                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
906                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
907
908                                                 //sum += qlp_coeff[3] * data[i-4];
909                                                 //sum += qlp_coeff[2] * data[i-3];
910                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
911                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
912                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
913                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
914
915                                                 //sum += qlp_coeff[1] * data[i-2];
916                                                 //sum += qlp_coeff[0] * data[i-1];
917                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
918                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
919                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
920                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
921
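                                                /* horizontal reduction: shift the upper 64 bits down
                                                   and add, so lane 0 holds the complete prediction;
                                                   RESIDUAL_RESULT() (defined earlier in this file)
                                                   then applies the lp_quantization shift and stores
                                                   data[i] minus the prediction */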
922                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
923                                                 RESIDUAL_RESULT(xmm7);
924                                         }
925                                 }
926                         }
927                         else { /* order == 9, 10 */
928                                 if(order == 10) {
929                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
930                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
931                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
932                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
933                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
934                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
935
936                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
937                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
938                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
939                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
940                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
941
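                                        /* even order: there is no unpaired tap, so the accumulator
                                           is seeded directly from the highest-lag pair product */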
942                                         for(i = 0; i < (int)data_len; i++) {
943                                                 //sum = 0;
944                                                 //sum += qlp_coeff[9] * data[i-10];
945                                                 //sum += qlp_coeff[8] * data[i-9];
946                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
947                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
948                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
949
950                                                 //sum += qlp_coeff[7] * data[i-8];
951                                                 //sum += qlp_coeff[6] * data[i-7];
952                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
953                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
954                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
955                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
956
957                                                 //sum += qlp_coeff[5] * data[i-6];
958                                                 //sum += qlp_coeff[4] * data[i-5];
959                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
960                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
961                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
962                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
963
964                                                 //sum += qlp_coeff[3] * data[i-4];
965                                                 //sum += qlp_coeff[2] * data[i-3];
966                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
967                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
968                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
969                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
970
971                                                 //sum += qlp_coeff[1] * data[i-2];
972                                                 //sum += qlp_coeff[0] * data[i-1];
973                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
974                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
975                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
976                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
977
978                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
979                                                 RESIDUAL_RESULT(xmm7);
980                                         }
981                                 }
982                                 else { /* order == 9 */
983                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
984                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
985                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
986                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
987                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
988                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
989
990                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
991                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
992                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
993                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
994
995                                         for(i = 0; i < (int)data_len; i++) {
996                                                 //sum = 0;
997                                                 //sum  = qlp_coeff[8] * data[i-9];
998                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
999                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
1000
1001                                                 //sum += qlp_coeff[7] * data[i-8];
1002                                                 //sum += qlp_coeff[6] * data[i-7];
1003                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1004                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1005                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
1006                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1007
1008                                                 //sum += qlp_coeff[5] * data[i-6];
1009                                                 //sum += qlp_coeff[4] * data[i-5];
1010                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1011                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1012                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1013                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1014
1015                                                 //sum += qlp_coeff[3] * data[i-4];
1016                                                 //sum += qlp_coeff[2] * data[i-3];
1017                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1018                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1019                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1020                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1021
1022                                                 //sum += qlp_coeff[1] * data[i-2];
1023                                                 //sum += qlp_coeff[0] * data[i-1];
1024                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1025                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1026                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1027                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1028
1029                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1030                                                 RESIDUAL_RESULT(xmm7);
1031                                         }
1032                                 }
1033                         }
1034                 }
1035                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1036                         if(order > 6) { /* order == 7, 8 */
1037                                 if(order == 8) {
1038                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1039                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1040                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1041                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1042                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1043
1044                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1045                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1046                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1047                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1048
1049                                         for(i = 0; i < (int)data_len; i++) {
1050                                                 //sum = 0;
1051                                                 //sum += qlp_coeff[7] * data[i-8];
1052                                                 //sum += qlp_coeff[6] * data[i-7];
1053                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1054                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1055                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1056
1057                                                 //sum += qlp_coeff[5] * data[i-6];
1058                                                 //sum += qlp_coeff[4] * data[i-5];
1059                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1060                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1061                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1062                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1063
1064                                                 //sum += qlp_coeff[3] * data[i-4];
1065                                                 //sum += qlp_coeff[2] * data[i-3];
1066                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1067                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1068                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1069                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1070
1071                                                 //sum += qlp_coeff[1] * data[i-2];
1072                                                 //sum += qlp_coeff[0] * data[i-1];
1073                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1074                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1075                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1076                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1077
1078                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1079                                                 RESIDUAL_RESULT(xmm7);
1080                                         }
1081                                 }
1082                                 else { /* order == 7 */
1083                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1084                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1085                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1086                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1087                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
1088
1089                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1090                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1091                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1092
1093                                         for(i = 0; i < (int)data_len; i++) {
1094                                                 //sum = 0;
1095                                                 //sum  = qlp_coeff[6] * data[i-7];
1096                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
1097                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1098
1099                                                 //sum += qlp_coeff[5] * data[i-6];
1100                                                 //sum += qlp_coeff[4] * data[i-5];
1101                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1102                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1103                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1104                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1105
1106                                                 //sum += qlp_coeff[3] * data[i-4];
1107                                                 //sum += qlp_coeff[2] * data[i-3];
1108                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1109                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1110                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1111                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1112
1113                                                 //sum += qlp_coeff[1] * data[i-2];
1114                                                 //sum += qlp_coeff[0] * data[i-1];
1115                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1116                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1117                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1118                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1119
1120                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1121                                                 RESIDUAL_RESULT(xmm7);
1122                                         }
1123                                 }
1124                         }
1125                         else { /* order == 5, 6 */
1126                                 if(order == 6) {
1127                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1128                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1129                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1130                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1131
1132                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1133                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1134                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1135
1136                                         for(i = 0; i < (int)data_len; i++) {
1137                                                 //sum = 0;
1138                                                 //sum += qlp_coeff[5] * data[i-6];
1139                                                 //sum += qlp_coeff[4] * data[i-5];
1140                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1141                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1142                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1143
1144                                                 //sum += qlp_coeff[3] * data[i-4];
1145                                                 //sum += qlp_coeff[2] * data[i-3];
1146                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1147                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1148                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1149                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1150
1151                                                 //sum += qlp_coeff[1] * data[i-2];
1152                                                 //sum += qlp_coeff[0] * data[i-1];
1153                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1154                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1155                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1156                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1157
1158                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1159                                                 RESIDUAL_RESULT(xmm7);
1160                                         }
1161                                 }
1162                                 else { /* order == 5 */
1163                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1164                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1165                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1166                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
1167
1168                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1169                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1170
1171                                         for(i = 0; i < (int)data_len; i++) {
1172                                                 //sum = 0;
1173                                                 //sum  = qlp_coeff[4] * data[i-5];
1174                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
1175                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1176
1177                                                 //sum += qlp_coeff[3] * data[i-4];
1178                                                 //sum += qlp_coeff[2] * data[i-3];
1179                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1180                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1181                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1182                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1183
1184                                                 //sum += qlp_coeff[1] * data[i-2];
1185                                                 //sum += qlp_coeff[0] * data[i-1];
1186                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1187                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1188                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1189                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1190
1191                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1192                                                 RESIDUAL_RESULT(xmm7);
1193                                         }
1194                                 }
1195                         }
1196                 }
1197                 else { /* order == 1, 2, 3, 4 */
1198                         if(order > 2) { /* order == 3, 4 */
1199                                 if(order == 4) {
1200                                         __m128i xmm0, xmm1, xmm6, xmm7;
1201                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1202                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1203
1204                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1205                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1206
1207                                         for(i = 0; i < (int)data_len; i++) {
1208                                                 //sum = 0;
1209                                                 //sum += qlp_coeff[3] * data[i-4];
1210                                                 //sum += qlp_coeff[2] * data[i-3];
1211                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1212                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1213                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1214
1215                                                 //sum += qlp_coeff[1] * data[i-2];
1216                                                 //sum += qlp_coeff[0] * data[i-1];
1217                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1218                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1219                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1220                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1221
1222                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1223                                                 RESIDUAL_RESULT(xmm7);
1224                                         }
1225                                 }
1226                                 else { /* order == 3 */
1227                                         __m128i xmm0, xmm1, xmm6, xmm7;
1228                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1229                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1230
1231                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1232
1233                                         for(i = 0; i < (int)data_len; i++) {
1234                                                 //sum = 0;
1235                                                 //sum  = qlp_coeff[2] * data[i-3];
1236                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1237                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1238
1239                                                 //sum += qlp_coeff[1] * data[i-2];
1240                                                 //sum += qlp_coeff[0] * data[i-1];
1241                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1242                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1243                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1244                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1245
1246                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1247                                                 RESIDUAL_RESULT(xmm7);
1248                                         }
1249                                 }
1250                         }
1251                         else { /* order == 1, 2 */
1252                                 if(order == 2) {
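                                        /* order 2 is the smallest instance of the pairing trick:
                                           a single _mm_mul_epu32 computes both taps at once */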
1253                                         __m128i xmm0, xmm7;
1254                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1255                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1256
1257                                         for(i = 0; i < (int)data_len; i++) {
1258                                                 //sum = 0;
1259                                                 //sum += qlp_coeff[1] * data[i-2];
1260                                                 //sum += qlp_coeff[0] * data[i-1];
1261                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1262                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1263                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
1264
1265                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1266                                                 RESIDUAL_RESULT(xmm7);
1267                                         }
1268                                 }
1269                                 else { /* order == 1 */
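                                        /* a single tap gains nothing from SIMD, so it stays scalar */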
1270                                         for(i = 0; i < (int)data_len; i++)
1271                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1272                                 }
1273                         }
1274                 }
1275         }
1276         else { /* order > 12 */
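                /* orders above the streamable-subset limit of 12 are uncommon,
                   so they take a plain scalar path instead of dedicated SIMD
                   branches */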
1278                 for(i = 0; i < (int)data_len; i++) {
1279                         sum = 0;
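                        /* deliberate fall-through: entering the switch at `order'
                           makes every lower-numbered case run as well, each adding
                           one tap, before the unconditional tail below case 13 */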
1280                         switch(order) {
1281                                 case 32: sum += qlp_coeff[31] * data[i-32];
1282                                 case 31: sum += qlp_coeff[30] * data[i-31];
1283                                 case 30: sum += qlp_coeff[29] * data[i-30];
1284                                 case 29: sum += qlp_coeff[28] * data[i-29];
1285                                 case 28: sum += qlp_coeff[27] * data[i-28];
1286                                 case 27: sum += qlp_coeff[26] * data[i-27];
1287                                 case 26: sum += qlp_coeff[25] * data[i-26];
1288                                 case 25: sum += qlp_coeff[24] * data[i-25];
1289                                 case 24: sum += qlp_coeff[23] * data[i-24];
1290                                 case 23: sum += qlp_coeff[22] * data[i-23];
1291                                 case 22: sum += qlp_coeff[21] * data[i-22];
1292                                 case 21: sum += qlp_coeff[20] * data[i-21];
1293                                 case 20: sum += qlp_coeff[19] * data[i-20];
1294                                 case 19: sum += qlp_coeff[18] * data[i-19];
1295                                 case 18: sum += qlp_coeff[17] * data[i-18];
1296                                 case 17: sum += qlp_coeff[16] * data[i-17];
1297                                 case 16: sum += qlp_coeff[15] * data[i-16];
1298                                 case 15: sum += qlp_coeff[14] * data[i-15];
1299                                 case 14: sum += qlp_coeff[13] * data[i-14];
1300                                 case 13: sum += qlp_coeff[12] * data[i-13];
1301                                          sum += qlp_coeff[11] * data[i-12];
1302                                          sum += qlp_coeff[10] * data[i-11];
1303                                          sum += qlp_coeff[ 9] * data[i-10];
1304                                          sum += qlp_coeff[ 8] * data[i- 9];
1305                                          sum += qlp_coeff[ 7] * data[i- 8];
1306                                          sum += qlp_coeff[ 6] * data[i- 7];
1307                                          sum += qlp_coeff[ 5] * data[i- 6];
1308                                          sum += qlp_coeff[ 4] * data[i- 5];
1309                                          sum += qlp_coeff[ 3] * data[i- 4];
1310                                          sum += qlp_coeff[ 2] * data[i- 3];
1311                                          sum += qlp_coeff[ 1] * data[i- 2];
1312                                          sum += qlp_coeff[ 0] * data[i- 1];
1313                         }
1314                         residual[i] = data[i] - (sum >> lp_quantization);
1315                 }
1316         }
1317 }
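
/* For orientation only -- a sketch, not verbatim from this codebase, of how the
   encoder's CPU dispatch is expected to select this routine once SSE2 support
   is detected; the actual member names live in stream_encoder.c and may differ:

       if(encoder->private_->cpuinfo.ia32.sse2)
           encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit =
               FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
*/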
1318
1319 #endif /* FLAC__SSE2_SUPPORTED */
1320 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1321 #endif /* FLAC__NO_ASM */
1322 #endif /* FLAC__INTEGER_ONLY_LIBRARY */