Improve x86 intrinsic implementation.
[flac.git] / src / libFLAC / lpc_intrin_sse2.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2013  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #if HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40
41 #include "FLAC/assert.h"
42 #include "FLAC/format.h"
43 #include "private/lpc.h"
44
45 #include <emmintrin.h> /* SSE2 */
46
47 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
48 {
49         int i;
50         FLAC__int32 sum;
51
52         FLAC__ASSERT(order > 0);
53         FLAC__ASSERT(order <= 32);
54         FLAC__ASSERT(data_len > 0);
55
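        /* 16-bit path: this variant is used when the history samples and quantized coefficients fit in
           16-bit lanes, so for order <= 12 (the FLAC subset limit) the dot product is computed with
           _mm_madd_epi16 on packed 16-bit values; larger orders fall back to the unrolled scalar code below. */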
56         if(order <= 12) {
57                 FLAC__int32 curr;
58                 if(order > 8) { /* order == 9, 10, 11, 12 */
59 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
60                         /* can be modified to work with order <= 15 but the subset limit is 12 */
61                         int r;
62                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
63                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
64                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
65                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
66                         switch(order)                                          /* ...and zero them out */
67                         {
68                         case 9:
69                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
70                         case 10:
71                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
72                         case 11:
73                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
74                         }
75                         xmm2 = _mm_setzero_si128();
76                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
77                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
78
79                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
80                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
81                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
82                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
83                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
84                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
85                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
86                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
87
88                         xmm7 = _mm_slli_si128(xmm1, 2);
89                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
90                         xmm2 = _mm_slli_si128(xmm0, 2);
91
92                         /* xmm0, xmm1: qlp_coeff
93                            xmm2, xmm7: qlp_coeff << 16 bit
94                            xmm3, xmm4: data */
95
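                        /* multiply-accumulate step: _mm_madd_epi16 forms pairs of 16x16 products and sums each
                           pair into a 32-bit lane; the two shift-and-add steps that follow fold the four 32-bit
                           partial sums into lane 0 */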
96                         xmm6 = xmm4;
97                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
98                         xmm5 = xmm3;
99                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
100                         xmm6 = _mm_add_epi32(xmm6, xmm5);
101                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
102                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
103
104                         curr = *data++;
105                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
106
107                         data_len--;
108                         r = data_len % 2;
109
110                         if(r) {
111                                 xmm4 = _mm_slli_si128(xmm4, 2);
112                                 xmm6 = xmm3;
113                                 xmm3 = _mm_slli_si128(xmm3, 2);
114                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
115                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
116
117                                 xmm6 = xmm4;
118                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
119                                 xmm5 = xmm3;
120                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
121                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
122                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
123                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
124
125                                 curr = *data++;
126                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
127
128                                 data_len--;
129                         }
130
131                         while(data_len) { /* data_len is a multiple of 2 */
132                                 /* one fewer _mm_slli_si128 per data element, at the cost of keeping the shifted qlp_coeff copy in xmm2:xmm7 */
133                                 xmm4 = _mm_slli_si128(xmm4, 4);
134                                 xmm6 = xmm3;
135                                 xmm3 = _mm_slli_si128(xmm3, 4);
136                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
137                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
138
139                                 xmm6 = xmm4;
140                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
141                                 xmm5 = xmm3;
142                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
143                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
144                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
145                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
146
147                                 curr = *data++;
148                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
149
150                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
151
152                                 xmm6 = xmm4;
153                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
154                                 xmm5 = xmm3;
155                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
156                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
157                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
158                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
159
160                                 curr = *data++;
161                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
162
163                                 data_len-=2;
164                         }
165 #else /* 16 XMM registers available */
166                         int r;
167                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
168                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
169                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
170                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
171                         switch(order)                                          /* ...and zero them out */
172                         {
173                         case 9:
174                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
175                         case 10:
176                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
177                         case 11:
178                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
179                         }
180                         xmm2 = _mm_setzero_si128();
181                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
182                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
183
184                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
185                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
186                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
187                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
188                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
189                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
190                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
191                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
192
193                         xmm7 = _mm_slli_si128(xmm1, 2);
194                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
195                         xmm2 = _mm_slli_si128(xmm0, 2);
196
197                         xmm9 = _mm_slli_si128(xmm1, 4);
198                         xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
199                         xmm8 = _mm_slli_si128(xmm0, 4);
200
201                         xmmB = _mm_slli_si128(xmm1, 6);
202                         xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
203                         xmmA = _mm_slli_si128(xmm0, 6);
204
205                         /* xmm0, xmm1: qlp_coeff
206                            xmm2, xmm7: qlp_coeff << 16 bit
207                            xmm8, xmm9: qlp_coeff << 2*16 bit
208                            xmmA, xmmB: qlp_coeff << 3*16 bit
209                            xmm3, xmm4: data */
210
211                         xmm6 = xmm4;
212                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
213                         xmm5 = xmm3;
214                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
215                         xmm6 = _mm_add_epi32(xmm6, xmm5);
216                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
217                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
218
219                         curr = *data++;
220                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
221
222                         data_len--;
223                         r = data_len % 4;
224
225                         while(r) {
226                                 xmm4 = _mm_slli_si128(xmm4, 2);
227                                 xmm6 = xmm3;
228                                 xmm3 = _mm_slli_si128(xmm3, 2);
229                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
230                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
231
232                                 xmm6 = xmm4;
233                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
234                                 xmm5 = xmm3;
235                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
236                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
237                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
238                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
239
240                                 curr = *data++;
241                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
242
243                                 data_len--; r--;
244                         }
245
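                        /* main loop (x86-64): keeping coefficient copies pre-shifted by 1, 2 and 3 lanes
                           (xmm2:xmm7, xmm8:xmm9, xmmA:xmmB) lets the data history be shifted only once per
                           four residuals */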
246                         while(data_len) { /* data_len is a multiple of 4 */
247                                 xmm4 = _mm_slli_si128(xmm4, 8);
248                                 xmm6 = xmm3;
249                                 xmm3 = _mm_slli_si128(xmm3, 8);
250                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
251
252                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
253
254                                 xmm6 = xmm4;
255                                 xmm6 = _mm_madd_epi16(xmm6, xmmB);
256                                 xmm5 = xmm3;
257                                 xmm5 = _mm_madd_epi16(xmm5, xmmA);
258                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
259                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
260                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
261
262                                 curr = *data++;
263                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
264
265                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
266
267                                 xmm6 = xmm4;
268                                 xmm6 = _mm_madd_epi16(xmm6, xmm9);
269                                 xmm5 = xmm3;
270                                 xmm5 = _mm_madd_epi16(xmm5, xmm8);
271                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
272                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
273                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
274
275                                 curr = *data++;
276                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
277
278                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
279
280                                 xmm6 = xmm4;
281                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
282                                 xmm5 = xmm3;
283                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
284                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
285                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
286                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
287
288                                 curr = *data++;
289                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
290
291                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
292
293                                 xmm6 = xmm4;
294                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
295                                 xmm5 = xmm3;
296                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
297                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
298                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
299                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
300
301                                 curr = *data++;
302                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
303
304                                 data_len-=4;
305                         }
306 #endif
307                 } /* endif(order > 8) */
308                 else if(order > 4) { /* order == 5, 6, 7, 8 */
309                         if(order > 6) { /* order == 7, 8 */
310                                 if(order == 8) {
311                                         __m128i xmm0, xmm1, xmm3, xmm6;
312                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
313                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
314                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
315
316                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
317                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
318                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
319                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
320                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
321
322                                         /* xmm0: qlp_coeff
323                                            xmm3: data */
324
325                                         xmm6 = xmm3;
326                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
327                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
328                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
329
330                                         curr = *data++;
331                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
332
333                                         data_len--;
334
335                                         while(data_len) {
336                                                 xmm3 = _mm_slli_si128(xmm3, 2);
337                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
338
339                                                 xmm6 = xmm3;
340                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
341                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
342                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
343
344                                                 curr = *data++;
345                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
346
347                                                 data_len--;
348                                         }
349                                 }
350                                 else { /* order == 7 */
351                                         int r;
352                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
353                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
354                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
355                                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
356                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
357
358                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
359                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
360                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
361                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
362                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
363                                         xmm2 = _mm_slli_si128(xmm0, 2);
364
365                                         /* xmm0: qlp_coeff
366                                            xmm2: qlp_coeff << 16 bit
367                                            xmm3: data */
368
369                                         xmm6 = xmm3;
370                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
371                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
372                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
373
374                                         curr = *data++;
375                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
376
377                                         data_len--;
378                                         r = data_len % 2;
379
380                                         if(r) {
381                                                 xmm3 = _mm_slli_si128(xmm3, 2);
382                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
383
384                                                 xmm6 = xmm3;
385                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
386                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
387                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
388
389                                                 curr = *data++;
390                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
391
392                                                 data_len--;
393                                         }
394
395                                         while(data_len) { /* data_len is a multiple of 2 */
396                                                 xmm3 = _mm_slli_si128(xmm3, 4);
397                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
398
399                                                 xmm6 = xmm3;
400                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
401                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
402                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
403
404                                                 curr = *data++;
405                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
406
407                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
408                                                 xmm6 = xmm3;
409                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
410                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
411                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
412
413                                                 curr = *data++;
414                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
415
416                                                 data_len-=2;
417                                         }
418                                 }
419                         }
420                         else { /* order == 5, 6 */
421                                 if(order == 6) {
422                                         int r;
423                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
424                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
425                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
426                                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
427                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
428
429                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
430                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
431                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
432                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
433                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
434                                         xmm2 = _mm_slli_si128(xmm0, 2);
435                                         xmm4 = _mm_slli_si128(xmm0, 4);
436
437                                         /* xmm0: qlp_coeff
438                                            xmm2: qlp_coeff << 16 bit
439                                            xmm4: qlp_coeff << 2*16 bit
440                                            xmm3: data */
441
442                                         xmm6 = xmm3;
443                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
444                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
445                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
446
447                                         curr = *data++;
448                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
449
450                                         data_len--;
451                                         r = data_len % 3;
452
453                                         while(r) {
454                                                 xmm3 = _mm_slli_si128(xmm3, 2);
455                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
456
457                                                 xmm6 = xmm3;
458                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
459                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
460                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
461
462                                                 curr = *data++;
463                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
464
465                                                 data_len--; r--;
466                                         }
467
468                                         while(data_len) { /* data_len is a multiple of 3 */
469                                                 xmm3 = _mm_slli_si128(xmm3, 6);
470                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
471
472                                                 xmm6 = xmm3;
473                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
474                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
475                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
476
477                                                 curr = *data++;
478                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
479
480                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
481
482                                                 xmm6 = xmm3;
483                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
484                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
485                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
486
487                                                 curr = *data++;
488                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
489
490                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
491
492                                                 xmm6 = xmm3;
493                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
494                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
495                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
496
497                                                 curr = *data++;
498                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
499
500                                                 data_len-=3;
501                                         }
502                                 }
503                                 else { /* order == 5 */
504                                         int r;
505                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
506                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
507                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
508                                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
509                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
510
511                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
512                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
513                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
514                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
515                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
516                                         xmm2 = _mm_slli_si128(xmm0, 2);
517                                         xmm4 = _mm_slli_si128(xmm0, 4);
518                                         xmm5 = _mm_slli_si128(xmm0, 6);
519
520                                         /* xmm0: qlp_coeff
521                                            xmm2: qlp_coeff << 16 bit
522                                            xmm4: qlp_coeff << 2*16 bit
523                                            xmm5: qlp_coeff << 3*16 bit
524                                            xmm3: data */
525
526                                         xmm6 = xmm3;
527                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
528                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
529                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
530
531                                         curr = *data++;
532                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
533
534                                         data_len--;
535                                         r = data_len % 4;
536
537                                         while(r) {
538                                                 xmm3 = _mm_slli_si128(xmm3, 2);
539                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
540
541                                                 xmm6 = xmm3;
542                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
543                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
544                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
545
546                                                 curr = *data++;
547                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
548
549                                                 data_len--; r--;
550                                         }
551
552                                         while(data_len) { /* data_len is a multiple of 4 */
553                                                 xmm3 = _mm_slli_si128(xmm3, 8);
554                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
555
556                                                 xmm6 = xmm3;
557                                                 xmm6 = _mm_madd_epi16(xmm6, xmm5);
558                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
559                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
560
561                                                 curr = *data++;
562                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
563
564                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
565
566                                                 xmm6 = xmm3;
567                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
568                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
569                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
570
571                                                 curr = *data++;
572                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
573
574                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
575
576                                                 xmm6 = xmm3;
577                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
578                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
579                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
580
581                                                 curr = *data++;
582                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
583
584                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
585
586                                                 xmm6 = xmm3;
587                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
588                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
589                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
590
591                                                 curr = *data++;
592                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
593
594                                                 data_len-=4;
595                                         }
596                                 }
597                         }
598                 }
599                 else { /* order == 1, 2, 3, 4 */
600                         if(order > 2) {
601                                 if(order == 4) {
602                                         __m128i xmm0, xmm3, xmm6;
603                                         xmm6 = _mm_setzero_si128();
604                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
605                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
606
607                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
608                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
609                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
610
611                                         /* xmm0: qlp_coeff
612                                            xmm3: data */
613
614                                         xmm6 = xmm3;
615                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
616                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
617
618                                         curr = *data++;
619                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
620
621                                         data_len--;
622
623                                         while(data_len) {
624                                                 xmm3 = _mm_slli_si128(xmm3, 2);
625                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
626
627                                                 xmm6 = xmm3;
628                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
629                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
630
631                                                 curr = *data++;
632                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
633
634                                                 data_len--;
635                                         }
636                                 }
637                                 else { /* order == 3 */
638                                         int r;
639                                         __m128i xmm0, xmm1, xmm3, xmm6;
640                                         xmm6 = _mm_setzero_si128();
641                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
642                                         xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
643                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
644
645                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
646                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
647                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
648                                         xmm1 = _mm_slli_si128(xmm0, 2);
649
650                                         /* xmm0: qlp_coeff
651                                            xmm1: qlp_coeff << 16 bit
652                                            xmm3: data */
653
654                                         xmm6 = xmm3;
655                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
656                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
657
658                                         curr = *data++;
659                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
660
661                                         data_len--;
662                                         r = data_len % 2;
663
664                                         if(r) {
665                                                 xmm3 = _mm_slli_si128(xmm3, 2);
666                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
667
668                                                 xmm6 = xmm3;
669                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
670                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
671
672                                                 curr = *data++;
673                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
674
675                                                 data_len--;
676                                         }
677
678                                         while(data_len) { /* data_len is a multiple of 2 */
679                                                 xmm3 = _mm_slli_si128(xmm3, 4);
680
681                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
682
683                                                 xmm6 = xmm3;
684                                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
685                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
686
687                                                 curr = *data++;
688                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
689
690                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
691
692                                                 xmm6 = xmm3;
693                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
694                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
695
696                                                 curr = *data++;
697                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
698
699                                                 data_len-=2;
700                                         }
701                                 }
702                         }
703                         else {
704                                 if(order == 2) {
705                                         __m128i xmm0, xmm3, xmm6;
706                                         xmm6 = _mm_setzero_si128();
707                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
708                                         xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
709                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
710
711                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
712                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
713                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
714
715                                         /* xmm0: qlp_coeff
716                                            xmm3: data */
717
718                                         xmm6 = xmm3;
719                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
720
721                                         curr = *data++;
722                                         *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
723
724                                         data_len--;
725
726                                         while(data_len) {
727                                                 xmm3 = _mm_slli_si128(xmm3, 2);
728                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
729
730                                                 xmm6 = xmm3;
731                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
732
733                                                 curr = *data++;
734                                                 *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
735
736                                                 data_len--;
737                                         }
738                                 }
739                                 else { /* order == 1 */
740                                         for(i = 0; i < (int)data_len; i++)
741                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
742                                 }
743                         }
744                 }
745         }
746         else { /* order > 12 */
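                /* scalar fallback for order > 12: the switch cases below fall through intentionally,
                   unrolling the dot product down to the common order-12 tail */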
747                 for(i = 0; i < (int)data_len; i++) {
748                         sum = 0;
749                         switch(order) {
750                                 case 32: sum += qlp_coeff[31] * data[i-32];
751                                 case 31: sum += qlp_coeff[30] * data[i-31];
752                                 case 30: sum += qlp_coeff[29] * data[i-30];
753                                 case 29: sum += qlp_coeff[28] * data[i-29];
754                                 case 28: sum += qlp_coeff[27] * data[i-28];
755                                 case 27: sum += qlp_coeff[26] * data[i-27];
756                                 case 26: sum += qlp_coeff[25] * data[i-26];
757                                 case 25: sum += qlp_coeff[24] * data[i-25];
758                                 case 24: sum += qlp_coeff[23] * data[i-24];
759                                 case 23: sum += qlp_coeff[22] * data[i-23];
760                                 case 22: sum += qlp_coeff[21] * data[i-22];
761                                 case 21: sum += qlp_coeff[20] * data[i-21];
762                                 case 20: sum += qlp_coeff[19] * data[i-20];
763                                 case 19: sum += qlp_coeff[18] * data[i-19];
764                                 case 18: sum += qlp_coeff[17] * data[i-18];
765                                 case 17: sum += qlp_coeff[16] * data[i-17];
766                                 case 16: sum += qlp_coeff[15] * data[i-16];
767                                 case 15: sum += qlp_coeff[14] * data[i-15];
768                                 case 14: sum += qlp_coeff[13] * data[i-14];
769                                 case 13: sum += qlp_coeff[12] * data[i-13];
770                                          sum += qlp_coeff[11] * data[i-12];
771                                          sum += qlp_coeff[10] * data[i-11];
772                                          sum += qlp_coeff[ 9] * data[i-10];
773                                          sum += qlp_coeff[ 8] * data[i- 9];
774                                          sum += qlp_coeff[ 7] * data[i- 8];
775                                          sum += qlp_coeff[ 6] * data[i- 7];
776                                          sum += qlp_coeff[ 5] * data[i- 6];
777                                          sum += qlp_coeff[ 4] * data[i- 5];
778                                          sum += qlp_coeff[ 3] * data[i- 4];
779                                          sum += qlp_coeff[ 2] * data[i- 3];
780                                          sum += qlp_coeff[ 1] * data[i- 2];
781                                          sum += qlp_coeff[ 0] * data[i- 1];
782                         }
783                         residual[i] = data[i] - (sum >> lp_quantization);
784                 }
785         }
786 }
787
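/* extract the 32-bit prediction sum from lane 0 of xmmN and turn it into the residual for sample i */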
788 #define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
789
790 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
791 {
792         int i;
793
794         FLAC__ASSERT(order > 0);
795         FLAC__ASSERT(order <= 32);
796
797         if(order <= 12) {
798                 if(order > 8) { /* order == 9, 10, 11, 12 */
799                         if(order > 10) { /* order == 11, 12 */
800                                 if(order == 12) {
801                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
802                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
803                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
804                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
805                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
806                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
807                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
808
809                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
810                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
811                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
812                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
813                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
814                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
815
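                                        /* note: the low 32 bits of each unsigned product from _mm_mul_epu32 equal the
                                           low 32 bits of the signed product, so keeping only the low dwords matches the
                                           scalar FLAC__int32 arithmetic */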
816                                         for(i = 0; i < (int)data_len; i++) {
817                                                 //sum = 0;
818                                                 //sum += qlp_coeff[11] * data[i-12];
819                                                 //sum += qlp_coeff[10] * data[i-11];
820                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
821                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
822                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
823
824                                                 //sum += qlp_coeff[9] * data[i-10];
825                                                 //sum += qlp_coeff[8] * data[i-9];
826                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
827                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
828                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
829                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
830
831                                                 //sum += qlp_coeff[7] * data[i-8];
832                                                 //sum += qlp_coeff[6] * data[i-7];
833                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
834                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
835                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
836                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
837
838                                                 //sum += qlp_coeff[5] * data[i-6];
839                                                 //sum += qlp_coeff[4] * data[i-5];
840                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
841                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
842                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
843                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
844
845                                                 //sum += qlp_coeff[3] * data[i-4];
846                                                 //sum += qlp_coeff[2] * data[i-3];
847                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
848                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
849                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
850                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
851
852                                                 //sum += qlp_coeff[1] * data[i-2];
853                                                 //sum += qlp_coeff[0] * data[i-1];
854                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
855                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
856                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
857                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
858
859                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
860                                                 RESIDUAL_RESULT(xmm7);
861                                         }
862                                 }
863                                 else { /* order == 11 */
864                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
865                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
866                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
867                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
868                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
869                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
870                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
871
872                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
873                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
874                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
875                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
876                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
877
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

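						/*
						 * The two 32-bit partial sums are now in lanes 0 and 2; shifting the register
						 * right by 8 bytes and adding folds them into lane 0.  RESIDUAL_RESULT (a macro
						 * defined earlier in this file) is then expected to read that low lane, apply
						 * the lp_quantization shift and subtract the prediction from data[i].
						 */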
						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
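			/* Orders 5-8 use the same even-lane scheme as above, just with fewer
			 * coefficient registers: two taps per multiply, one horizontal fold per output. */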
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
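			/* Orders 1-4 follow the same pattern; order 1 (at the bottom) degenerates to
			 * plain scalar code, since a single multiply gains nothing from SSE2. */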
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
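				/* Computed entry into a fully unrolled dot product: control falls through
				 * every case below the selected one, so each sample costs exactly `order`
				 * multiply-adds.  The fall-through is intentional. */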
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}
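
/*
 * Usage sketch (illustrative only, not part of the library): the caller is expected to
 * pass a data pointer that already has `order` warm-up samples before it, mirroring how
 * the scalar FLAC__lpc_compute_residual_from_qlp_coefficients() is called.  For example
 * (hypothetical buffer names and sizes):
 *
 *     FLAC__int32 samples[4608], residual[4608];
 *     ...fill samples; derive qlp_coeff, order and lp_quantization...
 *     FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(
 *             samples + order, 4608 - order, qlp_coeff, order, lp_quantization, residual);
 */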

#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */