/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
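
/* Scalar equivalent of RESIDUAL16_RESULT, for reference: xmmN is expected to
 * carry the 32-bit prediction sum in its low dword, so the macro expands to
 *
 *     curr = *data++;
 *     *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
 *
 * DATA16_RESULT is the inverse, reconstructing a sample from its residual
 * when decoding. The non-16 variants below index by i instead of walking
 * the pointers. */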

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;
        FLAC__int32 sum;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);
        FLAC__ASSERT(data_len > 0);

        if(order <= 12) {
                FLAC__int32 curr;
                if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
                        int r;
                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                        xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
                        switch(order)                                          /* ...and zero them out */
                        {
                        case 9:
                                xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
                        case 10:
                                xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
                        case 11:
                                xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
                        }
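                        /* The slli/srli pairs above act as a byte mask: shifting the
                           whole register left by N bytes and back right by N bytes
                           zero-fills its top N bytes, discarding whatever was loaded
                           past the end of the order's coefficients. */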
                        xmm2 = _mm_setzero_si128();
                        xmm0 = _mm_packs_epi32(xmm0, xmm6);
                        xmm1 = _mm_packs_epi32(xmm1, xmm2);

                        xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
                        xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                        xmm4 = _mm_packs_epi32(xmm4, xmm2);
                        xmm3 = _mm_packs_epi32(xmm3, xmm5);
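                        /* The _MM_SHUFFLE(0,1,2,3) shuffles reverse each dword quartet,
                           so the most recent sample data[-1] lands in the lowest lane,
                           next to qlp_coeff[0]; _mm_packs_epi32 then narrows the 32-bit
                           history to 16 bits, which the caller is expected to guarantee
                           is lossless when it selects this 16-bit variant. */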

                        xmm7 = _mm_slli_si128(xmm1, 2);
                        xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
                        xmm2 = _mm_slli_si128(xmm0, 2);

                        /* xmm0, xmm1: qlp_coeff
                           xmm2, xmm7: qlp_coeff << 16 bit
                           xmm3, xmm4: data */

                        xmm6 = xmm4;
                        xmm6 = _mm_madd_epi16(xmm6, xmm1);
                        xmm5 = xmm3;
                        xmm5 = _mm_madd_epi16(xmm5, xmm0);
                        xmm6 = _mm_add_epi32(xmm6, xmm5);
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
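                        /* _mm_madd_epi16 multiplies eight 16-bit sample/coefficient
                           pairs and sums adjacent products into four 32-bit lanes; the
                           two srli/add steps fold those lanes into a single 32-bit dot
                           product in the low dword, ready for RESIDUAL16_RESULT. */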

                        RESIDUAL16_RESULT(xmm6);

                        data_len--;
                        r = data_len % 2;

                        if(r) {
                                xmm4 = _mm_slli_si128(xmm4, 2);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 2);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len--;
                        }

                        while(data_len) { /* data_len is a multiple of 2 */
                                /* one _mm_slli_si128 fewer per data element, at the cost of keeping the shifted qlp_coeff copy in xmm2:xmm7 */
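                                /* Two samples per iteration: the history is shifted by
                                   two lanes up front, the first residual is computed
                                   against the pre-shifted coefficients (xmm2:xmm7) with
                                   the new sample inserted at lane 1, then the second
                                   uses the unshifted coefficients (xmm0:xmm1) with the
                                   next sample at lane 0. */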
                                xmm4 = _mm_slli_si128(xmm4, 4);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 4);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm7);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm2);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len-=2;
                        }
#else /* 16 XMM registers available */
                        int r;
                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                        xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
                        switch(order)                                          /* ...and zero them out */
                        {
                        case 9:
                                xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
                        case 10:
                                xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
                        case 11:
                                xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
                        }
                        xmm2 = _mm_setzero_si128();
                        xmm0 = _mm_packs_epi32(xmm0, xmm6);
                        xmm1 = _mm_packs_epi32(xmm1, xmm2);

                        xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
                        xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                        xmm4 = _mm_packs_epi32(xmm4, xmm2);
                        xmm3 = _mm_packs_epi32(xmm3, xmm5);

                        xmm7 = _mm_slli_si128(xmm1, 2);
                        xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
                        xmm2 = _mm_slli_si128(xmm0, 2);

                        xmm9 = _mm_slli_si128(xmm1, 4);
                        xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
                        xmm8 = _mm_slli_si128(xmm0, 4);

                        xmmB = _mm_slli_si128(xmm1, 6);
                        xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
                        xmmA = _mm_slli_si128(xmm0, 6);

                        /* xmm0, xmm1: qlp_coeff
                           xmm2, xmm7: qlp_coeff << 16 bit
                           xmm8, xmm9: qlp_coeff << 2*16 bit
                           xmmA, xmmB: qlp_coeff << 3*16 bit
                           xmm3, xmm4: data */
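                        /* With 16 XMM registers, copies of qlp_coeff pre-shifted by one,
                           two and three lanes can all stay resident, so the main loop
                           below needs only one 8-byte shift of the history per four
                           samples and computes four residuals per iteration. */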

                        xmm6 = xmm4;
                        xmm6 = _mm_madd_epi16(xmm6, xmm1);
                        xmm5 = xmm3;
                        xmm5 = _mm_madd_epi16(xmm5, xmm0);
                        xmm6 = _mm_add_epi32(xmm6, xmm5);
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                        RESIDUAL16_RESULT(xmm6);

                        data_len--;
                        r = data_len % 4;

                        while(r) {
                                xmm4 = _mm_slli_si128(xmm4, 2);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 2);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len--; r--;
                        }

                        while(data_len) { /* data_len is a multiple of 4 */
                                xmm4 = _mm_slli_si128(xmm4, 8);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 8);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));

                                xmm3 = _mm_insert_epi16(xmm3, curr, 3);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmmB);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmmA);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm9);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm8);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm7);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm2);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len-=4;
                        }
#endif
                } /* endif(order > 8) */
                else if(order > 4) { /* order == 5, 6, 7, 8 */
                        if(order > 6) { /* order == 7, 8 */
                                if(order == 8) {
                                        __m128i xmm0, xmm1, xmm3, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 7 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 2;

                                        if(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 2 */
                                                xmm3 = _mm_slli_si128(xmm3, 4);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=2;
                                        }
                                }
                        }
                        else { /* order == 5, 6 */
                                if(order == 6) {
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);
                                        xmm4 = _mm_slli_si128(xmm0, 4);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm4: qlp_coeff << 2*16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 3;

                                        while(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--; r--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 3 */
                                                xmm3 = _mm_slli_si128(xmm3, 6);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm4);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=3;
                                        }
                                }
                                else { /* order == 5 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);
                                        xmm4 = _mm_slli_si128(xmm0, 4);
                                        xmm5 = _mm_slli_si128(xmm0, 6);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm4: qlp_coeff << 2*16 bit
                                           xmm5: qlp_coeff << 3*16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 4;

                                        while(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--; r--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 4 */
                                                xmm3 = _mm_slli_si128(xmm3, 8);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 3);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm5);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm4);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=4;
                                        }
                                }
                        }
                }
                else { /* order == 1, 2, 3, 4 */
                        if(order > 2) {
                                if(order == 4) {
                                        __m128i xmm0, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
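                                        /* For order 4 the upper two madd lanes are zero (data
                                           and coeffs were packed against zeros), so a single
                                           4-byte fold finishes the horizontal sum. */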

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 3 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);
                                        xmm1 = _mm_slli_si128(xmm0, 2);

                                        /* xmm0: qlp_coeff
                                           xmm1: qlp_coeff << 16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 2;

                                        if(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 2 */
                                                xmm3 = _mm_slli_si128(xmm3, 4);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=2;
                                        }
                                }
                        }
                        else {
                                if(order == 2) {
                                        __m128i xmm0, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 1 */
                                        for(i = 0; i < (int)data_len; i++)
                                                residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
                                }
                        }
                }
        }
        else { /* order > 12 */
                for(i = 0; i < (int)data_len; i++) {
                        sum = 0;
                        switch(order) {
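                                /* each case intentionally falls through, accumulating every tap down to order 13 */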
                                case 32: sum += qlp_coeff[31] * data[i-32];
                                case 31: sum += qlp_coeff[30] * data[i-31];
                                case 30: sum += qlp_coeff[29] * data[i-30];
                                case 29: sum += qlp_coeff[28] * data[i-29];
                                case 28: sum += qlp_coeff[27] * data[i-28];
                                case 27: sum += qlp_coeff[26] * data[i-27];
                                case 26: sum += qlp_coeff[25] * data[i-26];
                                case 25: sum += qlp_coeff[24] * data[i-25];
                                case 24: sum += qlp_coeff[23] * data[i-24];
                                case 23: sum += qlp_coeff[22] * data[i-23];
                                case 22: sum += qlp_coeff[21] * data[i-22];
                                case 21: sum += qlp_coeff[20] * data[i-21];
                                case 20: sum += qlp_coeff[19] * data[i-20];
                                case 19: sum += qlp_coeff[18] * data[i-19];
                                case 18: sum += qlp_coeff[17] * data[i-18];
                                case 17: sum += qlp_coeff[16] * data[i-17];
                                case 16: sum += qlp_coeff[15] * data[i-16];
                                case 15: sum += qlp_coeff[14] * data[i-15];
                                case 14: sum += qlp_coeff[13] * data[i-14];
                                case 13: sum += qlp_coeff[12] * data[i-13];
                                         sum += qlp_coeff[11] * data[i-12];
                                         sum += qlp_coeff[10] * data[i-11];
                                         sum += qlp_coeff[ 9] * data[i-10];
                                         sum += qlp_coeff[ 8] * data[i- 9];
                                         sum += qlp_coeff[ 7] * data[i- 8];
                                         sum += qlp_coeff[ 6] * data[i- 7];
                                         sum += qlp_coeff[ 5] * data[i- 6];
                                         sum += qlp_coeff[ 4] * data[i- 5];
                                         sum += qlp_coeff[ 3] * data[i- 4];
                                         sum += qlp_coeff[ 2] * data[i- 3];
                                         sum += qlp_coeff[ 1] * data[i- 2];
                                         sum += qlp_coeff[ 0] * data[i- 1];
                        }
                        residual[i] = data[i] - (sum >> lp_quantization);
                }
        }
}

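/* The function below is the general 32-bit variant: partial sums are kept in
   32-bit lanes without narrowing the history to 16 bits. SSE2 has no packed
   32x32->32 multiply (_mm_mullo_epi32 arrived with SSE4.1), so each pair of
   products is built with _mm_mul_epu32 instead. */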
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);

        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                if(order == 12) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
                                        xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
                                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
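
                                        /* _mm_mul_epu32 multiplies only lanes 0 and 2 of each
                                           operand, which is why the coefficients are spread to
                                           the even lanes above and each data pair is shuffled
                                           below so that data[i-1] meets q[0], data[i-2] meets
                                           q[1], and so on. */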

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[11] * data[i-12];
                                                //sum += qlp_coeff[10] * data[i-11];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
                                                xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
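                                                /* Unsigned multiply is safe for signed samples here:
                                                   only the low 32 bits of each 64-bit product are kept,
                                                   and the low 32 bits of a 32x32 multiply are identical
                                                   whether the operands are treated as signed or
                                                   unsigned. */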
794
795                                                 //sum += qlp_coeff[9] * data[i-10];
796                                                 //sum += qlp_coeff[8] * data[i-9];
797                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
798                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
799                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
800                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
801
802                                                 //sum += qlp_coeff[7] * data[i-8];
803                                                 //sum += qlp_coeff[6] * data[i-7];
804                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
805                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
806                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
807                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
808
809                                                 //sum += qlp_coeff[5] * data[i-6];
810                                                 //sum += qlp_coeff[4] * data[i-5];
811                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
812                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
813                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
814                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
815
816                                                 //sum += qlp_coeff[3] * data[i-4];
817                                                 //sum += qlp_coeff[2] * data[i-3];
818                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
819                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
820                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
821                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
822
823                                                 //sum += qlp_coeff[1] * data[i-2];
824                                                 //sum += qlp_coeff[0] * data[i-1];
825                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
826                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
827                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
828                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
829
830                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
831                                                 RESIDUAL_RESULT(xmm7);
832                                         }
833                                 }
834                                 else { /* order == 11 */
835                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
836                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
837                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
838                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
839                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
840                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
841                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
842
843                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
844                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
845                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
846                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
847                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
848
849                                         for(i = 0; i < (int)data_len; i++) {
850                                                 //sum = 0;
851                                                 //sum  = qlp_coeff[10] * data[i-11];
852                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
853                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
854
855                                                 //sum += qlp_coeff[9] * data[i-10];
856                                                 //sum += qlp_coeff[8] * data[i-9];
857                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
858                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
859                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
860                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
861
862                                                 //sum += qlp_coeff[7] * data[i-8];
863                                                 //sum += qlp_coeff[6] * data[i-7];
864                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
865                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
866                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
867                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
868
869                                                 //sum += qlp_coeff[5] * data[i-6];
870                                                 //sum += qlp_coeff[4] * data[i-5];
871                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
872                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
873                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
874                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
875
876                                                 //sum += qlp_coeff[3] * data[i-4];
877                                                 //sum += qlp_coeff[2] * data[i-3];
878                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
879                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
880                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
881                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
882
883                                                 //sum += qlp_coeff[1] * data[i-2];
884                                                 //sum += qlp_coeff[0] * data[i-1];
885                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
886                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
887                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
888                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
889
890                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
891                                                 RESIDUAL_RESULT(xmm7);
892                                         }
893                                 }
894                         }
895                         else { /* order == 9, 10 */
896                                 if(order == 10) {
897                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
898                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
899                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
900                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
901                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
902                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
903
904                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
905                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
906                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
907                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
908                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
909
910                                         for(i = 0; i < (int)data_len; i++) {
911                                                 //sum = 0;
912                                                 //sum += qlp_coeff[9] * data[i-10];
913                                                 //sum += qlp_coeff[8] * data[i-9];
914                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
915                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
916                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
917
918                                                 //sum += qlp_coeff[7] * data[i-8];
919                                                 //sum += qlp_coeff[6] * data[i-7];
920                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
921                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
922                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
923                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
924
925                                                 //sum += qlp_coeff[5] * data[i-6];
926                                                 //sum += qlp_coeff[4] * data[i-5];
927                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
928                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
929                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
930                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
931
932                                                 //sum += qlp_coeff[3] * data[i-4];
933                                                 //sum += qlp_coeff[2] * data[i-3];
934                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
935                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
936                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
937                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
938
939                                                 //sum += qlp_coeff[1] * data[i-2];
940                                                 //sum += qlp_coeff[0] * data[i-1];
941                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
942                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
943                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
944                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
945
946                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
947                                                 RESIDUAL_RESULT(xmm7);
948                                         }
949                                 }
950                                 else { /* order == 9 */
951                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
952                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
953                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
954                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
955                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
956                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
957
958                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
959                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
960                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
961                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
962
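                                            /* Odd orders: the unpaired last coefficient was loaded alone with
                                               _mm_cvtsi32_si128 (qlp_coeff[8] above) and starts the sum against
                                               the single oldest sample, i.e. sum = qlp_coeff[8] * data[i-9];
                                               the remaining eight taps accumulate pairwise as in the even-order
                                               blocks. */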
963                                         for(i = 0; i < (int)data_len; i++) {
964                                                 //sum = 0;
965                                                 //sum  = qlp_coeff[8] * data[i-9];
966                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
967                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
968
969                                                 //sum += qlp_coeff[7] * data[i-8];
970                                                 //sum += qlp_coeff[6] * data[i-7];
971                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
972                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
973                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
974                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
975
976                                                 //sum += qlp_coeff[5] * data[i-6];
977                                                 //sum += qlp_coeff[4] * data[i-5];
978                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
979                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
980                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
981                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
982
983                                                 //sum += qlp_coeff[3] * data[i-4];
984                                                 //sum += qlp_coeff[2] * data[i-3];
985                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
986                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
987                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
988                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
989
990                                                 //sum += qlp_coeff[1] * data[i-2];
991                                                 //sum += qlp_coeff[0] * data[i-1];
992                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
993                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
994                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
995                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
996
997                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
998                                                 RESIDUAL_RESULT(xmm7);
999                                         }
1000                                 }
1001                         }
1002                 }
1003                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1004                         if(order > 6) { /* order == 7, 8 */
1005                                 if(order == 8) {
1006                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1007                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1008                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1009                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1010                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1011
1012                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1013                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1014                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1015                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1016
1017                                         for(i = 0; i < (int)data_len; i++) {
1018                                                 //sum = 0;
1019                                                 //sum += qlp_coeff[7] * data[i-8];
1020                                                 //sum += qlp_coeff[6] * data[i-7];
1021                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1022                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1023                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1024
1025                                                 //sum += qlp_coeff[5] * data[i-6];
1026                                                 //sum += qlp_coeff[4] * data[i-5];
1027                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1028                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1029                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1030                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1031
1032                                                 //sum += qlp_coeff[3] * data[i-4];
1033                                                 //sum += qlp_coeff[2] * data[i-3];
1034                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1035                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1036                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1037                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1038
1039                                                 //sum += qlp_coeff[1] * data[i-2];
1040                                                 //sum += qlp_coeff[0] * data[i-1];
1041                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1042                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1043                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1044                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1045
1046                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1047                                                 RESIDUAL_RESULT(xmm7);
1048                                         }
1049                                 }
1050                                 else { /* order == 7 */
1051                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1052                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1053                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1054                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1055                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
1056
1057                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1058                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1059                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1060
1061                                         for(i = 0; i < (int)data_len; i++) {
1062                                                 //sum = 0;
1063                                                 //sum  = qlp_coeff[6] * data[i-7];
1064                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
1065                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1066
1067                                                 //sum += qlp_coeff[5] * data[i-6];
1068                                                 //sum += qlp_coeff[4] * data[i-5];
1069                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1070                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1071                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1072                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1073
1074                                                 //sum += qlp_coeff[3] * data[i-4];
1075                                                 //sum += qlp_coeff[2] * data[i-3];
1076                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1077                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1078                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1079                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1080
1081                                                 //sum += qlp_coeff[1] * data[i-2];
1082                                                 //sum += qlp_coeff[0] * data[i-1];
1083                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1084                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1085                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1086                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1087
1088                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1089                                                 RESIDUAL_RESULT(xmm7);
1090                                         }
1091                                 }
1092                         }
1093                         else { /* order == 5, 6 */
1094                                 if(order == 6) {
1095                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1096                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1097                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1098                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1099
1100                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1101                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1102                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1103
1104                                         for(i = 0; i < (int)data_len; i++) {
1105                                                 //sum = 0;
1106                                                 //sum += qlp_coeff[5] * data[i-6];
1107                                                 //sum += qlp_coeff[4] * data[i-5];
1108                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1109                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1110                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1111
1112                                                 //sum += qlp_coeff[3] * data[i-4];
1113                                                 //sum += qlp_coeff[2] * data[i-3];
1114                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1115                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1116                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1117                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1118
1119                                                 //sum += qlp_coeff[1] * data[i-2];
1120                                                 //sum += qlp_coeff[0] * data[i-1];
1121                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1122                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1123                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1124                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1125
1126                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1127                                                 RESIDUAL_RESULT(xmm7);
1128                                         }
1129                                 }
1130                                 else { /* order == 5 */
1131                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1132                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1133                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1134                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
1135
1136                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1137                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1138
1139                                         for(i = 0; i < (int)data_len; i++) {
1140                                                 //sum = 0;
1141                                                 //sum  = qlp_coeff[4] * data[i-5];
1142                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
1143                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1144
1145                                                 //sum += qlp_coeff[3] * data[i-4];
1146                                                 //sum += qlp_coeff[2] * data[i-3];
1147                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1148                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1149                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1150                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1151
1152                                                 //sum += qlp_coeff[1] * data[i-2];
1153                                                 //sum += qlp_coeff[0] * data[i-1];
1154                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1155                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1156                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1157                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1158
1159                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1160                                                 RESIDUAL_RESULT(xmm7);
1161                                         }
1162                                 }
1163                         }
1164                 }
1165                 else { /* order == 1, 2, 3, 4 */
1166                         if(order > 2) { /* order == 3, 4 */
1167                                 if(order == 4) {
1168                                         __m128i xmm0, xmm1, xmm6, xmm7;
1169                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1170                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1171
1172                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1173                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1174
1175                                         for(i = 0; i < (int)data_len; i++) {
1176                                                 //sum = 0;
1177                                                 //sum += qlp_coeff[3] * data[i-4];
1178                                                 //sum += qlp_coeff[2] * data[i-3];
1179                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1180                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1181                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1182
1183                                                 //sum += qlp_coeff[1] * data[i-2];
1184                                                 //sum += qlp_coeff[0] * data[i-1];
1185                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1186                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1187                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1188                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1189
1190                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1191                                                 RESIDUAL_RESULT(xmm7);
1192                                         }
1193                                 }
1194                                 else { /* order == 3 */
1195                                         __m128i xmm0, xmm1, xmm6, xmm7;
1196                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1197                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1198
1199                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1200
1201                                         for(i = 0; i < (int)data_len; i++) {
1202                                                 //sum = 0;
1203                                                 //sum  = qlp_coeff[2] * data[i-3];
1204                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1205                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1206
1207                                                 //sum += qlp_coeff[1] * data[i-2];
1208                                                 //sum += qlp_coeff[0] * data[i-1];
1209                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1210                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1211                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1212                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1213
1214                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1215                                                 RESIDUAL_RESULT(xmm7);
1216                                         }
1217                                 }
1218                         }
1219                         else { /* order == 1, 2 */
1220                                 if(order == 2) {
1221                                         __m128i xmm0, xmm7;
1222                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1223                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1224
1225                                         for(i = 0; i < (int)data_len; i++) {
1226                                                 //sum = 0;
1227                                                 //sum += qlp_coeff[1] * data[i-2];
1228                                                 //sum += qlp_coeff[0] * data[i-1];
1229                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1230                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1231                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
1232
1233                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1234                                                 RESIDUAL_RESULT(xmm7);
1235                                         }
1236                                 }
1237                                 else { /* order == 1 */
1238                                         for(i = 0; i < (int)data_len; i++)
1239                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1240                                 }
1241                         }
1242                 }
1243         }
1244         else { /* order > 12 */
1245                 FLAC__int32 sum;
1246                 for(i = 0; i < (int)data_len; i++) {
1247                         sum = 0;
1248                         switch(order) {
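                                        /* Deliberate fall-through: entering the switch at `order' accumulates
                                           exactly `order' taps, fully unrolled. */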
1249                                 case 32: sum += qlp_coeff[31] * data[i-32];
1250                                 case 31: sum += qlp_coeff[30] * data[i-31];
1251                                 case 30: sum += qlp_coeff[29] * data[i-30];
1252                                 case 29: sum += qlp_coeff[28] * data[i-29];
1253                                 case 28: sum += qlp_coeff[27] * data[i-28];
1254                                 case 27: sum += qlp_coeff[26] * data[i-27];
1255                                 case 26: sum += qlp_coeff[25] * data[i-26];
1256                                 case 25: sum += qlp_coeff[24] * data[i-25];
1257                                 case 24: sum += qlp_coeff[23] * data[i-24];
1258                                 case 23: sum += qlp_coeff[22] * data[i-23];
1259                                 case 22: sum += qlp_coeff[21] * data[i-22];
1260                                 case 21: sum += qlp_coeff[20] * data[i-21];
1261                                 case 20: sum += qlp_coeff[19] * data[i-20];
1262                                 case 19: sum += qlp_coeff[18] * data[i-19];
1263                                 case 18: sum += qlp_coeff[17] * data[i-18];
1264                                 case 17: sum += qlp_coeff[16] * data[i-17];
1265                                 case 16: sum += qlp_coeff[15] * data[i-16];
1266                                 case 15: sum += qlp_coeff[14] * data[i-15];
1267                                 case 14: sum += qlp_coeff[13] * data[i-14];
1268                                 case 13: sum += qlp_coeff[12] * data[i-13];
1269                                          sum += qlp_coeff[11] * data[i-12];
1270                                          sum += qlp_coeff[10] * data[i-11];
1271                                          sum += qlp_coeff[ 9] * data[i-10];
1272                                          sum += qlp_coeff[ 8] * data[i- 9];
1273                                          sum += qlp_coeff[ 7] * data[i- 8];
1274                                          sum += qlp_coeff[ 6] * data[i- 7];
1275                                          sum += qlp_coeff[ 5] * data[i- 6];
1276                                          sum += qlp_coeff[ 4] * data[i- 5];
1277                                          sum += qlp_coeff[ 3] * data[i- 4];
1278                                          sum += qlp_coeff[ 2] * data[i- 3];
1279                                          sum += qlp_coeff[ 1] * data[i- 2];
1280                                          sum += qlp_coeff[ 0] * data[i- 1];
1281                         }
1282                         residual[i] = data[i] - (sum >> lp_quantization);
1283                 }
1284         }
1285 }
1286
1287 FLAC__SSE_TARGET("sse2")
1288 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1289 {
1290         int i;
1291         FLAC__int32 sum;
1292         if(order < 8) {
1293                 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
1294                 return;
1295         }
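        /* Signal restoration is recursive - each output sample must be fed back
           into the history before the next one can be predicted - so the SIMD
           payoff is smaller than on the residual side; for fewer than 8 taps the
           plain C routine is used, presumably because the vector setup cost
           outweighs the gain there. */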
1296
1297         FLAC__ASSERT(order > 0);
1298         FLAC__ASSERT(order <= 32);
1299         FLAC__ASSERT(data_len > 0);
1300
1301         if(order <= 12) {
1302                 FLAC__int32 curr;
1303                 if(order > 8) { /* order == 9, 10, 11, 12 */
1304 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
1305                         int r;
1306                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1307                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1308                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1309                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1310                         switch(order)                                          /* ...and zero them out */
1311                         {
1312                         case 9:
1313                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1314                         case 10:
1315                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1316                         case 11:
1317                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1318                         }
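                        /* Masking by byte shifts: for order == 9 the slli/srli pair moves
                           qlp_coeff[8] up 12 bytes and back down, zeroing lanes 1..3, leaving
                           xmm1 = { qlp_coeff[8], 0, 0, 0 }; orders 10 and 11 keep two and
                           three valid lanes the same way, and order 12 needs no masking. */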
1319                         xmm2 = _mm_setzero_si128();
1320                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1321                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1322
1323                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1324                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1325                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1326                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1327                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1328                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1329                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1330                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
1331
1332                         xmm7 = _mm_slli_si128(xmm1, 2);
1333                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1334                         xmm2 = _mm_slli_si128(xmm0, 2);
1335
1336                         /* xmm0, xmm1: qlp_coeff
1337                            xmm2, xmm7: qlp_coeff << 16 bit
1338                            xmm3, xmm4: data */
1339
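                        /* The _16 variant is only used when samples and coefficients fit in
                           16 bits, so both are packed with _mm_packs_epi32, the history
                           reversed so that data[i-1] sits in lane 0 of xmm3.  _mm_madd_epi16
                           forms four 32-bit sums of adjacent coeff*sample pairs, and the two
                           srli_si128/add_epi32 folds reduce them to the dot product in lane 0;
                           roughly:

                               sum = 0;
                               for(j = 0; j < order; j++)
                                       sum += qlp_coeff[j] * data[i-j-1];

                           xmm2:xmm7 hold the coefficients pre-shifted one lane up, so the main
                           loop below advances the history by two samples with a single 4-byte
                           shift and still computes both outputs. */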
1340                         xmm6 = xmm4;
1341                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1342                         xmm5 = xmm3;
1343                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1344                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1345                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1346                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1347
1348                         DATA16_RESULT(xmm6);
1349
1350                         data_len--;
1351                         r = data_len % 2;
1352
1353                         if(r) {
1354                                 xmm4 = _mm_slli_si128(xmm4, 2);
1355                                 xmm6 = xmm3;
1356                                 xmm3 = _mm_slli_si128(xmm3, 2);
1357                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1358                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1359
1360                                 xmm6 = xmm4;
1361                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1362                                 xmm5 = xmm3;
1363                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1364                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1365                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1366                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1367
1368                                 DATA16_RESULT(xmm6);
1369
1370                                 data_len--;
1371                         }
1372
1373                         while(data_len) { /* data_len is a multiple of 2 */
1374                                 /* one fewer _mm_slli_si128 per data element, at the cost of the shifted qlp_coeff copies kept in xmm2:xmm7 */
1375                                 xmm4 = _mm_slli_si128(xmm4, 4);
1376                                 xmm6 = xmm3;
1377                                 xmm3 = _mm_slli_si128(xmm3, 4);
1378                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
1379                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1380
1381                                 xmm6 = xmm4;
1382                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1383                                 xmm5 = xmm3;
1384                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1385                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1386                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1387                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1388
1389                                 DATA16_RESULT(xmm6);
1390
1391                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1392
1393                                 xmm6 = xmm4;
1394                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1395                                 xmm5 = xmm3;
1396                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1397                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1398                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1399                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1400
1401                                 DATA16_RESULT(xmm6);
1402
1403                                 data_len-=2;
1404                         }
1405 #else /* 16 XMM registers available */
1406                         int r;
1407                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
1408                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1409                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1410                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1411                         switch(order)                                          /* ...and zero them out */
1412                         {
1413                         case 9:
1414                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1415                         case 10:
1416                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1417                         case 11:
1418                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1419                         }
1420                         xmm2 = _mm_setzero_si128();
1421                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1422                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1423
1424                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1425                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1426                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1427                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1428                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1429                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1430                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1431                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
1432
1433                         xmm7 = _mm_slli_si128(xmm1, 2);
1434                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1435                         xmm2 = _mm_slli_si128(xmm0, 2);
1436
1437                         xmm9 = _mm_slli_si128(xmm1, 4);
1438                         xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
1439                         xmm8 = _mm_slli_si128(xmm0, 4);
1440
1441                         xmmB = _mm_slli_si128(xmm1, 6);
1442                         xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
1443                         xmmA = _mm_slli_si128(xmm0, 6);
1444
1445                         /* xmm0, xmm1: qlp_coeff
1446                            xmm2, xmm7: qlp_coeff << 16 bit
1447                            xmm8, xmm9: qlp_coeff << 2*16 bit
1448                            xmmA, xmmB: qlp_coeff << 3*16 bit
1449                            xmm3, xmm4: data */
1450
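                        /* With 16 XMM registers, three pre-shifted coefficient copies
                           (xmm2:xmm7, xmm8:xmm9, xmmA:xmmB) stay resident, so the main loop
                           shifts the history only once per four output samples instead of
                           once per sample. */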
1451                         xmm6 = xmm4;
1452                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1453                         xmm5 = xmm3;
1454                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1455                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1456                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1457                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1458
1459                         DATA16_RESULT(xmm6);
1460
1461                         data_len--;
1462                         r = data_len % 4;
1463
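                        /* Peel data_len % 4 samples singly so the unrolled loop sees a
                           multiple of four; e.g. an initial data_len of 11 leaves 10 after
                           the first sample above: r == 2 peeled here, then 8 handled below
                           in two passes of four. */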
1464                         while(r) {
1465                                 xmm4 = _mm_slli_si128(xmm4, 2);
1466                                 xmm6 = xmm3;
1467                                 xmm3 = _mm_slli_si128(xmm3, 2);
1468                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1469                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1470
1471                                 xmm6 = xmm4;
1472                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1473                                 xmm5 = xmm3;
1474                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1475                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1476                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1477                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1478
1479                                 DATA16_RESULT(xmm6);
1480
1481                                 data_len--; r--;
1482                         }
1483
1484                         while(data_len) { /* data_len is a multiple of 4 */
1485                                 xmm4 = _mm_slli_si128(xmm4, 8);
1486                                 xmm6 = xmm3;
1487                                 xmm3 = _mm_slli_si128(xmm3, 8);
1488                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
1489
1490                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1491
1492                                 xmm6 = xmm4;
1493                                 xmm6 = _mm_madd_epi16(xmm6, xmmB);
1494                                 xmm5 = xmm3;
1495                                 xmm5 = _mm_madd_epi16(xmm5, xmmA);
1496                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1497                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1498                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1499
1500                                 DATA16_RESULT(xmm6);
1501
1502                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1503
1504                                 xmm6 = xmm4;
1505                                 xmm6 = _mm_madd_epi16(xmm6, xmm9);
1506                                 xmm5 = xmm3;
1507                                 xmm5 = _mm_madd_epi16(xmm5, xmm8);
1508                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1509                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1510                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1511
1512                                 DATA16_RESULT(xmm6);
1513
1514                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1515
1516                                 xmm6 = xmm4;
1517                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1518                                 xmm5 = xmm3;
1519                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1520                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1521                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1522                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1523
1524                                 DATA16_RESULT(xmm6);
1525
1526                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1527
1528                                 xmm6 = xmm4;
1529                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1530                                 xmm5 = xmm3;
1531                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1532                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1533                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1534                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1535
1536                                 DATA16_RESULT(xmm6);
1537
1538                                 data_len-=4;
1539                         }
1540 #endif
1541                 } /* endif(order > 8) */
1542                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1543                         if(order > 6) { /* order == 7, 8 */
1544                                 if(order == 8) {
1545                                         __m128i xmm0, xmm1, xmm3, xmm6;
1546                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1547                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1548                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1549
1550                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1551                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1552                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1553                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1554                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1555
1556                                         /* xmm0: qlp_coeff
1557                                            xmm3: data */
1558
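                                        /* With exactly 8 taps, coefficients and history each fill one
                                           8x16-bit register, so no shifted copies are needed: each new
                                           sample costs one 2-byte history shift plus one
                                           _mm_insert_epi16. */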
1559                                         xmm6 = xmm3;
1560                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1561                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1562                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1563
1564                                         DATA16_RESULT(xmm6);
1565
1566                                         data_len--;
1567
1568                                         while(data_len) {
1569                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1570                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1571
1572                                                 xmm6 = xmm3;
1573                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1574                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1575                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1576
1577                                                 DATA16_RESULT(xmm6);
1578
1579                                                 data_len--;
1580                                         }
1581                                 }
1582                                 else { /* order == 7 */
1583                                         int r;
1584                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
1585                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1586                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1587                                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
1588                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1589
1590                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1591                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1592                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1593                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1594                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1595                                         xmm2 = _mm_slli_si128(xmm0, 2);
1596
1597                                         /* xmm0: qlp_coeff
1598                                            xmm2: qlp_coeff << 16 bit
1599                                            xmm3: data */
1600
1601                                         xmm6 = xmm3;
1602                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1603                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1604                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1605
1606                                         DATA16_RESULT(xmm6);
1607
1608                                         data_len--;
1609                                         r = data_len % 2;
1610
1611                                         if(r) {
1612                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1613                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1614
1615                                                 xmm6 = xmm3;
1616                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1617                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1618                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1619
1620                                                 DATA16_RESULT(xmm6);
1621
1622                                                 data_len--;
1623                                         }
1624
1625                                         while(data_len) { /* data_len is a multiple of 2 */
1626                                                 xmm3 = _mm_slli_si128(xmm3, 4);
1627                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1628
1629                                                 xmm6 = xmm3;
1630                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1631                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1632                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1633
1634                                                 DATA16_RESULT(xmm6);
1635
1636                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1637                                                 xmm6 = xmm3;
1638                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1639                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1640                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1641
1642                                                 DATA16_RESULT(xmm6);
1643
1644                                                 data_len-=2;
1645                                         }
1646                                 }
1647                         }
1648                         else { /* order == 5, 6 */
1649                                 if(order == 6) {
1650                                         int r;
1651                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
1652                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1653                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1654                                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
1655                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1656
1657                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1658                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1659                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1660                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1661                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1662                                         xmm2 = _mm_slli_si128(xmm0, 2);
1663                                         xmm4 = _mm_slli_si128(xmm0, 4);
1664
1665                                         /* xmm0: qlp_coeff
1666                                            xmm2: qlp_coeff << 16 bit
1667                                            xmm4: qlp_coeff << 2*16 bit
1668                                            xmm3: data */
1669
1670                                         xmm6 = xmm3;
1671                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1672                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1673                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1674
1675                                         DATA16_RESULT(xmm6);
1676
1677                                         data_len--;
1678                                         r = data_len % 3;
1679
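                                        /* Six taps leave two free 16-bit lanes, so the shifted copies
                                           xmm2 and xmm4 still fit in single registers and the main loop
                                           can emit three samples per 6-byte history shift; the
                                           data_len % 3 remainder is peeled singly first. */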
1680                                         while(r) {
1681                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1682                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1683
1684                                                 xmm6 = xmm3;
1685                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1686                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1687                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1688
1689                                                 DATA16_RESULT(xmm6);
1690
1691                                                 data_len--; r--;
1692                                         }
1693
1694                                         while(data_len) { /* data_len is a multiple of 3 */
1695                                                 xmm3 = _mm_slli_si128(xmm3, 6);
1696                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1697
1698                                                 xmm6 = xmm3;
1699                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
1700                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1701                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1702
1703                                                 DATA16_RESULT(xmm6);
1704
1705                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1706
1707                                                 xmm6 = xmm3;
1708                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1709                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1710                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1711
1712                                                 DATA16_RESULT(xmm6);
1713
1714                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1715
1716                                                 xmm6 = xmm3;
1717                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1718                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1719                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1720
1721                                                 DATA16_RESULT(xmm6);
1722
1723                                                 data_len-=3;
1724                                         }
1725                                 }
1726                                 else { /* order == 5 */
1727                                         int r;
1728                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1729                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1730                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1731                                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
1732                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1733
1734                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1735                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1736                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1737                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1738                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1739                                         xmm2 = _mm_slli_si128(xmm0, 2);
1740                                         xmm4 = _mm_slli_si128(xmm0, 4);
1741                                         xmm5 = _mm_slli_si128(xmm0, 6);
1742
1743                                         /* xmm0: qlp_coeff
1744                                            xmm2: qlp_coeff << 16 bit
1745                                            xmm4: qlp_coeff << 2*16 bit
1746                                            xmm5: qlp_coeff << 3*16 bit
1747                                            xmm3: data */
1748
1749                                         xmm6 = xmm3;
1750                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1751                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1752                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1753
1754                                         DATA16_RESULT(xmm6);
1755
1756                                         data_len--;
1757                                         r = data_len % 4;
1758
1759                                         while(r) {
1760                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1761                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1762
1763                                                 xmm6 = xmm3;
1764                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1765                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1766                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1767
1768                                                 DATA16_RESULT(xmm6);
1769
1770                                                 data_len--; r--;
1771                                         }
1772
					while(data_len) { /* data_len is a multiple of 4 */
						xmm3 = _mm_slli_si128(xmm3, 8);
						xmm3 = _mm_insert_epi16(xmm3, curr, 3);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm5);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 2);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm4);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						data_len-=4;
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) {
				if(order == 4) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);
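					/* this 16-bit path assumes samples and coefficients fit in 16 bits, so the saturating packs above cannot change any values */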

					/* xmm0: qlp_coeff
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					DATA16_RESULT(xmm6);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						data_len--;
					}
				}
				else { /* order == 3 */
					int r;
					__m128i xmm0, xmm1, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
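					/* zero the top dword: only three coefficients are valid, the fourth word was read past the end of qlp_coeff */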
					xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);
					xmm1 = _mm_slli_si128(xmm0, 2);

					/* xmm0: qlp_coeff
					   xmm1: qlp_coeff << 16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					DATA16_RESULT(xmm6);

					data_len--;
					r = data_len % 2;

					if(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						data_len--;
					}

					while(data_len) { /* data_len is a multiple of 2 */
						xmm3 = _mm_slli_si128(xmm3, 4);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm1);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						DATA16_RESULT(xmm6);

						data_len-=2;
					}
				}
			}
			else {
				if(order == 2) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);

					/* xmm0: qlp_coeff
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);

					DATA16_RESULT(xmm6);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);

						DATA16_RESULT(xmm6);

						data_len--;
					}
				}
				else { /* order == 1 */
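					/* a first-order predictor is a serial recurrence (each output feeds the next), so plain scalar code is used */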
					for(i = 0; i < (int)data_len; i++)
						data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
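				/* intentional fall-through: each case adds its tap, then drops into the cases below */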
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			data[i] = residual[i] + (sum >> lp_quantization);
		}
	}
}

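/* Restore the signal from the residual for samples wider than the 16-bit fast path:
   data[i] = residual[i] + (prediction over the previous `order` samples >> lp_quantization). */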
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

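					/* the shuffles above spread each coefficient pair into the even
					   dword lanes, so each _mm_mul_epu32 below yields two of the
					   twelve 32-bit products per iteration */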
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
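						/* (valid for signed samples too: the low 32 bits of a 32x32-bit product do not depend on signedness) */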

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

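						/* horizontal sum: add the partial sum in dword 2 to dword 0, which then holds the whole prediction */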
						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
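					/* odd order: the remaining coefficient is loaded by itself into the low dword */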

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
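				/* intentional fall-through: each case accumulates its tap and all lower ones */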
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			data[i] = residual[i] + (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */