Add sse2 intrinsics code for lpc_restore_signal_...()
[flac.git] / src / libFLAC / lpc_intrin_sse2.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

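/* Helper macros: take the 32-bit prediction from the low dword of an XMM
   register and apply the quantization shift. The 16-bit variants also step
   the data/residual pointers and keep the current sample in `curr` so the
   caller can shift it back into the history registers. */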
#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

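/* 16-bit variant: assumes samples and quantized coefficients fit in 16 bits,
   so both can be packed to 16-bit lanes (note the saturating _mm_packs_epi32)
   and a single _mm_madd_epi16 handles eight taps at once. */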
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;
        FLAC__int32 sum;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);
        FLAC__ASSERT(data_len > 0);

        if(order <= 12) {
                FLAC__int32 curr;
                if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
                        int r;
                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                        xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
                        switch(order)                                          /* ...and zero them out */
                        {
                        case 9:
                                xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
                        case 10:
                                xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
                        case 11:
                                xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
                        }
                        xmm2 = _mm_setzero_si128();
                        xmm0 = _mm_packs_epi32(xmm0, xmm6);
                        xmm1 = _mm_packs_epi32(xmm1, xmm2);

                        xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
                        xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                        xmm4 = _mm_packs_epi32(xmm4, xmm2);
                        xmm3 = _mm_packs_epi32(xmm3, xmm5);

                        xmm7 = _mm_slli_si128(xmm1, 2);
                        xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
                        xmm2 = _mm_slli_si128(xmm0, 2);

                        /* xmm0, xmm1: qlp_coeff
                           xmm2, xmm7: qlp_coeff << 16 bit
                           xmm3, xmm4: data */

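                        /* _mm_madd_epi16 multiplies eight 16-bit coeff/sample pairs and adds
                           adjacent products into four 32-bit sums; the two shift-and-add steps
                           below fold those sums into the low dword, giving the prediction */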
                        xmm6 = xmm4;
                        xmm6 = _mm_madd_epi16(xmm6, xmm1);
                        xmm5 = xmm3;
                        xmm5 = _mm_madd_epi16(xmm5, xmm0);
                        xmm6 = _mm_add_epi32(xmm6, xmm5);
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                        RESIDUAL16_RESULT(xmm6);

                        data_len--;
                        r = data_len % 2;

                        if(r) {
                                xmm4 = _mm_slli_si128(xmm4, 2);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 2);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len--;
                        }

                        while(data_len) { /* data_len is a multiple of 2 */
                                /* one _mm_slli_si128 less per data element, at the cost of keeping the shifted qlp_coeff copies in xmm2:xmm7 */
                                xmm4 = _mm_slli_si128(xmm4, 4);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 4);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm7);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm2);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len-=2;
                        }
#else /* 16 XMM registers available */
                        int r;
                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                        xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
                        switch(order)                                          /* ...and zero them out */
                        {
                        case 9:
                                xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
                        case 10:
                                xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
                        case 11:
                                xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
                        }
                        xmm2 = _mm_setzero_si128();
                        xmm0 = _mm_packs_epi32(xmm0, xmm6);
                        xmm1 = _mm_packs_epi32(xmm1, xmm2);

                        xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
                        xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                        xmm4 = _mm_packs_epi32(xmm4, xmm2);
                        xmm3 = _mm_packs_epi32(xmm3, xmm5);

                        xmm7 = _mm_slli_si128(xmm1, 2);
                        xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
                        xmm2 = _mm_slli_si128(xmm0, 2);

                        xmm9 = _mm_slli_si128(xmm1, 4);
                        xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
                        xmm8 = _mm_slli_si128(xmm0, 4);

                        xmmB = _mm_slli_si128(xmm1, 6);
                        xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
                        xmmA = _mm_slli_si128(xmm0, 6);

                        /* xmm0, xmm1: qlp_coeff
                           xmm2, xmm7: qlp_coeff << 16 bit
                           xmm8, xmm9: qlp_coeff << 2*16 bit
                           xmmA, xmmB: qlp_coeff << 3*16 bit
                           xmm3, xmm4: data */

                        xmm6 = xmm4;
                        xmm6 = _mm_madd_epi16(xmm6, xmm1);
                        xmm5 = xmm3;
                        xmm5 = _mm_madd_epi16(xmm5, xmm0);
                        xmm6 = _mm_add_epi32(xmm6, xmm5);
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                        RESIDUAL16_RESULT(xmm6);

                        data_len--;
                        r = data_len % 4;

                        while(r) {
                                xmm4 = _mm_slli_si128(xmm4, 2);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 2);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len--; r--;
                        }

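                        /* main loop: the data registers are shifted only once (by 8 bytes)
                           per iteration; the coeff copies pre-shifted by 1..3 lanes (xmm2:xmm7,
                           xmm8:xmm9, xmmA:xmmB) let us compute four residuals per pass */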
                        while(data_len) { /* data_len is a multiple of 4 */
                                xmm4 = _mm_slli_si128(xmm4, 8);
                                xmm6 = xmm3;
                                xmm3 = _mm_slli_si128(xmm3, 8);
                                xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));

                                xmm3 = _mm_insert_epi16(xmm3, curr, 3);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmmB);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmmA);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm9);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm8);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm7);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm2);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                xmm6 = xmm4;
                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                xmm5 = xmm3;
                                xmm5 = _mm_madd_epi16(xmm5, xmm0);
                                xmm6 = _mm_add_epi32(xmm6, xmm5);
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                RESIDUAL16_RESULT(xmm6);

                                data_len-=4;
                        }
#endif
                } /* endif(order > 8) */
                else if(order > 4) { /* order == 5, 6, 7, 8 */
                        if(order > 6) { /* order == 7, 8 */
                                if(order == 8) {
                                        __m128i xmm0, xmm1, xmm3, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 7 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 2;

                                        if(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 2 */
                                                xmm3 = _mm_slli_si128(xmm3, 4);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=2;
                                        }
                                }
                        }
                        else { /* order == 5, 6 */
                                if(order == 6) {
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);
                                        xmm4 = _mm_slli_si128(xmm0, 4);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm4: qlp_coeff << 2*16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 3;

                                        while(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--; r--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 3 */
                                                xmm3 = _mm_slli_si128(xmm3, 6);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm4);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=3;
                                        }
                                }
                                else { /* order == 5 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
                                        xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm1);

                                        xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm1);
                                        xmm2 = _mm_slli_si128(xmm0, 2);
                                        xmm4 = _mm_slli_si128(xmm0, 4);
                                        xmm5 = _mm_slli_si128(xmm0, 6);

                                        /* xmm0: qlp_coeff
                                           xmm2: qlp_coeff << 16 bit
                                           xmm4: qlp_coeff << 2*16 bit
                                           xmm5: qlp_coeff << 3*16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 4;

                                        while(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--; r--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 4 */
                                                xmm3 = _mm_slli_si128(xmm3, 8);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 3);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm5);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm4);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm2);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=4;
                                        }
                                }
                        }
                }
                else { /* order == 1, 2, 3, 4 */
                        if(order > 2) {
                                if(order == 4) {
                                        __m128i xmm0, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 3 */
                                        int r;
                                        __m128i xmm0, xmm1, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);
                                        xmm1 = _mm_slli_si128(xmm0, 2);

                                        /* xmm0: qlp_coeff
                                           xmm1: qlp_coeff << 16 bit
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;
                                        r = data_len % 2;

                                        if(r) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }

                                        while(data_len) { /* data_len is a multiple of 2 */
                                                xmm3 = _mm_slli_si128(xmm3, 4);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len-=2;
                                        }
                                }
                        }
                        else {
                                if(order == 2) {
                                        __m128i xmm0, xmm3, xmm6;
                                        xmm6 = _mm_setzero_si128();
                                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
                                        xmm0 = _mm_packs_epi32(xmm0, xmm6);

                                        xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                                        xmm3 = _mm_packs_epi32(xmm3, xmm6);

                                        /* xmm0: qlp_coeff
                                           xmm3: data */

                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);

                                        RESIDUAL16_RESULT(xmm6);

                                        data_len--;

                                        while(data_len) {
                                                xmm3 = _mm_slli_si128(xmm3, 2);
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);

                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);

                                                RESIDUAL16_RESULT(xmm6);

                                                data_len--;
                                        }
                                }
                                else { /* order == 1 */
                                        for(i = 0; i < (int)data_len; i++)
                                                residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
                                }
                        }
                }
        }
        else { /* order > 12 */
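                /* unrolled scalar dot product: the switch falls through, so each case
                   adds one more tap and order N executes cases N down to 13 plus the
                   common 12-tap tail */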
                for(i = 0; i < (int)data_len; i++) {
                        sum = 0;
                        switch(order) {
                                case 32: sum += qlp_coeff[31] * data[i-32];
                                case 31: sum += qlp_coeff[30] * data[i-31];
                                case 30: sum += qlp_coeff[29] * data[i-30];
                                case 29: sum += qlp_coeff[28] * data[i-29];
                                case 28: sum += qlp_coeff[27] * data[i-28];
                                case 27: sum += qlp_coeff[26] * data[i-27];
                                case 26: sum += qlp_coeff[25] * data[i-26];
                                case 25: sum += qlp_coeff[24] * data[i-25];
                                case 24: sum += qlp_coeff[23] * data[i-24];
                                case 23: sum += qlp_coeff[22] * data[i-23];
                                case 22: sum += qlp_coeff[21] * data[i-22];
                                case 21: sum += qlp_coeff[20] * data[i-21];
                                case 20: sum += qlp_coeff[19] * data[i-20];
                                case 19: sum += qlp_coeff[18] * data[i-19];
                                case 18: sum += qlp_coeff[17] * data[i-18];
                                case 17: sum += qlp_coeff[16] * data[i-17];
                                case 16: sum += qlp_coeff[15] * data[i-16];
                                case 15: sum += qlp_coeff[14] * data[i-15];
                                case 14: sum += qlp_coeff[13] * data[i-14];
                                case 13: sum += qlp_coeff[12] * data[i-13];
                                         sum += qlp_coeff[11] * data[i-12];
                                         sum += qlp_coeff[10] * data[i-11];
                                         sum += qlp_coeff[ 9] * data[i-10];
                                         sum += qlp_coeff[ 8] * data[i- 9];
                                         sum += qlp_coeff[ 7] * data[i- 8];
                                         sum += qlp_coeff[ 6] * data[i- 7];
                                         sum += qlp_coeff[ 5] * data[i- 6];
                                         sum += qlp_coeff[ 4] * data[i- 5];
                                         sum += qlp_coeff[ 3] * data[i- 4];
                                         sum += qlp_coeff[ 2] * data[i- 3];
                                         sum += qlp_coeff[ 1] * data[i- 2];
                                         sum += qlp_coeff[ 0] * data[i- 1];
                        }
                        residual[i] = data[i] - (sum >> lp_quantization);
                }
        }
}

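/* General 32-bit version: _mm_mul_epu32 forms two 32x32->64 products per
   register (dword lanes 0 and 2); only the low 32 bits of each product are
   kept, which matches C's truncating integer multiplication. */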
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);

        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                if(order == 12) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
                                        xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
                                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

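                                        /* coefficients now occupy dword lanes 0 and 2, matching the
                                           layout of the shuffled data below for _mm_mul_epu32 */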
                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[11] * data[i-12];
                                                //sum += qlp_coeff[10] * data[i-11];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
                                                xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard the high dword of each result; the low 32 bits of the product are the same for signed and unsigned operands */

                                                //sum += qlp_coeff[9] * data[i-10];
                                                //sum += qlp_coeff[8] * data[i-9];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm4);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[7] * data[i-8];
                                                //sum += qlp_coeff[6] * data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * data[i-6];
                                                //sum += qlp_coeff[4] * data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * data[i-4];
                                                //sum += qlp_coeff[2] * data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * data[i-2];
                                                //sum += qlp_coeff[0] * data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 11 */
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
                                        xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[10] * data[i-11];
                                                xmm7 = _mm_cvtsi32_si128(data[i-11]);
                                                xmm7 = _mm_mul_epu32(xmm7, xmm5);

                                                //sum += qlp_coeff[9] * data[i-10];
                                                //sum += qlp_coeff[8] * data[i-9];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm4);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[7] * data[i-8];
                                                //sum += qlp_coeff[6] * data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * data[i-6];
                                                //sum += qlp_coeff[4] * data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * data[i-4];
                                                //sum += qlp_coeff[2] * data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * data[i-2];
                                                //sum += qlp_coeff[0] * data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL_RESULT(xmm7);
                                        }
                                }
                        }
                        else { /* order == 9, 10 */
                                if(order == 10) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[9] * data[i-10];
                                                //sum += qlp_coeff[8] * data[i-9];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epu32(xmm7, xmm4);

                                                //sum += qlp_coeff[7] * data[i-8];
                                                //sum += qlp_coeff[6] * data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * data[i-6];
                                                //sum += qlp_coeff[4] * data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * data[i-4];
                                                //sum += qlp_coeff[2] * data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * data[i-2];
                                                //sum += qlp_coeff[0] * data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 9 */
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[8] * data[i-9];
                                                xmm7 = _mm_cvtsi32_si128(data[i-9]);
                                                xmm7 = _mm_mul_epu32(xmm7, xmm4);

                                                //sum += qlp_coeff[7] * data[i-8];
                                                //sum += qlp_coeff[6] * data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
973                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
974                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
975
976                                                 //sum += qlp_coeff[5] * data[i-6];
977                                                 //sum += qlp_coeff[4] * data[i-5];
978                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
979                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
980                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
981                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
982
983                                                 //sum += qlp_coeff[3] * data[i-4];
984                                                 //sum += qlp_coeff[2] * data[i-3];
985                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
986                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
987                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
988                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
989
990                                                 //sum += qlp_coeff[1] * data[i-2];
991                                                 //sum += qlp_coeff[0] * data[i-1];
992                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
993                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
994                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
995                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
996
997                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
998                                                 RESIDUAL_RESULT(xmm7);
999                                         }
1000                                 }
1001                         }
1002                 }
1003                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1004                         if(order > 6) { /* order == 7, 8 */
1005                                 if(order == 8) {
1006                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1007                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1008                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1009                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1010                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1011
1012                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1013                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1014                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1015                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1016
1017                                         for(i = 0; i < (int)data_len; i++) {
1018                                                 //sum = 0;
1019                                                 //sum += qlp_coeff[7] * data[i-8];
1020                                                 //sum += qlp_coeff[6] * data[i-7];
1021                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1022                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1023                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1024
1025                                                 //sum += qlp_coeff[5] * data[i-6];
1026                                                 //sum += qlp_coeff[4] * data[i-5];
1027                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1028                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1029                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1030                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1031
1032                                                 //sum += qlp_coeff[3] * data[i-4];
1033                                                 //sum += qlp_coeff[2] * data[i-3];
1034                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1035                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1036                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1037                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1038
1039                                                 //sum += qlp_coeff[1] * data[i-2];
1040                                                 //sum += qlp_coeff[0] * data[i-1];
1041                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1042                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1043                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1044                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1045
1046                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1047                                                 RESIDUAL_RESULT(xmm7);
1048                                         }
1049                                 }
1050                                 else { /* order == 7 */
1051                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1052                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1053                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1054                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1055                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
1056
1057                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1058                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1059                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1060
1061                                         for(i = 0; i < (int)data_len; i++) {
1062                                                 //sum = 0;
1063                                                 //sum  = qlp_coeff[6] * data[i-7];
1064                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
1065                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1066
1067                                                 //sum += qlp_coeff[5] * data[i-6];
1068                                                 //sum += qlp_coeff[4] * data[i-5];
1069                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1070                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1071                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1072                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1073
1074                                                 //sum += qlp_coeff[3] * data[i-4];
1075                                                 //sum += qlp_coeff[2] * data[i-3];
1076                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1077                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1078                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1079                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1080
1081                                                 //sum += qlp_coeff[1] * data[i-2];
1082                                                 //sum += qlp_coeff[0] * data[i-1];
1083                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1084                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1085                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1086                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1087
1088                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1089                                                 RESIDUAL_RESULT(xmm7);
1090                                         }
1091                                 }
1092                         }
1093                         else { /* order == 5, 6 */
1094                                 if(order == 6) {
1095                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1096                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1097                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1098                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1099
1100                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1101                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1102                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1103
1104                                         for(i = 0; i < (int)data_len; i++) {
1105                                                 //sum = 0;
1106                                                 //sum += qlp_coeff[5] * data[i-6];
1107                                                 //sum += qlp_coeff[4] * data[i-5];
1108                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1109                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1110                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1111
1112                                                 //sum += qlp_coeff[3] * data[i-4];
1113                                                 //sum += qlp_coeff[2] * data[i-3];
1114                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1115                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1116                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1117                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1118
1119                                                 //sum += qlp_coeff[1] * data[i-2];
1120                                                 //sum += qlp_coeff[0] * data[i-1];
1121                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1122                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1123                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1124                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1125
1126                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1127                                                 RESIDUAL_RESULT(xmm7);
1128                                         }
1129                                 }
1130                                 else { /* order == 5 */
1131                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1132                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1133                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1134                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
1135
1136                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1137                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1138
1139                                         for(i = 0; i < (int)data_len; i++) {
1140                                                 //sum = 0;
1141                                                 //sum  = qlp_coeff[4] * data[i-5];
1142                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
1143                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1144
1145                                                 //sum += qlp_coeff[3] * data[i-4];
1146                                                 //sum += qlp_coeff[2] * data[i-3];
1147                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1148                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1149                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1150                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1151
1152                                                 //sum += qlp_coeff[1] * data[i-2];
1153                                                 //sum += qlp_coeff[0] * data[i-1];
1154                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1155                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1156                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1157                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1158
1159                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1160                                                 RESIDUAL_RESULT(xmm7);
1161                                         }
1162                                 }
1163                         }
1164                 }
1165                 else { /* order == 1, 2, 3, 4 */
1166                         if(order > 2) { /* order == 3, 4 */
1167                                 if(order == 4) {
1168                                         __m128i xmm0, xmm1, xmm6, xmm7;
1169                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1170                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1171
1172                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1173                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1174
1175                                         for(i = 0; i < (int)data_len; i++) {
1176                                                 //sum = 0;
1177                                                 //sum += qlp_coeff[3] * data[i-4];
1178                                                 //sum += qlp_coeff[2] * data[i-3];
1179                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1180                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1181                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1182
1183                                                 //sum += qlp_coeff[1] * data[i-2];
1184                                                 //sum += qlp_coeff[0] * data[i-1];
1185                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1186                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1187                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1188                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1189
1190                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1191                                                 RESIDUAL_RESULT(xmm7);
1192                                         }
1193                                 }
1194                                 else { /* order == 3 */
1195                                         __m128i xmm0, xmm1, xmm6, xmm7;
1196                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1197                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1198
1199                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1200
1201                                         for(i = 0; i < (int)data_len; i++) {
1202                                                 //sum = 0;
1203                                                 //sum  = qlp_coeff[2] * data[i-3];
1204                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1205                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1206
1207                                                 //sum += qlp_coeff[1] * data[i-2];
1208                                                 //sum += qlp_coeff[0] * data[i-1];
1209                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1210                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1211                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1212                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1213
1214                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1215                                                 RESIDUAL_RESULT(xmm7);
1216                                         }
1217                                 }
1218                         }
1219                         else { /* order == 1, 2 */
1220                                 if(order == 2) {
1221                                         __m128i xmm0, xmm7;
1222                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1223                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1224
1225                                         for(i = 0; i < (int)data_len; i++) {
1226                                                 //sum = 0;
1227                                                 //sum += qlp_coeff[1] * data[i-2];
1228                                                 //sum += qlp_coeff[0] * data[i-1];
1229                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1230                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1231                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
1232
1233                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1234                                                 RESIDUAL_RESULT(xmm7);
1235                                         }
1236                                 }
1237                                 else { /* order == 1 */
1238                                         for(i = 0; i < (int)data_len; i++)
1239                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1240                                 }
1241                         }
1242                 }
1243         }
1244         else { /* order > 12 */
1246                 for(i = 0; i < (int)data_len; i++) {
1247                         sum = 0;
1248                         switch(order) {
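                                /* cases intentionally fall through: each case adds its highest
                                   tap, then control runs down into the common 12-tap tail */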
1249                                 case 32: sum += qlp_coeff[31] * data[i-32];
1250                                 case 31: sum += qlp_coeff[30] * data[i-31];
1251                                 case 30: sum += qlp_coeff[29] * data[i-30];
1252                                 case 29: sum += qlp_coeff[28] * data[i-29];
1253                                 case 28: sum += qlp_coeff[27] * data[i-28];
1254                                 case 27: sum += qlp_coeff[26] * data[i-27];
1255                                 case 26: sum += qlp_coeff[25] * data[i-26];
1256                                 case 25: sum += qlp_coeff[24] * data[i-25];
1257                                 case 24: sum += qlp_coeff[23] * data[i-24];
1258                                 case 23: sum += qlp_coeff[22] * data[i-23];
1259                                 case 22: sum += qlp_coeff[21] * data[i-22];
1260                                 case 21: sum += qlp_coeff[20] * data[i-21];
1261                                 case 20: sum += qlp_coeff[19] * data[i-20];
1262                                 case 19: sum += qlp_coeff[18] * data[i-19];
1263                                 case 18: sum += qlp_coeff[17] * data[i-18];
1264                                 case 17: sum += qlp_coeff[16] * data[i-17];
1265                                 case 16: sum += qlp_coeff[15] * data[i-16];
1266                                 case 15: sum += qlp_coeff[14] * data[i-15];
1267                                 case 14: sum += qlp_coeff[13] * data[i-14];
1268                                 case 13: sum += qlp_coeff[12] * data[i-13];
1269                                          sum += qlp_coeff[11] * data[i-12];
1270                                          sum += qlp_coeff[10] * data[i-11];
1271                                          sum += qlp_coeff[ 9] * data[i-10];
1272                                          sum += qlp_coeff[ 8] * data[i- 9];
1273                                          sum += qlp_coeff[ 7] * data[i- 8];
1274                                          sum += qlp_coeff[ 6] * data[i- 7];
1275                                          sum += qlp_coeff[ 5] * data[i- 6];
1276                                          sum += qlp_coeff[ 4] * data[i- 5];
1277                                          sum += qlp_coeff[ 3] * data[i- 4];
1278                                          sum += qlp_coeff[ 2] * data[i- 3];
1279                                          sum += qlp_coeff[ 1] * data[i- 2];
1280                                          sum += qlp_coeff[ 0] * data[i- 1];
1281                         }
1282                         residual[i] = data[i] - (sum >> lp_quantization);
1283                 }
1284         }
1285 }
1286
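/* Restore the original signal from the residual, for subframes whose
 * samples and quantized coefficients fit in 16 bits.  For reference, a
 * scalar sketch of the recurrence this routine vectorizes (illustrative
 * only; the generic C implementation is FLAC__lpc_restore_signal() in
 * lpc.c):
 *
 *     unsigned j;
 *     for(i = 0; i < (int)data_len; i++) {
 *         FLAC__int32 sum = 0;
 *         for(j = 0; j < order; j++)
 *             sum += qlp_coeff[j] * data[i-j-1];
 *         data[i] = residual[i] + (sum >> lp_quantization);
 *     }
 */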
1287 FLAC__SSE_TARGET("sse2")
1288 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1289 {
1290         int i;
1291         FLAC__int32 sum;
1292
1293         FLAC__ASSERT(order > 0);
1294         FLAC__ASSERT(order <= 32);
1295         FLAC__ASSERT(data_len > 0);
1296
1297         if(order <= 12) {
1298                 FLAC__int32 curr;
1299                 if(order > 8) { /* order == 9, 10, 11, 12 */
1300 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
1301                         int r;
1302                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1303                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1304                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1305                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1306                         switch(order)                                          /* ...and zero them out */
1307                         {
1308                         case 9:
1309                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1310                         case 10:
1311                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1312                         case 11:
1313                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1314                         }
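                        /* pack the 32-bit coefficients into 16-bit lanes; this _16 variant is
                           meant for streams whose quantized coefficients fit in 16 bits, so the
                           saturating _mm_packs_epi32 does not alter any value */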
1315                         xmm2 = _mm_setzero_si128();
1316                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1317                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1318
1319                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1320                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1321                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1322                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1323                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1324                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1325                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1326                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
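                        /* the dword shuffles above reverse each load, so after packing xmm3
                           holds data[i-1..i-8] and xmm4 holds data[i-9..i-12], newest first */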
1327
1328                         xmm7 = _mm_slli_si128(xmm1, 2);
1329                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1330                         xmm2 = _mm_slli_si128(xmm0, 2);
1331
1332                         /* xmm0, xmm1: qlp_coeff
1333                            xmm2, xmm7: qlp_coeff << 16 bit
1334                            xmm3, xmm4: data */
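                        /* e.g. _mm_madd_epi16(data, coeffs) yields four dword partial sums:
                           d[i-1]*c0+d[i-2]*c1 | d[i-3]*c2+d[i-4]*c3 | ... | d[i-7]*c6+d[i-8]*c7 */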
1335
1336                         xmm6 = xmm4;
1337                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1338                         xmm5 = xmm3;
1339                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1340                         xmm6 = _mm_add_epi32(xmm6, xmm5);
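                        /* horizontal sum: fold the four dword partial sums into the low dword */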
1341                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1342                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1343
1344                         DATA16_RESULT(xmm6);
1345
1346                         data_len--;
1347                         r = data_len % 2;
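                        /* one sample was emitted above; peel one more if data_len is odd so
                           the main loop can reconstruct two samples per iteration */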
1348
1349                         if(r) {
1350                                 xmm4 = _mm_slli_si128(xmm4, 2);
1351                                 xmm6 = xmm3;
1352                                 xmm3 = _mm_slli_si128(xmm3, 2);
1353                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1354                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1355
1356                                 xmm6 = xmm4;
1357                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1358                                 xmm5 = xmm3;
1359                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1360                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1361                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1362                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1363
1364                                 DATA16_RESULT(xmm6);
1365
1366                                 data_len--;
1367                         }
1368
1369                         while(data_len) { /* data_len is a multiple of 2 */
1370                                 /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
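                                /* curr, set by DATA16_RESULT, is the most recently reconstructed
                                   sample; it is inserted into the history vector before computing
                                   the next prediction */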
1371                                 xmm4 = _mm_slli_si128(xmm4, 4);
1372                                 xmm6 = xmm3;
1373                                 xmm3 = _mm_slli_si128(xmm3, 4);
1374                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
1375                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1376
1377                                 xmm6 = xmm4;
1378                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1379                                 xmm5 = xmm3;
1380                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1381                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1382                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1383                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1384
1385                                 DATA16_RESULT(xmm6);
1386
1387                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1388
1389                                 xmm6 = xmm4;
1390                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1391                                 xmm5 = xmm3;
1392                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1393                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1394                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1395                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1396
1397                                 DATA16_RESULT(xmm6);
1398
1399                                 data_len-=2;
1400                         }
1401 #else /* 16 XMM registers available */
1402                         int r;
1403                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
1404                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1405                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1406                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1407                         switch(order)                                          /* ...and zero them out */
1408                         {
1409                         case 9:
1410                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1411                         case 10:
1412                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1413                         case 11:
1414                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1415                         }
1416                         xmm2 = _mm_setzero_si128();
1417                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1418                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1419
1420                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1421                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1422                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1423                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1424                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1425                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1426                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1427                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
1428
1429                         xmm7 = _mm_slli_si128(xmm1, 2);
1430                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1431                         xmm2 = _mm_slli_si128(xmm0, 2);
1432
1433                         xmm9 = _mm_slli_si128(xmm1, 4);
1434                         xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
1435                         xmm8 = _mm_slli_si128(xmm0, 4);
1436
1437                         xmmB = _mm_slli_si128(xmm1, 6);
1438                         xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
1439                         xmmA = _mm_slli_si128(xmm0, 6);
1440
1441                         /* xmm0, xmm1: qlp_coeff
1442                            xmm2, xmm7: qlp_coeff << 16 bit
1443                            xmm8, xmm9: qlp_coeff << 2*16 bit
1444                            xmmA, xmmB: qlp_coeff << 3*16 bit
1445                            xmm3, xmm4: data */
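                        /* the loop below emits four samples per 8-byte history shift; the four
                           outputs use the 3-, 2-, 1- and 0-word-shifted coefficient copies in turn */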
1446
1447                         xmm6 = xmm4;
1448                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1449                         xmm5 = xmm3;
1450                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1451                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1452                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1453                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1454
1455                         DATA16_RESULT(xmm6);
1456
1457                         data_len--;
1458                         r = data_len % 4;
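                        /* peel up to three samples so the unrolled loop below sees a multiple of 4 */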
1459
1460                         while(r) {
1461                                 xmm4 = _mm_slli_si128(xmm4, 2);
1462                                 xmm6 = xmm3;
1463                                 xmm3 = _mm_slli_si128(xmm3, 2);
1464                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1465                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1466
1467                                 xmm6 = xmm4;
1468                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1469                                 xmm5 = xmm3;
1470                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1471                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1472                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1473                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1474
1475                                 DATA16_RESULT(xmm6);
1476
1477                                 data_len--; r--;
1478                         }
1479
1480                         while(data_len) { /* data_len is a multiple of 4 */
1481                                 xmm4 = _mm_slli_si128(xmm4, 8);
1482                                 xmm6 = xmm3;
1483                                 xmm3 = _mm_slli_si128(xmm3, 8);
1484                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
1485
1486                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1487
1488                                 xmm6 = xmm4;
1489                                 xmm6 = _mm_madd_epi16(xmm6, xmmB);
1490                                 xmm5 = xmm3;
1491                                 xmm5 = _mm_madd_epi16(xmm5, xmmA);
1492                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1493                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1494                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1495
1496                                 DATA16_RESULT(xmm6);
1497
1498                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1499
1500                                 xmm6 = xmm4;
1501                                 xmm6 = _mm_madd_epi16(xmm6, xmm9);
1502                                 xmm5 = xmm3;
1503                                 xmm5 = _mm_madd_epi16(xmm5, xmm8);
1504                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1505                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1506                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1507
1508                                 DATA16_RESULT(xmm6);
1509
1510                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1511
1512                                 xmm6 = xmm4;
1513                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1514                                 xmm5 = xmm3;
1515                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1516                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1517                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1518                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1519
1520                                 DATA16_RESULT(xmm6);
1521
1522                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1523
1524                                 xmm6 = xmm4;
1525                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1526                                 xmm5 = xmm3;
1527                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1528                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1529                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1530                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1531
1532                                 DATA16_RESULT(xmm6);
1533
1534                                 data_len-=4;
1535                         }
1536 #endif
1537                 } /* endif(order > 8) */
1538                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1539                         if(order > 6) { /* order == 7, 8 */
1540                                 if(order == 8) {
1541                                         __m128i xmm0, xmm1, xmm3, xmm6;
1542                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1543                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1544                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1545
1546                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1547                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1548                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1549                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1550                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1551
1552                                         /* xmm0: qlp_coeff
1553                                            xmm3: data */
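                                        /* order == 8 fills the 8-word vector exactly: each iteration
                                           only shifts in the newly reconstructed sample; no shifted
                                           coefficient copies are needed */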
1554
1555                                         xmm6 = xmm3;
1556                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1557                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1558                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1559
1560                                         DATA16_RESULT(xmm6);
1561
1562                                         data_len--;
1563
1564                                         while(data_len) {
1565                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1566                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1567
1568                                                 xmm6 = xmm3;
1569                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1570                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1571                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1572
1573                                                 DATA16_RESULT(xmm6);
1574
1575                                                 data_len--;
1576                                         }
1577                                 }
1578                                 else { /* order == 7 */
1579                                         int r;
1580                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
1581                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1582                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1583                                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
1584                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1585
1586                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1587                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1588                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1589                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1590                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1591                                         xmm2 = _mm_slli_si128(xmm0, 2);
1592
1593                                         /* xmm0: qlp_coeff
1594                                            xmm2: qlp_coeff << 16 bit
1595                                            xmm3: data */
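                                        /* order == 7 leaves one spare word, so the loop is unrolled
                                           x2; in general these order 5..8 kernels emit (9 - order)
                                           outputs per history shift */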
1596
1597                                         xmm6 = xmm3;
1598                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1599                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1600                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1601
1602                                         DATA16_RESULT(xmm6);
1603
1604                                         data_len--;
1605                                         r = data_len % 2;
1606
1607                                         if(r) {
1608                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1609                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1610
1611                                                 xmm6 = xmm3;
1612                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1613                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1614                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1615
1616                                                 DATA16_RESULT(xmm6);
1617
1618                                                 data_len--;
1619                                         }
1620
1621                                         while(data_len) { /* data_len is a multiple of 2 */
1622                                                 xmm3 = _mm_slli_si128(xmm3, 4);
1623                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1624
1625                                                 xmm6 = xmm3;
1626                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1627                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1628                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1629
1630                                                 DATA16_RESULT(xmm6);
1631
1632                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1633                                                 xmm6 = xmm3;
1634                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1635                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1636                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1637
1638                                                 DATA16_RESULT(xmm6);
1639
1640                                                 data_len-=2;
1641                                         }
1642                                 }
1643                         }
1644                         else { /* order == 5, 6 */
1645                                 if(order == 6) {
1646                                         int r;
1647                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
1648                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1649                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1650                                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
1651                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1652
1653                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1654                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1655                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1656                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1657                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1658                                         xmm2 = _mm_slli_si128(xmm0, 2);
1659                                         xmm4 = _mm_slli_si128(xmm0, 4);
1660
1661                                         /* xmm0: qlp_coeff
1662                                            xmm2: qlp_coeff << 16 bit
1663                                            xmm4: qlp_coeff << 2*16 bit
1664                                            xmm3: data */
1665
1666                                         xmm6 = xmm3;
1667                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1668                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1669                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1670
1671                                         DATA16_RESULT(xmm6);
1672
1673                                         data_len--;
1674                                         r = data_len % 3;
1675
1676                                         while(r) {
1677                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1678                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1679
1680                                                 xmm6 = xmm3;
1681                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1682                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1683                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1684
1685                                                 DATA16_RESULT(xmm6);
1686
1687                                                 data_len--; r--;
1688                                         }
1689
1690                                         while(data_len) { /* data_len is a multiple of 3 */
1691                                                 xmm3 = _mm_slli_si128(xmm3, 6);
1692                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1693
1694                                                 xmm6 = xmm3;
1695                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
1696                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1697                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1698
1699                                                 DATA16_RESULT(xmm6);
1700
1701                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1702
1703                                                 xmm6 = xmm3;
1704                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1705                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1706                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1707
1708                                                 DATA16_RESULT(xmm6);
1709
1710                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1711
1712                                                 xmm6 = xmm3;
1713                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1714                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1715                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1716
1717                                                 DATA16_RESULT(xmm6);
1718
1719                                                 data_len-=3;
1720                                         }
1721                                 }
1722                                 else { /* order == 5 */
1723                                         int r;
1724                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1725                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1726                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1727                                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
1728                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1729
1730                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1731                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1732                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1733                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1734                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1735                                         xmm2 = _mm_slli_si128(xmm0, 2);
1736                                         xmm4 = _mm_slli_si128(xmm0, 4);
1737                                         xmm5 = _mm_slli_si128(xmm0, 6);
1738
1739                                         /* xmm0: qlp_coeff
1740                                            xmm2: qlp_coeff << 16 bit
1741                                            xmm4: qlp_coeff << 2*16 bit
1742                                            xmm5: qlp_coeff << 3*16 bit
1743                                            xmm3: data */
1744
1745                                         xmm6 = xmm3;
1746                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1747                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1748                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1749
1750                                         DATA16_RESULT(xmm6);
1751
1752                                         data_len--;
1753                                         r = data_len % 4;
1754
1755                                         while(r) {
1756                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1757                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1758
1759                                                 xmm6 = xmm3;
1760                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1761                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1762                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1763
1764                                                 DATA16_RESULT(xmm6);
1765
1766                                                 data_len--; r--;
1767                                         }
1768
1769                                         while(data_len) { /* data_len is a multiple of 4 */
1770                                                 xmm3 = _mm_slli_si128(xmm3, 8);
1771                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1772
1773                                                 xmm6 = xmm3;
1774                                                 xmm6 = _mm_madd_epi16(xmm6, xmm5);
1775                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1776                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1777
1778                                                 DATA16_RESULT(xmm6);
1779
1780                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1781
1782                                                 xmm6 = xmm3;
1783                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
1784                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1785                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1786
1787                                                 DATA16_RESULT(xmm6);
1788
1789                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1790
1791                                                 xmm6 = xmm3;
1792                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1793                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1794                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1795
1796                                                 DATA16_RESULT(xmm6);
1797
1798                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1799
1800                                                 xmm6 = xmm3;
1801                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1802                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1803                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1804
1805                                                 DATA16_RESULT(xmm6);
1806
1807                                                 data_len-=4;
1808                                         }
1809                                 }
1810                         }
1811                 }
1812                 else { /* order == 1, 2, 3, 4 */
1813                         if(order > 2) {
1814                                 if(order == 4) {
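                                        /* With order <= 4 the coefficients and the history both fit in the
                                           low four words of a register after _mm_packs_epi32; the shuffle
                                           below reverses the history dwords before packing, so word 0 of
                                           xmm3 holds the newest sample, data[i-1]. */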
1815                                         __m128i xmm0, xmm3, xmm6;
1816                                         xmm6 = _mm_setzero_si128();
1817                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1818                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1819
1820                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1821                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1822                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
1823
1824                                         /* xmm0: qlp_coeff packed to 16 bit (word j = q[j])
1825                                            xmm3: data packed to 16 bit, newest first (word 0 = data[i-1]) */
1826
1827                                         xmm6 = xmm3;
1828                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1829                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1830
1831                                         DATA16_RESULT(xmm6);
1832
1833                                         data_len--;
1834
1835                                         while(data_len) {
1836                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1837                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1838
1839                                                 xmm6 = xmm3;
1840                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1841                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1842
1843                                                 DATA16_RESULT(xmm6);
1844
1845                                                 data_len--;
1846                                         }
1847                                 }
1848                                 else { /* order == 3 */
1849                                         int r;
1850                                         __m128i xmm0, xmm1, xmm3, xmm6;
1851                                         xmm6 = _mm_setzero_si128();
1852                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1853                                         xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
1854                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1855
1856                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1857                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1858                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
1859                                         xmm1 = _mm_slli_si128(xmm0, 2);
1860
1861                                         /* xmm0: qlp_coeff
1862                                            xmm1: qlp_coeff << 16 bit
1863                                            xmm3: data */
1864
1865                                         xmm6 = xmm3;
1866                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1867                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1868
1869                                         DATA16_RESULT(xmm6);
1870
1871                                         data_len--;
1872                                         r = data_len % 2;
1873
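                                        /* Same peeling idea as the higher orders: handle an odd sample
                                           first, then restore two samples per iteration, multiplying
                                           against xmm1 (coefficients shifted one word) and then xmm0. */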
1874                                         if(r) {
1875                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1876                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1877
1878                                                 xmm6 = xmm3;
1879                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1880                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1881
1882                                                 DATA16_RESULT(xmm6);
1883
1884                                                 data_len--;
1885                                         }
1886
1887                                         while(data_len) { /* data_len is a multiple of 2 */
1888                                                 xmm3 = _mm_slli_si128(xmm3, 4);
1889
1890                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1891
1892                                                 xmm6 = xmm3;
1893                                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1894                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1895
1896                                                 DATA16_RESULT(xmm6);
1897
1898                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1899
1900                                                 xmm6 = xmm3;
1901                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1902                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1903
1904                                                 DATA16_RESULT(xmm6);
1905
1906                                                 data_len-=2;
1907                                         }
1908                                 }
1909                         }
1910                         else {
1911                                 if(order == 2) {
1912                                         __m128i xmm0, xmm3, xmm6;
1913                                         xmm6 = _mm_setzero_si128();
1914                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1915                                         xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
1916                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1917
1918                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1919                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1920                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
1921
1922                                         /* xmm0: qlp_coeff
1923                                            xmm3: data */
1924
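                                        /* With only two taps the single madd already leaves the complete
                                           sum q[0]*data[i-1] + q[1]*data[i-2] in dword 0, so no horizontal
                                           reduction is needed before DATA16_RESULT. */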
1925                                         xmm6 = xmm3;
1926                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1927
1928                                         DATA16_RESULT(xmm6);
1929
1930                                         data_len--;
1931
1932                                         while(data_len) {
1933                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1934                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1935
1936                                                 xmm6 = xmm3;
1937                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1938
1939                                                 DATA16_RESULT(xmm6);
1940
1941                                                 data_len--;
1942                                         }
1943                                 }
1944                                 else { /* order == 1 */
1945                                         for(i = 0; i < (int)data_len; i++)
1946                                                 data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1947                                 }
1948                         }
1949                 }
1950         }
1951         else { /* order > 12 */
1952                 for(i = 0; i < (int)data_len; i++) {
1953                         sum = 0;
1954                         switch(order) {
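                                /* the cases deliberately fall through: for a given order every tap
                                   from qlp_coeff[order-1] down to qlp_coeff[0] is accumulated */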
1955                                 case 32: sum += qlp_coeff[31] * data[i-32];
1956                                 case 31: sum += qlp_coeff[30] * data[i-31];
1957                                 case 30: sum += qlp_coeff[29] * data[i-30];
1958                                 case 29: sum += qlp_coeff[28] * data[i-29];
1959                                 case 28: sum += qlp_coeff[27] * data[i-28];
1960                                 case 27: sum += qlp_coeff[26] * data[i-27];
1961                                 case 26: sum += qlp_coeff[25] * data[i-26];
1962                                 case 25: sum += qlp_coeff[24] * data[i-25];
1963                                 case 24: sum += qlp_coeff[23] * data[i-24];
1964                                 case 23: sum += qlp_coeff[22] * data[i-23];
1965                                 case 22: sum += qlp_coeff[21] * data[i-22];
1966                                 case 21: sum += qlp_coeff[20] * data[i-21];
1967                                 case 20: sum += qlp_coeff[19] * data[i-20];
1968                                 case 19: sum += qlp_coeff[18] * data[i-19];
1969                                 case 18: sum += qlp_coeff[17] * data[i-18];
1970                                 case 17: sum += qlp_coeff[16] * data[i-17];
1971                                 case 16: sum += qlp_coeff[15] * data[i-16];
1972                                 case 15: sum += qlp_coeff[14] * data[i-15];
1973                                 case 14: sum += qlp_coeff[13] * data[i-14];
1974                                 case 13: sum += qlp_coeff[12] * data[i-13];
1975                                          sum += qlp_coeff[11] * data[i-12];
1976                                          sum += qlp_coeff[10] * data[i-11];
1977                                          sum += qlp_coeff[ 9] * data[i-10];
1978                                          sum += qlp_coeff[ 8] * data[i- 9];
1979                                          sum += qlp_coeff[ 7] * data[i- 8];
1980                                          sum += qlp_coeff[ 6] * data[i- 7];
1981                                          sum += qlp_coeff[ 5] * data[i- 6];
1982                                          sum += qlp_coeff[ 4] * data[i- 5];
1983                                          sum += qlp_coeff[ 3] * data[i- 4];
1984                                          sum += qlp_coeff[ 2] * data[i- 3];
1985                                          sum += qlp_coeff[ 1] * data[i- 2];
1986                                          sum += qlp_coeff[ 0] * data[i- 1];
1987                         }
1988                         data[i] = residual[i] + (sum >> lp_quantization);
1989                 }
1990         }
1991 }
1992
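/*
 * Reference for the restore kernels in this file: they compute the same
 * recurrence as the plain C implementation (see FLAC__lpc_restore_signal()
 * in lpc.c), sketched here for readability:
 *
 *     for(i = 0; i < (int)data_len; i++) {
 *         FLAC__int32 sum = 0;
 *         int j;
 *         for(j = 0; j < (int)order; j++)
 *             sum += qlp_coeff[j] * data[i-j-1];
 *         data[i] = residual[i] + (sum >> lp_quantization);
 *     }
 *
 * The SSE2 variants specialize this loop per order so that the coefficient
 * registers can be set up once, outside the sample loop.
 */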
1993 FLAC__SSE_TARGET("sse2")
1994 void FLAC__lpc_restore_signal_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1995 {
1996         int i;
1997
1998         FLAC__ASSERT(order > 0);
1999         FLAC__ASSERT(order <= 32);
2000
2001         if(order <= 12) {
2002                 if(order > 8) { /* order == 9, 10, 11, 12 */
2003                         if(order > 10) { /* order == 11, 12 */
2004                                 if(order == 12) {
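                                        /* six registers hold the coefficient pairs and two serve as
                                           scratch, fitting the eight XMM registers available even on
                                           IA-32 */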
2005                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2006                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
2007                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
2008                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
2009                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
2010                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
2011                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
2012
2013                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
2014                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
2015                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
2016                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
2017                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
2018                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
2019
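                                        /* each iteration handles the twelve taps as six coefficient
                                           pairs: two history samples are loaded, swapped into the even
                                           dword lanes to line up with the pair (see the lane diagrams
                                           above), multiplied with _mm_mul_epu32 and accumulated; the
                                           final shift-and-add folds lane 2 into lane 0 */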
2020                                         for(i = 0; i < (int)data_len; i++) {
2021                                                 //sum = 0;
2022                                                 //sum += qlp_coeff[11] * data[i-12];
2023                                                 //sum += qlp_coeff[10] * data[i-11];
2024                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
2025                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
2026                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard the high dword of each 64-bit product: its low dword equals the low dword of the signed product, which is all that 32-bit wrap-around arithmetic keeps */
2027
2028                                                 //sum += qlp_coeff[9] * data[i-10];
2029                                                 //sum += qlp_coeff[8] * data[i-9];
2030                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2031                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2032                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
2033                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2034
2035                                                 //sum += qlp_coeff[7] * data[i-8];
2036                                                 //sum += qlp_coeff[6] * data[i-7];
2037                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2038                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2039                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2040                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2041
2042                                                 //sum += qlp_coeff[5] * data[i-6];
2043                                                 //sum += qlp_coeff[4] * data[i-5];
2044                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2045                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2046                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2047                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2048
2049                                                 //sum += qlp_coeff[3] * data[i-4];
2050                                                 //sum += qlp_coeff[2] * data[i-3];
2051                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2052                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2053                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2054                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2055
2056                                                 //sum += qlp_coeff[1] * data[i-2];
2057                                                 //sum += qlp_coeff[0] * data[i-1];
2058                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2059                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2060                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2061                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2062
2063                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2064                                                 DATA_RESULT(xmm7);
2065                                         }
2066                                 }
2067                                 else { /* order == 11 */
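                                        /* odd orders keep the highest coefficient alone in lane 0 via
                                           _mm_cvtsi32_si128 and multiply it against the single oldest
                                           sample before walking the remaining taps in pairs */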
2068                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2069                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2070                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2071                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2072                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2073                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
2074                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
2075
2076                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2077                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2078                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2079                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2080                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
2081
2082                                         for(i = 0; i < (int)data_len; i++) {
2083                                                 //sum = 0;
2084                                                 //sum  = qlp_coeff[10] * data[i-11];
2085                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
2086                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
2087
2088                                                 //sum += qlp_coeff[9] * data[i-10];
2089                                                 //sum += qlp_coeff[8] * data[i-9];
2090                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2091                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2092                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
2093                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2094
2095                                                 //sum += qlp_coeff[7] * data[i-8];
2096                                                 //sum += qlp_coeff[6] * data[i-7];
2097                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2098                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2099                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2100                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2101
2102                                                 //sum += qlp_coeff[5] * data[i-6];
2103                                                 //sum += qlp_coeff[4] * data[i-5];
2104                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2105                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2106                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2107                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2108
2109                                                 //sum += qlp_coeff[3] * data[i-4];
2110                                                 //sum += qlp_coeff[2] * data[i-3];
2111                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2112                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2113                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2114                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2115
2116                                                 //sum += qlp_coeff[1] * data[i-2];
2117                                                 //sum += qlp_coeff[0] * data[i-1];
2118                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2119                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2120                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2121                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2122
2123                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2124                                                 DATA_RESULT(xmm7);
2125                                         }
2126                                 }
2127                         }
2128                         else { /* order == 9, 10 */
2129                                 if(order == 10) {
2130                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
2131                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2132                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2133                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2134                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2135                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
2136
2137                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2138                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2139                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2140                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2141                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
2142
2143                                         for(i = 0; i < (int)data_len; i++) {
2144                                                 //sum = 0;
2145                                                 //sum += qlp_coeff[9] * data[i-10];
2146                                                 //sum += qlp_coeff[8] * data[i-9];
2147                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2148                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2149                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
2150
2151                                                 //sum += qlp_coeff[7] * data[i-8];
2152                                                 //sum += qlp_coeff[6] * data[i-7];
2153                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2154                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2155                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2156                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2157
2158                                                 //sum += qlp_coeff[5] * data[i-6];
2159                                                 //sum += qlp_coeff[4] * data[i-5];
2160                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2161                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2162                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2163                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2164
2165                                                 //sum += qlp_coeff[3] * data[i-4];
2166                                                 //sum += qlp_coeff[2] * data[i-3];
2167                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2168                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2169                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2170                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2171
2172                                                 //sum += qlp_coeff[1] * data[i-2];
2173                                                 //sum += qlp_coeff[0] * data[i-1];
2174                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2175                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2176                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2177                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2178
2179                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2180                                                 DATA_RESULT(xmm7);
2181                                         }
2182                                 }
2183                                 else { /* order == 9 */
2184                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
2185                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2186                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2187                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2188                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2189                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
2190
2191                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2192                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2193                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2194                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2195
2196                                         for(i = 0; i < (int)data_len; i++) {
2197                                                 //sum = 0;
2198                                                 //sum  = qlp_coeff[8] * data[i-9];
2199                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
2200                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
2201
2202                                                 //sum += qlp_coeff[7] * data[i-8];
2203                                                 //sum += qlp_coeff[6] * data[i-7];
2204                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2205                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2206                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2207                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2208
2209                                                 //sum += qlp_coeff[5] * data[i-6];
2210                                                 //sum += qlp_coeff[4] * data[i-5];
2211                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2212                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2213                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2214                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2215
2216                                                 //sum += qlp_coeff[3] * data[i-4];
2217                                                 //sum += qlp_coeff[2] * data[i-3];
2218                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2219                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2220                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2221                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2222
2223                                                 //sum += qlp_coeff[1] * data[i-2];
2224                                                 //sum += qlp_coeff[0] * data[i-1];
2225                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2226                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2227                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2228                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2229
2230                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2231                                                 DATA_RESULT(xmm7);
2232                                         }
2233                                 }
2234                         }
2235                 }
2236                 else if(order > 4) { /* order == 5, 6, 7, 8 */
2237                         if(order > 6) { /* order == 7, 8 */
2238                                 if(order == 8) {
2239                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
2240                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2241                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2242                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2243                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2244
2245                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2246                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2247                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2248                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2249
2250                                         for(i = 0; i < (int)data_len; i++) {
2251                                                 //sum = 0;
2252                                                 //sum += qlp_coeff[7] * data[i-8];
2253                                                 //sum += qlp_coeff[6] * data[i-7];
2254                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2255                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2256                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
2257
2258                                                 //sum += qlp_coeff[5] * data[i-6];
2259                                                 //sum += qlp_coeff[4] * data[i-5];
2260                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2261                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2262                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2263                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2264
2265                                                 //sum += qlp_coeff[3] * data[i-4];
2266                                                 //sum += qlp_coeff[2] * data[i-3];
2267                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2268                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2269                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2270                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2271
2272                                                 //sum += qlp_coeff[1] * data[i-2];
2273                                                 //sum += qlp_coeff[0] * data[i-1];
2274                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2275                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2276                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2277                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2278
2279                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2280                                                 DATA_RESULT(xmm7);
2281                                         }
2282                                 }
2283                                 else { /* order == 7 */
2284                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
2285                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2286                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2287                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2288                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
2289
2290                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2291                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2292                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2293
2294                                         for(i = 0; i < (int)data_len; i++) {
2295                                                 //sum = 0;
2296                                                 //sum  = qlp_coeff[6] * data[i-7];
2297                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
2298                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
2299
2300                                                 //sum += qlp_coeff[5] * data[i-6];
2301                                                 //sum += qlp_coeff[4] * data[i-5];
2302                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2303                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2304                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2305                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2306
2307                                                 //sum += qlp_coeff[3] * data[i-4];
2308                                                 //sum += qlp_coeff[2] * data[i-3];
2309                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2310                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2311                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2312                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2313
2314                                                 //sum += qlp_coeff[1] * data[i-2];
2315                                                 //sum += qlp_coeff[0] * data[i-1];
2316                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2317                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2318                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2319                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2320
2321                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2322                                                 DATA_RESULT(xmm7);
2323                                         }
2324                                 }
2325                         }
2326                         else { /* order == 5, 6 */
2327                                 if(order == 6) {
2328                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
2329                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2330                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2331                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2332
2333                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2334                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2335                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2336
2337                                         for(i = 0; i < (int)data_len; i++) {
2338                                                 //sum = 0;
2339                                                 //sum += qlp_coeff[5] * data[i-6];
2340                                                 //sum += qlp_coeff[4] * data[i-5];
2341                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2342                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2343                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
2344
2345                                                 //sum += qlp_coeff[3] * data[i-4];
2346                                                 //sum += qlp_coeff[2] * data[i-3];
2347                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2348                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2349                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2350                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2351
2352                                                 //sum += qlp_coeff[1] * data[i-2];
2353                                                 //sum += qlp_coeff[0] * data[i-1];
2354                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2355                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2356                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2357                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2358
2359                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2360                                                 DATA_RESULT(xmm7);
2361                                         }
2362                                 }
2363                                 else { /* order == 5 */
2364                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
2365                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2366                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2367                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
2368
2369                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2370                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2371
2372                                         for(i = 0; i < (int)data_len; i++) {
2373                                                 //sum = 0;
2374                                                 //sum  = qlp_coeff[4] * data[i-5];
2375                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
2376                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
2377
2378                                                 //sum += qlp_coeff[3] * data[i-4];
2379                                                 //sum += qlp_coeff[2] * data[i-3];
2380                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2381                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2382                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2383                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2384
2385                                                 //sum += qlp_coeff[1] * data[i-2];
2386                                                 //sum += qlp_coeff[0] * data[i-1];
2387                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2388                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2389                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2390                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2391
2392                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2393                                                 DATA_RESULT(xmm7);
2394                                         }
2395                                 }
2396                         }
2397                 }
2398                 else { /* order == 1, 2, 3, 4 */
2399                         if(order > 2) { /* order == 3, 4 */
2400                                 if(order == 4) {
2401                                         __m128i xmm0, xmm1, xmm6, xmm7;
2402                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2403                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2404
2405                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2406                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2407
2408                                         for(i = 0; i < (int)data_len; i++) {
2409                                                 //sum = 0;
2410                                                 //sum += qlp_coeff[3] * data[i-4];
2411                                                 //sum += qlp_coeff[2] * data[i-3];
2412                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2413                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2414                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
2415
2416                                                 //sum += qlp_coeff[1] * data[i-2];
2417                                                 //sum += qlp_coeff[0] * data[i-1];
2418                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2419                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2420                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2421                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2422
2423                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2424                                                 DATA_RESULT(xmm7);
2425                                         }
2426                                 }
2427                                 else { /* order == 3 */
2428                                         __m128i xmm0, xmm1, xmm6, xmm7;
2429                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2430                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
2431
2432                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2433
2434                                         for(i = 0; i < (int)data_len; i++) {
2435                                                 //sum = 0;
2436                                                 //sum  = qlp_coeff[2] * data[i-3];
2437                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
2438                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
2439
2440                                                 //sum += qlp_coeff[1] * data[i-2];
2441                                                 //sum += qlp_coeff[0] * data[i-1];
2442                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2443                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2444                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2445                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2446
2447                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2448                                                 DATA_RESULT(xmm7);
2449                                         }
2450                                 }
2451                         }
2452                         else { /* order == 1, 2 */
2453                                 if(order == 2) {
2454                                         __m128i xmm0, xmm7;
2455                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2456                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2457
2458                                         for(i = 0; i < (int)data_len; i++) {
2459                                                 //sum = 0;
2460                                                 //sum += qlp_coeff[1] * data[i-2];
2461                                                 //sum += qlp_coeff[0] * data[i-1];
2462                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2463                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2464                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
2465
2466                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2467                                                 DATA_RESULT(xmm7);
2468                                         }
2469                                 }
2470                                 else { /* order == 1 */
2471                                         for(i = 0; i < (int)data_len; i++)
2472                                                 data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
2473                                 }
2474                         }
2475                 }
2476         }
2477         else { /* order > 12 */
2478                 FLAC__int32 sum;
2479                 for(i = 0; i < (int)data_len; i++) {
2480                         sum = 0;
2481                         switch(order) {
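                                /* the cases deliberately fall through, exactly as in the
                                   order > 12 tail of the previous function */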
2482                                 case 32: sum += qlp_coeff[31] * data[i-32];
2483                                 case 31: sum += qlp_coeff[30] * data[i-31];
2484                                 case 30: sum += qlp_coeff[29] * data[i-30];
2485                                 case 29: sum += qlp_coeff[28] * data[i-29];
2486                                 case 28: sum += qlp_coeff[27] * data[i-28];
2487                                 case 27: sum += qlp_coeff[26] * data[i-27];
2488                                 case 26: sum += qlp_coeff[25] * data[i-26];
2489                                 case 25: sum += qlp_coeff[24] * data[i-25];
2490                                 case 24: sum += qlp_coeff[23] * data[i-24];
2491                                 case 23: sum += qlp_coeff[22] * data[i-23];
2492                                 case 22: sum += qlp_coeff[21] * data[i-22];
2493                                 case 21: sum += qlp_coeff[20] * data[i-21];
2494                                 case 20: sum += qlp_coeff[19] * data[i-20];
2495                                 case 19: sum += qlp_coeff[18] * data[i-19];
2496                                 case 18: sum += qlp_coeff[17] * data[i-18];
2497                                 case 17: sum += qlp_coeff[16] * data[i-17];
2498                                 case 16: sum += qlp_coeff[15] * data[i-16];
2499                                 case 15: sum += qlp_coeff[14] * data[i-15];
2500                                 case 14: sum += qlp_coeff[13] * data[i-14];
2501                                 case 13: sum += qlp_coeff[12] * data[i-13];
2502                                          sum += qlp_coeff[11] * data[i-12];
2503                                          sum += qlp_coeff[10] * data[i-11];
2504                                          sum += qlp_coeff[ 9] * data[i-10];
2505                                          sum += qlp_coeff[ 8] * data[i- 9];
2506                                          sum += qlp_coeff[ 7] * data[i- 8];
2507                                          sum += qlp_coeff[ 6] * data[i- 7];
2508                                          sum += qlp_coeff[ 5] * data[i- 6];
2509                                          sum += qlp_coeff[ 4] * data[i- 5];
2510                                          sum += qlp_coeff[ 3] * data[i- 4];
2511                                          sum += qlp_coeff[ 2] * data[i- 3];
2512                                          sum += qlp_coeff[ 1] * data[i- 2];
2513                                          sum += qlp_coeff[ 0] * data[i- 1];
2514                         }
2515                         data[i] = residual[i] + (sum >> lp_quantization);
2516                 }
2517         }
2518 }
2519
2520 #endif /* FLAC__SSE2_SUPPORTED */
2521 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
2522 #endif /* FLAC__NO_ASM */
2523 #endif /* FLAC__INTEGER_ONLY_LIBRARY */