/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

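/* Each *_RESULT macro takes the 32-bit prediction accumulated in the low
 * dword of an XMM register, applies the quantization shift, and either
 * emits a residual or reconstructs a sample; e.g. RESIDUAL16_RESULT(x)
 * performs the scalar step
 *     curr = *data++;
 *     *residual++ = curr - (_mm_cvtsi128_si32(x) >> lp_quantization);
 */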
#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

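/* The function definition between this #if 0 and the matching #else is
 * compiled out; the active definition of the same function follows the
 * #else further below. */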
#if 0
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(data_len > 0);

	if(order <= 12) {
		FLAC__int32 curr;
		if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
			int r;
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
			xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
			xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
			switch(order)                                          /* ...and zero them out */
			{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
			}
			xmm2 = _mm_setzero_si128();
			xmm0 = _mm_packs_epi32(xmm0, xmm6);
			xmm1 = _mm_packs_epi32(xmm1, xmm2);

			xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
			xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
			xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
			xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
			xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
			xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
			xmm4 = _mm_packs_epi32(xmm4, xmm2);
			xmm3 = _mm_packs_epi32(xmm3, xmm5);

			xmm7 = _mm_slli_si128(xmm1, 2);
			xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
			xmm2 = _mm_slli_si128(xmm0, 2);

			/* xmm0, xmm1: qlp_coeff
			   xmm2, xmm7: qlp_coeff << 16 bit
			   xmm3, xmm4: data */
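
			/* After the shuffles and packs above the history is held
			 * newest-first: word j of xmm3 is data[-(j+1)] and word j of
			 * xmm4 is data[-(j+9)], lining up element-for-element with the
			 * packed coefficients, so one _mm_madd_epi16 per register pair
			 * plus two horizontal adds yields the prediction sum in the
			 * low dword. */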

			xmm6 = xmm4;
			xmm6 = _mm_madd_epi16(xmm6, xmm1);
			xmm5 = xmm3;
			xmm5 = _mm_madd_epi16(xmm5, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			RESIDUAL16_RESULT(xmm6);

			data_len--;
			r = data_len % 2;

			if(r) {
				xmm4 = _mm_slli_si128(xmm4, 2);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 2);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				data_len--;
			}

			while(data_len) { /* data_len is a multiple of 2 */
				/* one _mm_slli_si128 fewer per data element, but this relies on the shifted qlp_coeff copies in xmm2:xmm7 */
				xmm4 = _mm_slli_si128(xmm4, 4);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 4);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
				xmm3 = _mm_insert_epi16(xmm3, curr, 1);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm7);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm2);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				data_len-=2;
			}
#else /* 16 XMM registers available */
			int r;
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
			xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
			xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
			xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
			switch(order)                                          /* ...and zero them out */
			{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
			}
			xmm2 = _mm_setzero_si128();
			xmm0 = _mm_packs_epi32(xmm0, xmm6);
			xmm1 = _mm_packs_epi32(xmm1, xmm2);

			xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
			xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
			xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
			xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
			xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
			xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
			xmm4 = _mm_packs_epi32(xmm4, xmm2);
			xmm3 = _mm_packs_epi32(xmm3, xmm5);

			xmm7 = _mm_slli_si128(xmm1, 2);
			xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
			xmm2 = _mm_slli_si128(xmm0, 2);

			xmm9 = _mm_slli_si128(xmm1, 4);
			xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
			xmm8 = _mm_slli_si128(xmm0, 4);

			xmmB = _mm_slli_si128(xmm1, 6);
			xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
			xmmA = _mm_slli_si128(xmm0, 6);

			/* xmm0, xmm1: qlp_coeff
			   xmm2, xmm7: qlp_coeff << 16 bit
			   xmm8, xmm9: qlp_coeff << 2*16 bit
			   xmmA, xmmB: qlp_coeff << 3*16 bit
			   xmm3, xmm4: data */
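
			/* With 16 registers, three pre-shifted copies of the
			 * coefficients can stay live, so the main loop below shifts the
			 * history only once per four samples and emits four residuals
			 * per iteration. */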

			xmm6 = xmm4;
			xmm6 = _mm_madd_epi16(xmm6, xmm1);
			xmm5 = xmm3;
			xmm5 = _mm_madd_epi16(xmm5, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			RESIDUAL16_RESULT(xmm6);

			data_len--;
			r = data_len % 4;

			while(r) {
				xmm4 = _mm_slli_si128(xmm4, 2);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 2);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				data_len--; r--;
			}

			while(data_len) { /* data_len is a multiple of 4 */
				xmm4 = _mm_slli_si128(xmm4, 8);
				xmm6 = xmm3;
				xmm3 = _mm_slli_si128(xmm3, 8);
				xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));

				xmm3 = _mm_insert_epi16(xmm3, curr, 3);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmmB);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmmA);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				xmm3 = _mm_insert_epi16(xmm3, curr, 2);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm9);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm8);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				xmm3 = _mm_insert_epi16(xmm3, curr, 1);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm7);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm2);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				xmm3 = _mm_insert_epi16(xmm3, curr, 0);

				xmm6 = xmm4;
				xmm6 = _mm_madd_epi16(xmm6, xmm1);
				xmm5 = xmm3;
				xmm5 = _mm_madd_epi16(xmm5, xmm0);
				xmm6 = _mm_add_epi32(xmm6, xmm5);
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
				xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

				RESIDUAL16_RESULT(xmm6);

				data_len-=4;
			}
#endif
		} /* endif(order > 8) */
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm3, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);

					/* xmm0: qlp_coeff
					   xmm3: data */
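
					/* order == 8: the whole history fits in one register, so
					 * each sample needs just one shift-and-insert and one
					 * _mm_madd_epi16. */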

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--;
					}
				}
				else { /* order == 7 */
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;
					r = data_len % 2;

					if(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--;
					}

					while(data_len) { /* data_len is a multiple of 2 */
						xmm3 = _mm_slli_si128(xmm3, 4);
						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);
						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len-=2;
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);
					xmm4 = _mm_slli_si128(xmm0, 4);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm4: qlp_coeff << 2*16 bit
					   xmm3: data */
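
					/* order == 6: with two shifted coefficient copies the
					 * main loop shifts the history once and emits three
					 * residuals per iteration. */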

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;
					r = data_len % 3;

					while(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--; r--;
					}

					while(data_len) { /* data_len is a multiple of 3 */
						xmm3 = _mm_slli_si128(xmm3, 6);
						xmm3 = _mm_insert_epi16(xmm3, curr, 2);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm4);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len-=3;
					}
				}
				else { /* order == 5 */
					int r;
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
					xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
					xmm0 = _mm_packs_epi32(xmm0, xmm1);

					xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm1);
					xmm2 = _mm_slli_si128(xmm0, 2);
					xmm4 = _mm_slli_si128(xmm0, 4);
					xmm5 = _mm_slli_si128(xmm0, 6);

					/* xmm0: qlp_coeff
					   xmm2: qlp_coeff << 16 bit
					   xmm4: qlp_coeff << 2*16 bit
					   xmm5: qlp_coeff << 3*16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;
					r = data_len % 4;

					while(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--; r--;
					}

					while(data_len) { /* data_len is a multiple of 4 */
						xmm3 = _mm_slli_si128(xmm3, 8);
						xmm3 = _mm_insert_epi16(xmm3, curr, 3);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm5);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 2);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm4);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm2);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len-=4;
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) {
				if(order == 4) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);

					/* xmm0: qlp_coeff
					   xmm3: data */
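
					/* order == 4: _mm_madd_epi16 reduces the four products
					 * to two dwords, so a single horizontal add completes
					 * the sum. */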

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--;
					}
				}
				else { /* order == 3 */
					int r;
					__m128i xmm0, xmm1, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);
					xmm1 = _mm_slli_si128(xmm0, 2);

					/* xmm0: qlp_coeff
					   xmm1: qlp_coeff << 16 bit
					   xmm3: data */

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);
					xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

					RESIDUAL16_RESULT(xmm6);

					data_len--;
					r = data_len % 2;

					if(r) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len--;
					}

					while(data_len) { /* data_len is a multiple of 2 */
						xmm3 = _mm_slli_si128(xmm3, 4);

						xmm3 = _mm_insert_epi16(xmm3, curr, 1);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm1);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);
						xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

						RESIDUAL16_RESULT(xmm6);

						data_len-=2;
					}
				}
			}
			else {
				if(order == 2) {
					__m128i xmm0, xmm3, xmm6;
					xmm6 = _mm_setzero_si128();
					xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
					xmm0 = _mm_packs_epi32(xmm0, xmm6);

					xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
					xmm3 = _mm_packs_epi32(xmm3, xmm6);

					/* xmm0: qlp_coeff
					   xmm3: data */
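
					/* order == 2: _mm_madd_epi16 already sums both products
					 * into dword 0, so no horizontal add is needed. */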

					xmm6 = xmm3;
					xmm6 = _mm_madd_epi16(xmm6, xmm0);

					RESIDUAL16_RESULT(xmm6);

					data_len--;

					while(data_len) {
						xmm3 = _mm_slli_si128(xmm3, 2);
						xmm3 = _mm_insert_epi16(xmm3, curr, 0);

						xmm6 = xmm3;
						xmm6 = _mm_madd_epi16(xmm6, xmm0);

						RESIDUAL16_RESULT(xmm6);

						data_len--;
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
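				/* each case intentionally falls through, adding one more tap */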
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}
#else
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

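	/* Strategy of this version: broadcast each coefficient's low 16 bits
	 * across a register and _mm_madd_epi16 it against four consecutive
	 * 32-bit samples.  The high word of each sample is multiplied by zero,
	 * so the result is exact as long as the samples fit in 16 bits, which
	 * this 16-bit code path presumes; each loop iteration then produces
	 * four residuals at once. */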
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

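					/* each iteration computes residual[i..i+3]: the madd
					 * with broadcast coefficient qj adds
					 * qlp_coeff[j]*data[i+k-j-1] to lane k */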
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
909                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
910
911                                         for(i = 0; i < (int)data_len-3; i+=4) {
912                                                 __m128i summ, mull;
913                                                 summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
914                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
915                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
916                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
917                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
918                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
919                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
920                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
921                                                 summ = _mm_sra_epi32(summ, cnt);
922                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
923                                         }
924                                 }
925                                 else { /* order == 7 */
926                                         __m128i q0, q1, q2, q3, q4, q5, q6;
927                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
928                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
929                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
930                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
931                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
932                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
933                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
934
935                                         for(i = 0; i < (int)data_len-3; i+=4) {
936                                                 __m128i summ, mull;
937                                                 summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
938                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
939                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
940                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
941                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
942                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
943                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
944                                                 summ = _mm_sra_epi32(summ, cnt);
945                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
946                                         }
947                                 }
948                         }
949                         else {
950                                 if(order == 6) {
951                                         __m128i q0, q1, q2, q3, q4, q5;
952                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
953                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
954                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
955                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
956                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
957                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
958
959                                         for(i = 0; i < (int)data_len-3; i+=4) {
960                                                 __m128i summ, mull;
961                                                 summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
962                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
963                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
964                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
965                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
966                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
967                                                 summ = _mm_sra_epi32(summ, cnt);
968                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
969                                         }
970                                 }
971                                 else { /* order == 5 */
972                                         __m128i q0, q1, q2, q3, q4;
973                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
974                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
975                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
976                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
977                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
978
979                                         for(i = 0; i < (int)data_len-3; i+=4) {
980                                                 __m128i summ, mull;
981                                                 summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
982                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
983                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
984                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
985                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
986                                                 summ = _mm_sra_epi32(summ, cnt);
987                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
988                                         }
989                                 }
990                         }
991                 }
992                 else {
993                         if(order > 2) {
994                                 if(order == 4) {
995                                         __m128i q0, q1, q2, q3;
996                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
997                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
998                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
999                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1000
1001                                         for(i = 0; i < (int)data_len-3; i+=4) {
1002                                                 __m128i summ, mull;
1003                                                 summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
1004                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1005                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1006                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1007                                                 summ = _mm_sra_epi32(summ, cnt);
1008                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1009                                         }
1010                                 }
1011                                 else { /* order == 3 */
1012                                         __m128i q0, q1, q2;
1013                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1014                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1015                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1016
1017                                         for(i = 0; i < (int)data_len-3; i+=4) {
1018                                                 __m128i summ, mull;
1019                                                 summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
1020                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1021                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1022                                                 summ = _mm_sra_epi32(summ, cnt);
1023                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1024                                         }
1025                                 }
1026                         }
1027                         else {
1028                                 if(order == 2) {
1029                                         __m128i q0, q1;
1030                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1031                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1032
1033                                         for(i = 0; i < (int)data_len-3; i+=4) {
1034                                                 __m128i summ, mull;
1035                                                 summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
1036                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1037                                                 summ = _mm_sra_epi32(summ, cnt);
1038                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1039                                         }
1040                                 }
1041                                 else { /* order == 1 */
1042                                         __m128i q0;
1043                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1044
1045                                         for(i = 0; i < (int)data_len-3; i+=4) {
1046                                                 __m128i summ;
1047                                                 summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
1048                                                 summ = _mm_sra_epi32(summ, cnt);
1049                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1050                                         }
1051                                 }
1052                         }
1053                 }
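        /* The vector loops above emit residuals four at a time; finish the
           remaining 0..3 samples in scalar code.  The switch sums exactly
           `order` taps through intentional case fall-through. */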
1054                 for(; i < (int)data_len; i++) {
1055                         sum = 0;
1056                         switch(order) {
1057                                 case 12: sum += qlp_coeff[11] * data[i-12];
1058                                 case 11: sum += qlp_coeff[10] * data[i-11];
1059                                 case 10: sum += qlp_coeff[ 9] * data[i-10];
1060                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9];
1061                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8];
1062                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7];
1063                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6];
1064                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5];
1065                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4];
1066                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3];
1067                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2];
1068                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
1069                         }
1070                         residual[i] = data[i] - (sum >> lp_quantization);
1071                 }
1072         }
1073         else { /* order > 12 */
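                /* Orders 13..32 have no SSE2 path: the fall-through cases add the taps
                   above the twelfth, then the remaining twelve taps follow unconditionally. */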
1074                 for(i = 0; i < (int)data_len; i++) {
1075                         sum = 0;
1076                         switch(order) {
1077                                 case 32: sum += qlp_coeff[31] * data[i-32];
1078                                 case 31: sum += qlp_coeff[30] * data[i-31];
1079                                 case 30: sum += qlp_coeff[29] * data[i-30];
1080                                 case 29: sum += qlp_coeff[28] * data[i-29];
1081                                 case 28: sum += qlp_coeff[27] * data[i-28];
1082                                 case 27: sum += qlp_coeff[26] * data[i-27];
1083                                 case 26: sum += qlp_coeff[25] * data[i-26];
1084                                 case 25: sum += qlp_coeff[24] * data[i-25];
1085                                 case 24: sum += qlp_coeff[23] * data[i-24];
1086                                 case 23: sum += qlp_coeff[22] * data[i-23];
1087                                 case 22: sum += qlp_coeff[21] * data[i-22];
1088                                 case 21: sum += qlp_coeff[20] * data[i-21];
1089                                 case 20: sum += qlp_coeff[19] * data[i-20];
1090                                 case 19: sum += qlp_coeff[18] * data[i-19];
1091                                 case 18: sum += qlp_coeff[17] * data[i-18];
1092                                 case 17: sum += qlp_coeff[16] * data[i-17];
1093                                 case 16: sum += qlp_coeff[15] * data[i-16];
1094                                 case 15: sum += qlp_coeff[14] * data[i-15];
1095                                 case 14: sum += qlp_coeff[13] * data[i-14];
1096                                 case 13: sum += qlp_coeff[12] * data[i-13];
1097                                          sum += qlp_coeff[11] * data[i-12];
1098                                          sum += qlp_coeff[10] * data[i-11];
1099                                          sum += qlp_coeff[ 9] * data[i-10];
1100                                          sum += qlp_coeff[ 8] * data[i- 9];
1101                                          sum += qlp_coeff[ 7] * data[i- 8];
1102                                          sum += qlp_coeff[ 6] * data[i- 7];
1103                                          sum += qlp_coeff[ 5] * data[i- 6];
1104                                          sum += qlp_coeff[ 4] * data[i- 5];
1105                                          sum += qlp_coeff[ 3] * data[i- 4];
1106                                          sum += qlp_coeff[ 2] * data[i- 3];
1107                                          sum += qlp_coeff[ 1] * data[i- 2];
1108                                          sum += qlp_coeff[ 0] * data[i- 1];
1109                         }
1110                         residual[i] = data[i] - (sum >> lp_quantization);
1111                 }
1112         }
1113 }
1114 #endif
1115
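/* Residual computation for coefficients/samples that need full 32-bit math.
   SSE2 has no packed 32x32->32 multiply, so products are formed two at a time
   with _mm_mul_epu32 and the loop emits one residual per iteration. */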
1116 FLAC__SSE_TARGET("sse2")
1117 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
1118 {
1119         int i;
1120
1121         FLAC__ASSERT(order > 0);
1122         FLAC__ASSERT(order <= 32);
1123
1124         if(order <= 12) {
1125                 if(order > 8) { /* order == 9, 10, 11, 12 */
1126                         if(order > 10) { /* order == 11, 12 */
1127                                 if(order == 12) {
1128                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1129                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
1130                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
1131                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
1132                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
1133                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
1134                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
1135
1136                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
1137                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
1138                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
1139                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
1140                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
1141                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
1142
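                                        /* The 3,1,2,0 shuffles above spread the coefficients to the even
                                           dword lanes; each sample pair below is shuffled to line up with
                                           them, so one _mm_mul_epu32 forms two 32x32->64 products.  Only the
                                           low dword of each product is accumulated; in two's complement it
                                           equals the low 32 bits of the signed product, so the sum is exact
                                           as long as it fits in 32 bits. */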
1143                                         for(i = 0; i < (int)data_len; i++) {
1144                                                 //sum = 0;
1145                                                 //sum += qlp_coeff[11] * data[i-12];
1146                                                 //sum += qlp_coeff[10] * data[i-11];
1147                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
1148                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
1149                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard the high dword of each result value */
1150
1151                                                 //sum += qlp_coeff[9] * data[i-10];
1152                                                 //sum += qlp_coeff[8] * data[i-9];
1153                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
1154                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1155                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
1156                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1157
1158                                                 //sum += qlp_coeff[7] * data[i-8];
1159                                                 //sum += qlp_coeff[6] * data[i-7];
1160                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1161                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1162                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
1163                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1164
1165                                                 //sum += qlp_coeff[5] * data[i-6];
1166                                                 //sum += qlp_coeff[4] * data[i-5];
1167                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1168                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1169                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1170                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1171
1172                                                 //sum += qlp_coeff[3] * data[i-4];
1173                                                 //sum += qlp_coeff[2] * data[i-3];
1174                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1175                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1176                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1177                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1178
1179                                                 //sum += qlp_coeff[1] * data[i-2];
1180                                                 //sum += qlp_coeff[0] * data[i-1];
1181                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1182                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1183                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1184                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1185
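                                                /* Fold the two qword halves: shifting down 8 bytes and adding
                                                   leaves the whole prediction in lane 0, which
                                                   RESIDUAL32_RESULT extracts, shifts right by lp_quantization
                                                   and subtracts from data[i]. */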
1186                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1187                                                 RESIDUAL32_RESULT(xmm7);
1188                                         }
1189                                 }
1190                                 else { /* order == 11 */
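                                        /* Odd orders load the last, unpaired coefficient alone into lane 0
                                           with _mm_cvtsi32_si128 and start each sample's sum from that single
                                           product; orders 9, 7, 5 and 3 below follow the same pattern. */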
1191                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1192                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1193                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1194                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1195                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1196                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
1197                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
1198
1199                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1200                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1201                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1202                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1203                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
1204
1205                                         for(i = 0; i < (int)data_len; i++) {
1206                                                 //sum = 0;
1207                                                 //sum  = qlp_coeff[10] * data[i-11];
1208                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
1209                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
1210
1211                                                 //sum += qlp_coeff[9] * data[i-10];
1212                                                 //sum += qlp_coeff[8] * data[i-9];
1213                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
1214                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1215                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
1216                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1217
1218                                                 //sum += qlp_coeff[7] * data[i-8];
1219                                                 //sum += qlp_coeff[6] * data[i-7];
1220                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1221                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1222                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
1223                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1224
1225                                                 //sum += qlp_coeff[5] * data[i-6];
1226                                                 //sum += qlp_coeff[4] * data[i-5];
1227                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1228                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1229                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1230                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1231
1232                                                 //sum += qlp_coeff[3] * data[i-4];
1233                                                 //sum += qlp_coeff[2] * data[i-3];
1234                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1235                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1236                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1237                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1238
1239                                                 //sum += qlp_coeff[1] * data[i-2];
1240                                                 //sum += qlp_coeff[0] * data[i-1];
1241                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1242                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1243                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1244                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1245
1246                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1247                                                 RESIDUAL32_RESULT(xmm7);
1248                                         }
1249                                 }
1250                         }
1251                         else { /* order == 9, 10 */
1252                                 if(order == 10) {
1253                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
1254                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1255                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1256                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1257                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1258                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
1259
1260                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1261                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1262                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1263                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1264                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
1265
1266                                         for(i = 0; i < (int)data_len; i++) {
1267                                                 //sum = 0;
1268                                                 //sum += qlp_coeff[9] * data[i-10];
1269                                                 //sum += qlp_coeff[8] * data[i-9];
1270                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
1271                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1272                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
1273
1274                                                 //sum += qlp_coeff[7] * data[i-8];
1275                                                 //sum += qlp_coeff[6] * data[i-7];
1276                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1277                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1278                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
1279                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1280
1281                                                 //sum += qlp_coeff[5] * data[i-6];
1282                                                 //sum += qlp_coeff[4] * data[i-5];
1283                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1284                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1285                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1286                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1287
1288                                                 //sum += qlp_coeff[3] * data[i-4];
1289                                                 //sum += qlp_coeff[2] * data[i-3];
1290                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1291                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1292                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1293                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1294
1295                                                 //sum += qlp_coeff[1] * data[i-2];
1296                                                 //sum += qlp_coeff[0] * data[i-1];
1297                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1298                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1299                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1300                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1301
1302                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1303                                                 RESIDUAL32_RESULT(xmm7);
1304                                         }
1305                                 }
1306                                 else { /* order == 9 */
1307                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
1308                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1309                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1310                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1311                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1312                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
1313
1314                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1315                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1316                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1317                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1318
1319                                         for(i = 0; i < (int)data_len; i++) {
1320                                                 //sum = 0;
1321                                                 //sum  = qlp_coeff[8] * data[i-9];
1322                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
1323                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
1324
1325                                                 //sum += qlp_coeff[7] * data[i-8];
1326                                                 //sum += qlp_coeff[6] * data[i-7];
1327                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1328                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1329                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
1330                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1331
1332                                                 //sum += qlp_coeff[5] * data[i-6];
1333                                                 //sum += qlp_coeff[4] * data[i-5];
1334                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1335                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1336                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1337                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1338
1339                                                 //sum += qlp_coeff[3] * data[i-4];
1340                                                 //sum += qlp_coeff[2] * data[i-3];
1341                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1342                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1343                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1344                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1345
1346                                                 //sum += qlp_coeff[1] * data[i-2];
1347                                                 //sum += qlp_coeff[0] * data[i-1];
1348                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1349                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1350                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1351                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1352
1353                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1354                                                 RESIDUAL32_RESULT(xmm7);
1355                                         }
1356                                 }
1357                         }
1358                 }
1359                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1360                         if(order > 6) { /* order == 7, 8 */
1361                                 if(order == 8) {
1362                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1363                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1364                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1365                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1366                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1367
1368                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1369                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1370                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1371                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1372
1373                                         for(i = 0; i < (int)data_len; i++) {
1374                                                 //sum = 0;
1375                                                 //sum += qlp_coeff[7] * data[i-8];
1376                                                 //sum += qlp_coeff[6] * data[i-7];
1377                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1378                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1379                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1380
1381                                                 //sum += qlp_coeff[5] * data[i-6];
1382                                                 //sum += qlp_coeff[4] * data[i-5];
1383                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1384                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1385                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1386                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1387
1388                                                 //sum += qlp_coeff[3] * data[i-4];
1389                                                 //sum += qlp_coeff[2] * data[i-3];
1390                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1391                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1392                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1393                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1394
1395                                                 //sum += qlp_coeff[1] * data[i-2];
1396                                                 //sum += qlp_coeff[0] * data[i-1];
1397                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1398                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1399                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1400                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1401
1402                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1403                                                 RESIDUAL32_RESULT(xmm7);
1404                                         }
1405                                 }
1406                                 else { /* order == 7 */
1407                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1408                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1409                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1410                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1411                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
1412
1413                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1414                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1415                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1416
1417                                         for(i = 0; i < (int)data_len; i++) {
1418                                                 //sum = 0;
1419                                                 //sum  = qlp_coeff[6] * data[i-7];
1420                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
1421                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1422
1423                                                 //sum += qlp_coeff[5] * data[i-6];
1424                                                 //sum += qlp_coeff[4] * data[i-5];
1425                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1426                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1427                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1428                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1429
1430                                                 //sum += qlp_coeff[3] * data[i-4];
1431                                                 //sum += qlp_coeff[2] * data[i-3];
1432                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1433                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1434                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1435                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1436
1437                                                 //sum += qlp_coeff[1] * data[i-2];
1438                                                 //sum += qlp_coeff[0] * data[i-1];
1439                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1440                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1441                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1442                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1443
1444                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1445                                                 RESIDUAL32_RESULT(xmm7);
1446                                         }
1447                                 }
1448                         }
1449                         else { /* order == 5, 6 */
1450                                 if(order == 6) {
1451                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1452                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1453                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1454                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1455
1456                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1457                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1458                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1459
1460                                         for(i = 0; i < (int)data_len; i++) {
1461                                                 //sum = 0;
1462                                                 //sum += qlp_coeff[5] * data[i-6];
1463                                                 //sum += qlp_coeff[4] * data[i-5];
1464                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1465                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1466                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1467
1468                                                 //sum += qlp_coeff[3] * data[i-4];
1469                                                 //sum += qlp_coeff[2] * data[i-3];
1470                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1471                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1472                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1473                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1474
1475                                                 //sum += qlp_coeff[1] * data[i-2];
1476                                                 //sum += qlp_coeff[0] * data[i-1];
1477                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1478                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1479                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1480                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1481
1482                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1483                                                 RESIDUAL32_RESULT(xmm7);
1484                                         }
1485                                 }
1486                                 else { /* order == 5 */
1487                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1488                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1489                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1490                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
1491
1492                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1493                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1494
1495                                         for(i = 0; i < (int)data_len; i++) {
1496                                                 //sum = 0;
1497                                                 //sum  = qlp_coeff[4] * data[i-5];
1498                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
1499                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1500
1501                                                 //sum += qlp_coeff[3] * data[i-4];
1502                                                 //sum += qlp_coeff[2] * data[i-3];
1503                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1504                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1505                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1506                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1507
1508                                                 //sum += qlp_coeff[1] * data[i-2];
1509                                                 //sum += qlp_coeff[0] * data[i-1];
1510                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1511                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1512                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1513                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1514
1515                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1516                                                 RESIDUAL32_RESULT(xmm7);
1517                                         }
1518                                 }
1519                         }
1520                 }
1521                 else { /* order == 1, 2, 3, 4 */
1522                         if(order > 2) { /* order == 3, 4 */
1523                                 if(order == 4) {
1524                                         __m128i xmm0, xmm1, xmm6, xmm7;
1525                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1526                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1527
1528                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1529                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1530
1531                                         for(i = 0; i < (int)data_len; i++) {
1532                                                 //sum = 0;
1533                                                 //sum += qlp_coeff[3] * data[i-4];
1534                                                 //sum += qlp_coeff[2] * data[i-3];
1535                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1536                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1537                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1538
1539                                                 //sum += qlp_coeff[1] * data[i-2];
1540                                                 //sum += qlp_coeff[0] * data[i-1];
1541                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1542                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1543                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1544                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1545
1546                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1547                                                 RESIDUAL32_RESULT(xmm7);
1548                                         }
1549                                 }
1550                                 else { /* order == 3 */
1551                                         __m128i xmm0, xmm1, xmm6, xmm7;
1552                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1553                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1554
1555                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1556
1557                                         for(i = 0; i < (int)data_len; i++) {
1558                                                 //sum = 0;
1559                                                 //sum  = qlp_coeff[2] * data[i-3];
1560                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1561                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1562
1563                                                 //sum += qlp_coeff[1] * data[i-2];
1564                                                 //sum += qlp_coeff[0] * data[i-1];
1565                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1566                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1567                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1568                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
1569
1570                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1571                                                 RESIDUAL32_RESULT(xmm7);
1572                                         }
1573                                 }
1574                         }
1575                         else { /* order == 1, 2 */
1576                                 if(order == 2) {
1577                                         __m128i xmm0, xmm7;
1578                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1579                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1580
1581                                         for(i = 0; i < (int)data_len; i++) {
1582                                                 //sum = 0;
1583                                                 //sum += qlp_coeff[1] * data[i-2];
1584                                                 //sum += qlp_coeff[0] * data[i-1];
1585                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1586                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1587                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
1588
1589                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1590                                                 RESIDUAL32_RESULT(xmm7);
1591                                         }
1592                                 }
1593                                 else { /* order == 1 */
1594                                         for(i = 0; i < (int)data_len; i++)
1595                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1596                                 }
1597                         }
1598                 }
1599         }
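        /* Note on the SIMD branches above: _MM_SHUFFLE(3,1,2,0) routes the
           coefficient pairs and _MM_SHUFFLE(2,0,3,1) the history pairs into the
           even dword lanes read by _mm_mul_epu32, pairing qlp_coeff[2k] with
           data[i-2k-1] and qlp_coeff[2k+1] with data[i-2k-2]; one multiply thus
           covers two taps, and _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8))
           folds the second product onto the first before RESIDUAL32_RESULT
           extracts lane 0. */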
1600         else { /* order > 12 */
1601                 FLAC__int32 sum;
1602                 for(i = 0; i < (int)data_len; i++) {
1603                         sum = 0;
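                        /* Duff's-device-style switch: every case falls through
                           intentionally, so the jump lands on the highest tap for this
                           order and all remaining taps are accumulated below it. */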
1604                         switch(order) {
1605                                 case 32: sum += qlp_coeff[31] * data[i-32];
1606                                 case 31: sum += qlp_coeff[30] * data[i-31];
1607                                 case 30: sum += qlp_coeff[29] * data[i-30];
1608                                 case 29: sum += qlp_coeff[28] * data[i-29];
1609                                 case 28: sum += qlp_coeff[27] * data[i-28];
1610                                 case 27: sum += qlp_coeff[26] * data[i-27];
1611                                 case 26: sum += qlp_coeff[25] * data[i-26];
1612                                 case 25: sum += qlp_coeff[24] * data[i-25];
1613                                 case 24: sum += qlp_coeff[23] * data[i-24];
1614                                 case 23: sum += qlp_coeff[22] * data[i-23];
1615                                 case 22: sum += qlp_coeff[21] * data[i-22];
1616                                 case 21: sum += qlp_coeff[20] * data[i-21];
1617                                 case 20: sum += qlp_coeff[19] * data[i-20];
1618                                 case 19: sum += qlp_coeff[18] * data[i-19];
1619                                 case 18: sum += qlp_coeff[17] * data[i-18];
1620                                 case 17: sum += qlp_coeff[16] * data[i-17];
1621                                 case 16: sum += qlp_coeff[15] * data[i-16];
1622                                 case 15: sum += qlp_coeff[14] * data[i-15];
1623                                 case 14: sum += qlp_coeff[13] * data[i-14];
1624                                 case 13: sum += qlp_coeff[12] * data[i-13];
1625                                          sum += qlp_coeff[11] * data[i-12];
1626                                          sum += qlp_coeff[10] * data[i-11];
1627                                          sum += qlp_coeff[ 9] * data[i-10];
1628                                          sum += qlp_coeff[ 8] * data[i- 9];
1629                                          sum += qlp_coeff[ 7] * data[i- 8];
1630                                          sum += qlp_coeff[ 6] * data[i- 7];
1631                                          sum += qlp_coeff[ 5] * data[i- 6];
1632                                          sum += qlp_coeff[ 4] * data[i- 5];
1633                                          sum += qlp_coeff[ 3] * data[i- 4];
1634                                          sum += qlp_coeff[ 2] * data[i- 3];
1635                                          sum += qlp_coeff[ 1] * data[i- 2];
1636                                          sum += qlp_coeff[ 0] * data[i- 1];
1637                         }
1638                         residual[i] = data[i] - (sum >> lp_quantization);
1639                 }
1640         }
1641 }
1642
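/* A minimal scalar sketch of what every SSE2 branch of the function above
 * computes; the name is hypothetical and the block is compiled out, since the
 * authoritative generic fallback is
 * FLAC__lpc_compute_residual_from_qlp_coefficients() in lpc.c:
 */
#if 0
static void residual_scalar_sketch(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	unsigned j;
	for(i = 0; i < (int)data_len; i++) {
		FLAC__int32 sum = 0;
		for(j = 0; j < order; j++)
			sum += qlp_coeff[j] * data[i-(int)(j+1)]; /* taps reach into the warm-up history before data[0] */
		residual[i] = data[i] - (sum >> lp_quantization);
	}
}
#endif
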
1643 FLAC__SSE_TARGET("sse2")
1644 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1645 {
1646         int i;
1647         FLAC__int32 sum;
1648         if(order < 8) {
1649                 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
1650                 return;
1651         }
1652
1653         FLAC__ASSERT(order > 0);
1654         FLAC__ASSERT(order <= 32);
1655         FLAC__ASSERT(data_len > 0);
1656
1657         if(order <= 12) {
1658                 FLAC__int32 curr;
1659                 if(order > 8) { /* order == 9, 10, 11, 12 */
1660 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
1661                         int r;
1662                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1663                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1664                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1665                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1666                         switch(order)                                          /* ...and zero them out */
1667                         {
1668                         case 9:
1669                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1670                         case 10:
1671                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1672                         case 11:
1673                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1674                         }
1675                         xmm2 = _mm_setzero_si128();
1676                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1677                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1678
1679                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1680                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1681                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1682                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1683                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1684                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1685                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1686                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
1687
1688                         xmm7 = _mm_slli_si128(xmm1, 2);
1689                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1690                         xmm2 = _mm_slli_si128(xmm0, 2);
1691
1692                         /* xmm0, xmm1: qlp_coeff
1693                            xmm2, xmm7: qlp_coeff << 16 bit
1694                            xmm3, xmm4: data */
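                        /* The madd sequence below is the whole 12-tap dot product: each
                           _mm_madd_epi16 produces, per 32-bit lane, the sum of the two
                           adjacent int16 products
                               lane[k] = d16[2k]*q16[2k] + d16[2k+1]*q16[2k+1]
                           and the two _mm_srli_si128/_mm_add_epi32 steps reduce the four
                           lanes of xmm6 to the complete prediction in lane 0. */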
1695
1696                         xmm6 = xmm4;
1697                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1698                         xmm5 = xmm3;
1699                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1700                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1701                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1702                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1703
1704                         DATA16_RESULT(xmm6);
1705
1706                         data_len--;
1707                         r = data_len % 2;
1708
1709                         if(r) {
1710                                 xmm4 = _mm_slli_si128(xmm4, 2);
1711                                 xmm6 = xmm3;
1712                                 xmm3 = _mm_slli_si128(xmm3, 2);
1713                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1714                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1715
1716                                 xmm6 = xmm4;
1717                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1718                                 xmm5 = xmm3;
1719                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1720                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1721                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1722                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1723
1724                                 DATA16_RESULT(xmm6);
1725
1726                                 data_len--;
1727                         }
1728
1729                         while(data_len) { /* data_len is a multiple of 2 */
1730                                 /* one _mm_slli_si128 fewer per data element, at the cost of keeping the shifted qlp_coeff copies in xmm2:xmm7 */
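                                /* Two outputs per iteration: the history is shifted by two
                                   lanes at once, so the first madd uses the coefficient
                                   copies shifted by one lane (xmm2:xmm7) while the window is
                                   still one sample short, and the second madd, after
                                   inserting that result at lane 0, uses the unshifted
                                   coefficients (xmm0:xmm1). */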
1731                                 xmm4 = _mm_slli_si128(xmm4, 4);
1732                                 xmm6 = xmm3;
1733                                 xmm3 = _mm_slli_si128(xmm3, 4);
1734                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
1735                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1736
1737                                 xmm6 = xmm4;
1738                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1739                                 xmm5 = xmm3;
1740                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1741                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1742                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1743                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1744
1745                                 DATA16_RESULT(xmm6);
1746
1747                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1748
1749                                 xmm6 = xmm4;
1750                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1751                                 xmm5 = xmm3;
1752                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1753                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1754                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1755                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1756
1757                                 DATA16_RESULT(xmm6);
1758
1759                                 data_len-=2;
1760                         }
1761 #else /* 16 XMM registers available */
1762                         int r;
1763                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
1764                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1765                         xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1766                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1767                         switch(order)                                          /* ...and zero them out */
1768                         {
1769                         case 9:
1770                                 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1771                         case 10:
1772                                 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1773                         case 11:
1774                                 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1775                         }
1776                         xmm2 = _mm_setzero_si128();
1777                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
1778                         xmm1 = _mm_packs_epi32(xmm1, xmm2);
1779
1780                         xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1781                         xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1782                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1783                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1784                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1785                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1786                         xmm4 = _mm_packs_epi32(xmm4, xmm2);
1787                         xmm3 = _mm_packs_epi32(xmm3, xmm5);
1788
1789                         xmm7 = _mm_slli_si128(xmm1, 2);
1790                         xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1791                         xmm2 = _mm_slli_si128(xmm0, 2);
1792
1793                         xmm9 = _mm_slli_si128(xmm1, 4);
1794                         xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
1795                         xmm8 = _mm_slli_si128(xmm0, 4);
1796
1797                         xmmB = _mm_slli_si128(xmm1, 6);
1798                         xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
1799                         xmmA = _mm_slli_si128(xmm0, 6);
1800
1801                         /* xmm0, xmm1: qlp_coeff
1802                            xmm2, xmm7: qlp_coeff << 16 bit
1803                            xmm8, xmm9: qlp_coeff << 2*16 bit
1804                            xmmA, xmmB: qlp_coeff << 3*16 bit
1805                            xmm3, xmm4: data */
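                        /* With sixteen XMM registers there is room for coefficient copies
                           pre-shifted by one, two and three lanes, so the steady-state loop
                           below shifts the history only once per four restored samples
                           (one 8-byte _mm_slli_si128) and picks the matching alignment per
                           sample: xmmA/xmmB, then xmm8/xmm9, xmm2/xmm7 and finally
                           xmm0/xmm1. */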
1806
1807                         xmm6 = xmm4;
1808                         xmm6 = _mm_madd_epi16(xmm6, xmm1);
1809                         xmm5 = xmm3;
1810                         xmm5 = _mm_madd_epi16(xmm5, xmm0);
1811                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1812                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1813                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1814
1815                         DATA16_RESULT(xmm6);
1816
1817                         data_len--;
1818                         r = data_len % 4;
1819
1820                         while(r) {
1821                                 xmm4 = _mm_slli_si128(xmm4, 2);
1822                                 xmm6 = xmm3;
1823                                 xmm3 = _mm_slli_si128(xmm3, 2);
1824                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1825                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1826
1827                                 xmm6 = xmm4;
1828                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1829                                 xmm5 = xmm3;
1830                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1831                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1832                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1833                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1834
1835                                 DATA16_RESULT(xmm6);
1836
1837                                 data_len--; r--;
1838                         }
1839
1840                         while(data_len) { /* data_len is a multiple of 4 */
1841                                 xmm4 = _mm_slli_si128(xmm4, 8);
1842                                 xmm6 = xmm3;
1843                                 xmm3 = _mm_slli_si128(xmm3, 8);
1844                                 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
1845
1846                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1847
1848                                 xmm6 = xmm4;
1849                                 xmm6 = _mm_madd_epi16(xmm6, xmmB);
1850                                 xmm5 = xmm3;
1851                                 xmm5 = _mm_madd_epi16(xmm5, xmmA);
1852                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1853                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1854                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1855
1856                                 DATA16_RESULT(xmm6);
1857
1858                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1859
1860                                 xmm6 = xmm4;
1861                                 xmm6 = _mm_madd_epi16(xmm6, xmm9);
1862                                 xmm5 = xmm3;
1863                                 xmm5 = _mm_madd_epi16(xmm5, xmm8);
1864                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1865                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1866                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1867
1868                                 DATA16_RESULT(xmm6);
1869
1870                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1871
1872                                 xmm6 = xmm4;
1873                                 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1874                                 xmm5 = xmm3;
1875                                 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1876                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1877                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1878                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1879
1880                                 DATA16_RESULT(xmm6);
1881
1882                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1883
1884                                 xmm6 = xmm4;
1885                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1886                                 xmm5 = xmm3;
1887                                 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1888                                 xmm6 = _mm_add_epi32(xmm6, xmm5);
1889                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1890                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1891
1892                                 DATA16_RESULT(xmm6);
1893
1894                                 data_len-=4;
1895                         }
1896 #endif
1897                 } /* endif(order > 8) */
1898                 else if(order > 4) { /* order == 5, 6, 7, 8 */
1899                         if(order > 6) { /* order == 7, 8 */
1900                                 if(order == 8) {
1901                                         __m128i xmm0, xmm1, xmm3, xmm6;
1902                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1903                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1904                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1905
1906                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1907                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1908                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1909                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1910                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1911
1912                                         /* xmm0: qlp_coeff
1913                                            xmm3: data */
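                                        /* order == 8 exactly fills the eight 16-bit lanes, so no
                                           shifted coefficient copies and no remainder peel are
                                           needed: each iteration shifts the history by one lane,
                                           inserts the just-restored sample at lane 0 and runs a
                                           single madd/reduce. */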
1914
1915                                         xmm6 = xmm3;
1916                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1917                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1918                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1919
1920                                         DATA16_RESULT(xmm6);
1921
1922                                         data_len--;
1923
1924                                         while(data_len) {
1925                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1926                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1927
1928                                                 xmm6 = xmm3;
1929                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1930                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1931                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1932
1933                                                 DATA16_RESULT(xmm6);
1934
1935                                                 data_len--;
1936                                         }
1937                                 }
1938                                 else { /* order == 7 */
1939                                         int r;
1940                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
1941                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1942                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1943                                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
1944                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
1945
1946                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1947                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1948                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1949                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1950                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
1951                                         xmm2 = _mm_slli_si128(xmm0, 2);
1952
1953                                         /* xmm0: qlp_coeff
1954                                            xmm2: qlp_coeff << 16 bit
1955                                            xmm3: data */
1956
1957                                         xmm6 = xmm3;
1958                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
1959                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1960                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1961
1962                                         DATA16_RESULT(xmm6);
1963
1964                                         data_len--;
1965                                         r = data_len % 2;
1966
1967                                         if(r) {
1968                                                 xmm3 = _mm_slli_si128(xmm3, 2);
1969                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1970
1971                                                 xmm6 = xmm3;
1972                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1973                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1974                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1975
1976                                                 DATA16_RESULT(xmm6);
1977
1978                                                 data_len--;
1979                                         }
1980
1981                                         while(data_len) { /* data_len is a multiple of 2 */
1982                                                 xmm3 = _mm_slli_si128(xmm3, 4);
1983                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1984
1985                                                 xmm6 = xmm3;
1986                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1987                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1988                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1989
1990                                                 DATA16_RESULT(xmm6);
1991
1992                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1993                                                 xmm6 = xmm3;
1994                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1995                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1996                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1997
1998                                                 DATA16_RESULT(xmm6);
1999
2000                                                 data_len-=2;
2001                                         }
2002                                 }
2003                         }
2004                         else { /* order == 5, 6 */
2005                                 if(order == 6) {
2006                                         int r;
2007                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
2008                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
2009                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
2010                                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
2011                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
2012
2013                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
2014                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
2015                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
2016                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
2017                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
2018                                         xmm2 = _mm_slli_si128(xmm0, 2);
2019                                         xmm4 = _mm_slli_si128(xmm0, 4);
2020
2021                                         /* xmm0: qlp_coeff
2022                                            xmm2: qlp_coeff << 16 bit
2023                                            xmm4: qlp_coeff << 2*16 bit
2024                                            xmm3: data */
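                                        /* order == 6 leaves two free lanes, enough for coefficient
                                           copies shifted by one and two lanes (xmm2, xmm4), so the
                                           steady state restores three samples per 6-byte history
                                           shift; hence the data_len % 3 remainder peel below. */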
2025
2026                                         xmm6 = xmm3;
2027                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
2028                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2029                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2030
2031                                         DATA16_RESULT(xmm6);
2032
2033                                         data_len--;
2034                                         r = data_len % 3;
2035
2036                                         while(r) {
2037                                                 xmm3 = _mm_slli_si128(xmm3, 2);
2038                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2039
2040                                                 xmm6 = xmm3;
2041                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2042                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2043                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2044
2045                                                 DATA16_RESULT(xmm6);
2046
2047                                                 data_len--; r--;
2048                                         }
2049
2050                                         while(data_len) { /* data_len is a multiple of 3 */
2051                                                 xmm3 = _mm_slli_si128(xmm3, 6);
2052                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
2053
2054                                                 xmm6 = xmm3;
2055                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
2056                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2057                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2058
2059                                                 DATA16_RESULT(xmm6);
2060
2061                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
2062
2063                                                 xmm6 = xmm3;
2064                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
2065                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2066                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2067
2068                                                 DATA16_RESULT(xmm6);
2069
2070                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2071
2072                                                 xmm6 = xmm3;
2073                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2074                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2075                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2076
2077                                                 DATA16_RESULT(xmm6);
2078
2079                                                 data_len-=3;
2080                                         }
2081                                 }
2082                                 else { /* order == 5 */
2083                                         int r;
2084                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2085                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
2086                                         xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
2087                                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
2088                                         xmm0 = _mm_packs_epi32(xmm0, xmm1);
2089
2090                                         xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
2091                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
2092                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
2093                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
2094                                         xmm3 = _mm_packs_epi32(xmm3, xmm1);
2095                                         xmm2 = _mm_slli_si128(xmm0, 2);
2096                                         xmm4 = _mm_slli_si128(xmm0, 4);
2097                                         xmm5 = _mm_slli_si128(xmm0, 6);
2098
2099                                         /* xmm0: qlp_coeff
2100                                            xmm2: qlp_coeff << 16 bit
2101                                            xmm4: qlp_coeff << 2*16 bit
2102                                            xmm5: qlp_coeff << 3*16 bit
2103                                            xmm3: data */
2104
2105                                         xmm6 = xmm3;
2106                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
2107                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2108                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2109
2110                                         DATA16_RESULT(xmm6);
2111
2112                                         data_len--;
2113                                         r = data_len % 4;
2114
2115                                         while(r) {
2116                                                 xmm3 = _mm_slli_si128(xmm3, 2);
2117                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2118
2119                                                 xmm6 = xmm3;
2120                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2121                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2122                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2123
2124                                                 DATA16_RESULT(xmm6);
2125
2126                                                 data_len--; r--;
2127                                         }
2128
2129                                         while(data_len) { /* data_len is a multiple of 4 */
2130                                                 xmm3 = _mm_slli_si128(xmm3, 8);
2131                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
2132
2133                                                 xmm6 = xmm3;
2134                                                 xmm6 = _mm_madd_epi16(xmm6, xmm5);
2135                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2136                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2137
2138                                                 DATA16_RESULT(xmm6);
2139
2140                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
2141
2142                                                 xmm6 = xmm3;
2143                                                 xmm6 = _mm_madd_epi16(xmm6, xmm4);
2144                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2145                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2146
2147                                                 DATA16_RESULT(xmm6);
2148
2149                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
2150
2151                                                 xmm6 = xmm3;
2152                                                 xmm6 = _mm_madd_epi16(xmm6, xmm2);
2153                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2154                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2155
2156                                                 DATA16_RESULT(xmm6);
2157
2158                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2159
2160                                                 xmm6 = xmm3;
2161                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2162                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
2163                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2164
2165                                                 DATA16_RESULT(xmm6);
2166
2167                                                 data_len-=4;
2168                                         }
2169                                 }
2170                         }
2171                 }
2172                 else { /* order == 1, 2, 3, 4 */
2173                         if(order > 2) { /* order == 3, 4 */
2174                                 if(order == 4) {
2175                                         __m128i xmm0, xmm3, xmm6;
2176                                         xmm6 = _mm_setzero_si128();
2177                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
2178                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
2179
2180                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
2181                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
2182                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
2183
2184                                         /* xmm0: qlp_coeff
2185                                            xmm3: data */
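                                        /* With only four taps the upper half of each register is
                                           zero after the packs above, so a single
                                           _mm_srli_si128(xmm6, 4) reduction (rather than two)
                                           already brings the whole sum into lane 0. */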
2186
2187                                         xmm6 = xmm3;
2188                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
2189                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2190
2191                                         DATA16_RESULT(xmm6);
2192
2193                                         data_len--;
2194
2195                                         while(data_len) {
2196                                                 xmm3 = _mm_slli_si128(xmm3, 2);
2197                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2198
2199                                                 xmm6 = xmm3;
2200                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2201                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2202
2203                                                 DATA16_RESULT(xmm6);
2204
2205                                                 data_len--;
2206                                         }
2207                                 }
2208                                 else { /* order == 3 */
2209                                         int r;
2210                                         __m128i xmm0, xmm1, xmm3, xmm6;
2211                                         xmm6 = _mm_setzero_si128();
2212                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
2213                                         xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
2214                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
2215
2216                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
2217                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
2218                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
2219                                         xmm1 = _mm_slli_si128(xmm0, 2);
2220
2221                                         /* xmm0: qlp_coeff
2222                                            xmm1: qlp_coeff << 16 bit
2223                                            xmm3: data */
2224
2225                                         xmm6 = xmm3;
2226                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
2227                                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2228
2229                                         DATA16_RESULT(xmm6);
2230
2231                                         data_len--;
2232                                         r = data_len % 2;
2233
2234                                         if(r) {
2235                                                 xmm3 = _mm_slli_si128(xmm3, 2);
2236                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2237
2238                                                 xmm6 = xmm3;
2239                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2240                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2241
2242                                                 DATA16_RESULT(xmm6);
2243
2244                                                 data_len--;
2245                                         }
2246
2247                                         while(data_len) { /* data_len is a multiple of 2 */
2248                                                 xmm3 = _mm_slli_si128(xmm3, 4);
2249
2250                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
2251
2252                                                 xmm6 = xmm3;
2253                                                 xmm6 = _mm_madd_epi16(xmm6, xmm1);
2254                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2255
2256                                                 DATA16_RESULT(xmm6);
2257
2258                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2259
2260                                                 xmm6 = xmm3;
2261                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2262                                                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
2263
2264                                                 DATA16_RESULT(xmm6);
2265
2266                                                 data_len-=2;
2267                                         }
2268                                 }
2269                         }
2270                         else { /* order == 1, 2 */
2271                                 if(order == 2) {
2272                                         __m128i xmm0, xmm3, xmm6;
2273                                         xmm6 = _mm_setzero_si128();
2274                                         xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
2275                                         xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
2276                                         xmm0 = _mm_packs_epi32(xmm0, xmm6);
2277
2278                                         xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
2279                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
2280                                         xmm3 = _mm_packs_epi32(xmm3, xmm6);
2281
2282                                         /* xmm0: qlp_coeff
2283                                            xmm3: data */
2284
2285                                         xmm6 = xmm3;
2286                                         xmm6 = _mm_madd_epi16(xmm6, xmm0);
2287
2288                                         DATA16_RESULT(xmm6);
2289
2290                                         data_len--;
2291
2292                                         while(data_len) {
2293                                                 xmm3 = _mm_slli_si128(xmm3, 2);
2294                                                 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
2295
2296                                                 xmm6 = xmm3;
2297                                                 xmm6 = _mm_madd_epi16(xmm6, xmm0);
2298
2299                                                 DATA16_RESULT(xmm6);
2300
2301                                                 data_len--;
2302                                         }
2303                                 }
2304                                 else { /* order == 1 */
2305                                         for(i = 0; i < (int)data_len; i++)
2306                                                 data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
2307                                 }
2308                         }
2309                 }
2310         }
2311         else { /* order > 12 */
2312                 for(i = 0; i < (int)data_len; i++) {
2313                         sum = 0;
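                        /* As in the residual function above, every case falls through
                           intentionally so that exactly `order` taps are accumulated. */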
2314                         switch(order) {
2315                                 case 32: sum += qlp_coeff[31] * data[i-32];
2316                                 case 31: sum += qlp_coeff[30] * data[i-31];
2317                                 case 30: sum += qlp_coeff[29] * data[i-30];
2318                                 case 29: sum += qlp_coeff[28] * data[i-29];
2319                                 case 28: sum += qlp_coeff[27] * data[i-28];
2320                                 case 27: sum += qlp_coeff[26] * data[i-27];
2321                                 case 26: sum += qlp_coeff[25] * data[i-26];
2322                                 case 25: sum += qlp_coeff[24] * data[i-25];
2323                                 case 24: sum += qlp_coeff[23] * data[i-24];
2324                                 case 23: sum += qlp_coeff[22] * data[i-23];
2325                                 case 22: sum += qlp_coeff[21] * data[i-22];
2326                                 case 21: sum += qlp_coeff[20] * data[i-21];
2327                                 case 20: sum += qlp_coeff[19] * data[i-20];
2328                                 case 19: sum += qlp_coeff[18] * data[i-19];
2329                                 case 18: sum += qlp_coeff[17] * data[i-18];
2330                                 case 17: sum += qlp_coeff[16] * data[i-17];
2331                                 case 16: sum += qlp_coeff[15] * data[i-16];
2332                                 case 15: sum += qlp_coeff[14] * data[i-15];
2333                                 case 14: sum += qlp_coeff[13] * data[i-14];
2334                                 case 13: sum += qlp_coeff[12] * data[i-13];
2335                                          sum += qlp_coeff[11] * data[i-12];
2336                                          sum += qlp_coeff[10] * data[i-11];
2337                                          sum += qlp_coeff[ 9] * data[i-10];
2338                                          sum += qlp_coeff[ 8] * data[i- 9];
2339                                          sum += qlp_coeff[ 7] * data[i- 8];
2340                                          sum += qlp_coeff[ 6] * data[i- 7];
2341                                          sum += qlp_coeff[ 5] * data[i- 6];
2342                                          sum += qlp_coeff[ 4] * data[i- 5];
2343                                          sum += qlp_coeff[ 3] * data[i- 4];
2344                                          sum += qlp_coeff[ 2] * data[i- 3];
2345                                          sum += qlp_coeff[ 1] * data[i- 2];
2346                                          sum += qlp_coeff[ 0] * data[i- 1];
2347                         }
2348                         data[i] = residual[i] + (sum >> lp_quantization);
2349                 }
2350         }
2351 }
2352
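/* The matching scalar sketch for the restore direction (hypothetical name,
 * compiled out; the generic FLAC__lpc_restore_signal() in lpc.c, already used
 * as the short-order fallback above, is authoritative):
 */
#if 0
static void restore_scalar_sketch(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;
	unsigned j;
	for(i = 0; i < (int)data_len; i++) {
		FLAC__int32 sum = 0;
		for(j = 0; j < order; j++)
			sum += qlp_coeff[j] * data[i-(int)(j+1)];
		data[i] = residual[i] + (sum >> lp_quantization); /* feedback: restored samples feed later taps */
	}
}
#endif
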
2353 FLAC__SSE_TARGET("sse2")
2354 void FLAC__lpc_restore_signal_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
2355 {
2356         int i;
2357
2358         FLAC__ASSERT(order > 0);
2359         FLAC__ASSERT(order <= 32);
2360
2361         if(order <= 12) {
2362                 if(order > 8) { /* order == 9, 10, 11, 12 */
2363                         if(order > 10) { /* order == 11, 12 */
2364                                 if(order == 12) {
2365                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2366                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
2367                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
2368                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
2369                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
2370                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
2371                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
2372
2373                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
2374                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
2375                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
2376                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
2377                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
2378                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
2379
2380                                         for(i = 0; i < (int)data_len; i++) {
2381                                                 //sum = 0;
2382                                                 //sum += qlp_coeff[11] * data[i-12];
2383                                                 //sum += qlp_coeff[10] * data[i-11];
2384                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
2385                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
2386                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
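                                                /* In two's complement the low 32 bits of a product do
                                                   not depend on signedness, i.e. as a sketch
                                                   (FLAC__int32)((FLAC__uint32)a * (FLAC__uint32)b)
                                                   equals the wrapped signed a*b, so discarding the
                                                   high dword makes the unsigned multiply exact for
                                                   these signed operands. */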
2387
2388                                                 //sum += qlp_coeff[9] * data[i-10];
2389                                                 //sum += qlp_coeff[8] * data[i-9];
2390                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2391                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2392                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
2393                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2394
2395                                                 //sum += qlp_coeff[7] * data[i-8];
2396                                                 //sum += qlp_coeff[6] * data[i-7];
2397                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2398                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2399                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2400                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2401
2402                                                 //sum += qlp_coeff[5] * data[i-6];
2403                                                 //sum += qlp_coeff[4] * data[i-5];
2404                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2405                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2406                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2407                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2408
2409                                                 //sum += qlp_coeff[3] * data[i-4];
2410                                                 //sum += qlp_coeff[2] * data[i-3];
2411                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2412                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2413                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2414                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2415
2416                                                 //sum += qlp_coeff[1] * data[i-2];
2417                                                 //sum += qlp_coeff[0] * data[i-1];
2418                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2419                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2420                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2421                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
2422
2423                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2424                                                 DATA32_RESULT(xmm7);
2425                                         }
2426                                 }
2427                                 else { /* order == 11 */
2428                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2429                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2430                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2431                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2432                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2433                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
2434                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
2435
2436                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2437                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2438                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2439                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2440                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
2441
2442                                         for(i = 0; i < (int)data_len; i++) {
2443                                                 //sum = 0;
2444                                                 //sum  = qlp_coeff[10] * data[i-11];
2445                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
2446                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
2447
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

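						/* fold lane 2 onto lane 0 to finish the horizontal sum */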
						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
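						/* extract lane 0, apply the quantization shift and add the
						   residual to restore the sample */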
						DATA32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
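					/* order 10: five full coefficient pairs, so no scalar leftover */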
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						DATA32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
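					/* order 9: four coefficient pairs plus the odd coefficient
					   qlp_coeff[8], brought in as a scalar like qlp_coeff[10] above */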
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);