SIMD: Add const qualifier where appropriate
src/libFLAC/lpc_intrin_sse41.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#if defined FLAC__CPU_IA32 /* unused for x64 */

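/* Both macros extract one residual sample from the 64-bit prediction sum held in
 * the low lane of xmmN: residual[i] = data[i] - (sum >> lp_quantization). The
 * logical 64-bit shift is safe here even for negative sums: only the low 32 bits
 * of the shifted value are read, and with lp_quantization <= 32 those bits are
 * identical to what an arithmetic shift would give. RESIDUAL64_RESULT takes the
 * shift count from the preloaded `cnt` register; RESIDUAL64_RESULT1 passes
 * lp_quantization directly to _mm_srli_epi64. */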
#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
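        /* Per sample this computes the same thing as the plain C version in
         * lpc.c (cf. FLAC__lpc_compute_residual_from_qlp_coefficients_wide()):
         *
         *   FLAC__int64 sum = 0;
         *   for(j = 0; j < order; j++)
         *       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
         *   residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
         *
         * The branches below unroll this fully for each order up to 12. */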
        int i;
        const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);
        FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

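        /* _mm_mul_epi32 sign-extends and multiplies the low 32-bit element of
           each 64-bit lane, so each vector below carries two coefficient*sample
           products as two 64-bit lanes, accumulated with _mm_add_epi64. */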
        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                if(order == 12) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
                                        xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
                                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
                                                //sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
                                                xmm7 = _mm_mul_epi32(xmm7, xmm5);

                                                //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
                                                //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm4);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
                                                //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT1(xmm7);
                                        }
                                }
                                else { /* order == 11 */
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
                                        xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
                                                xmm7 = _mm_cvtsi32_si128(data[i-11]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm5);

                                                //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
                                                //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm4);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
                                                //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT1(xmm7);
                                        }
                                }
                        }
                        else { /* order == 9, 10 */
                                if(order == 10) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
                                                //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epi32(xmm7, xmm4);

                                                //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
                                                //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 9 */
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
                                        xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
                                                xmm7 = _mm_cvtsi32_si128(data[i-9]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm4);

                                                //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
                                                //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                        }
                }
                else if(order > 4) { /* order == 5, 6, 7, 8 */
                        if(order > 6) { /* order == 7, 8 */
                                if(order == 8) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
                                                //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epi32(xmm7, xmm3);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 7 */
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                xmm7 = _mm_cvtsi32_si128(data[i-7]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm3);

                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                        }
                        else { /* order == 5, 6 */
                                if(order == 6) {
                                        __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
                                                //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epi32(xmm7, xmm2);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 5 */
                                        __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
                                        xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
                                                xmm7 = _mm_cvtsi32_si128(data[i-5]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm2);

                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                        }
                }
                else { /* order == 1, 2, 3, 4 */
                        if(order > 2) { /* order == 3, 4 */
                                if(order == 4) {
                                        __m128i xmm0, xmm1, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
                                                //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epi32(xmm7, xmm1);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 3 */
                                        __m128i xmm0, xmm1, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
                                                xmm7 = _mm_cvtsi32_si128(data[i-3]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm1);

                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epi32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi64(xmm7, xmm6);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                        }
                        else { /* order == 1, 2 */
                                if(order == 2) {
                                        __m128i xmm0, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
                                                //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
                                                xmm7 = _mm_mul_epi32(xmm7, xmm0);

                                                xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                                else { /* order == 1 */
                                        __m128i xmm0, xmm7;
                                        xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
                                                xmm7 = _mm_cvtsi32_si128(data[i-1]);
                                                xmm7 = _mm_mul_epi32(xmm7, xmm0);
                                                RESIDUAL64_RESULT(xmm7);
                                        }
                                }
                        }
                }
        }
        else { /* order > 12 */
                FLAC__int64 sum;
                for(i = 0; i < (int)data_len; i++) {
                        sum = 0;
                        switch(order) {
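                                /* Each case intentionally falls through to the next:
                                   entering at `case order` accumulates every tap from
                                   qlp_coeff[order-1] down to qlp_coeff[0]. */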
                                case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
                                case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
                                case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
                                case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
                                case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
                                case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
                                case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
                                case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
                                case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
                                case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
                                case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
                                case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
                                case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
                                case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
                                case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
                                case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
                                case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
                                case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
                                case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
                                case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
                                         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
                                         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
                                         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
                                         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
                                         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
                                         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
                                         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
                                         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
                                         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
                                         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
                                         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
                                         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
                        }
                        residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
                }
        }
}

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
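        /* Inverse of the residual computation above; per sample it matches the
         * plain C version in lpc.c (cf. FLAC__lpc_restore_signal_wide()):
         *
         *   FLAC__int64 sum = 0;
         *   for(j = 0; j < order; j++)
         *       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
         *   data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
         */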
        int i;
        const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

        if (!data_len)
                return;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);
        FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                __m128i qlp[6], dat[6];
                                __m128i summ, temp;
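                                /* _mm_cvtepu32_epi64 zero-extends, which is safe even for
                                   negative coefficients: _mm_mul_epi32 reads (and
                                   sign-extends) only the low 32 bits of each 64-bit lane
                                   and ignores the upper bits. */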
                                qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));            // 0  q[1]  0  q[0]
                                qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));            // 0  q[3]  0  q[2]
                                qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));            // 0  q[5]  0  q[4]
                                qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));            // 0  q[7]  0  q[6]
                                qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8)));            // 0  q[9]  0  q[8]
                                if (order == 12)
                                        qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+10)));   // 0  q[11] 0  q[10]
                                else
                                        qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10]));                                  // 0    0   0  q[10]

                                dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-12)), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-12] 0  d[i-11]
                                dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-10] 0  d[i-9]
                                dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-8]  0  d[i-7]
                                dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-6]  0  d[i-5]
                                dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-4]  0  d[i-3]
                                dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));   // 0  d[i-2]  0  d[i-1]

                                summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
                                summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
                                summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
                                summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
                                summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
                                summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

                                summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));    // ?_64  sum_64
                                summ = _mm_srl_epi64(summ, cnt);                                                // ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
                                temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);     // ?  ?  ?  d[i]
                                data[0] = _mm_cvtsi128_si32(temp);

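                                /* Slide the sample window one position per iteration:
                                   each dat[k] takes the low half of its neighbour via
                                   _mm_alignr_epi8, and dat[0] picks up the sample just
                                   reconstructed into `temp`. */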
638                                 for(i = 1; i < (int)data_len; i++) {
639                                         temp = _mm_slli_si128(temp, 8);
640                                         dat[5] = _mm_alignr_epi8(dat[5], dat[4], 8);    //  ?  d[i-11] ?  d[i-10]
641                                         dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8);    //  ?  d[i-9]  ?  d[i-8]
642                                         dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);    //  ?  d[i-7]  ?  d[i-6]
643                                         dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);    //  ?  d[i-5]  ?  d[i-4]
644                                         dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);    //  ?  d[i-3]  ?  d[i-2]
645                                         dat[0] = _mm_alignr_epi8(dat[0],   temp, 8);    //  ?  d[i-1]  ?  d[i  ]
646
647                                         summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
648                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
649                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
650                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
651                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
652                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
653
654                                         summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));    // ?_64  sum_64
655                                         summ = _mm_srl_epi64(summ, cnt);                                                // ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
656                                         temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);     // ?  ?  ?  d[i]
657                                         data[i] = _mm_cvtsi128_si32(temp);
658                                 }
659                         }
660                         else { /* order == 9, 10 */
661                                 __m128i qlp[5], dat[5];
662                                 __m128i summ, temp;
663                                 qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
664                                 qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
665                                 qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
666                                 qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
667                                 if (order == 10)
668                                         qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8)));
669                                 else
670                                         qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
671
672                                 dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1));
673                                 dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
674                                 dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
675                                 dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
676                                 dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
677
678                                 summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
679                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
680                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
681                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
682                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
683
684                                 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
685                                 summ = _mm_srl_epi64(summ, cnt);
686                                 temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
687                                 data[0] = _mm_cvtsi128_si32(temp);
688
689                                 for(i = 1; i < (int)data_len; i++) {
690                                         temp = _mm_slli_si128(temp, 8);
691                                         dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8);
692                                         dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
693                                         dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
694                                         dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
695                                         dat[0] = _mm_alignr_epi8(dat[0],   temp, 8);
696
697                                         summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
698                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
699                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
700                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
701                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
702
703                                         summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
704                                         summ = _mm_srl_epi64(summ, cnt);
705                                         temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
706                                         data[i] = _mm_cvtsi128_si32(temp);
707                                 }
708                         }
709                 }
710                 else if(order > 4) { /* order == 5, 6, 7, 8 */
711                         if(order > 6) { /* order == 7, 8 */
712                                 __m128i qlp[4], dat[4];
713                                 __m128i summ, temp;
714                                 qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
715                                 qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
716                                 qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
717                                 if (order == 8)
718                                         qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
719                                 else
720                                         qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
721
722                                 dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
723                                 dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
724                                 dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
725                                 dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
726
727                                 summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
728                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
729                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
730                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
731
732                                 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
733                                 summ = _mm_srl_epi64(summ, cnt);
734                                 temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
735                                 data[0] = _mm_cvtsi128_si32(temp);
736
				for(i = 1; i < (int)data_len; i++) {
					temp = _mm_slli_si128(temp, 8);
					dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
					dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
					dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
					dat[0] = _mm_alignr_epi8(dat[0],   temp, 8);

					summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 5, 6 */
				__m128i qlp[3], dat[3];
				__m128i summ, temp;
				qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
				qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
				if (order == 6)
					qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
				else
					qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));

				dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
				dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
				dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));

				summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					temp = _mm_slli_si128(temp, 8);
					dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
					dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
					dat[0] = _mm_alignr_epi8(dat[0],   temp, 8);

					summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				__m128i qlp[2], dat[2];
				__m128i summ, temp;
				qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
				if (order == 4)
					qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
				else
					qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));

				dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
				dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));

				summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					temp = _mm_slli_si128(temp, 8);
					dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
					dat[0] = _mm_alignr_epi8(dat[0],   temp, 8);

					summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i qlp0, dat0;
					__m128i summ, temp;
					qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff)));

					dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));

					summ = _mm_mul_epi32(dat0, qlp0);

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						dat0 = _mm_alignr_epi8(dat0, _mm_slli_si128(temp, 8), 8);

						summ = _mm_mul_epi32(dat0, qlp0);

						summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
				else { /* order == 1 */
					__m128i qlp0;
					__m128i summ, temp;
					qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
					temp = _mm_cvtsi32_si128(data[-1]);

					summ = _mm_mul_epi32(temp, qlp0);
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						summ = _mm_mul_epi32(temp, qlp0);
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
			}
		}
	}
	else { /* order > 12 */
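		/* No SIMD kernel above order 12: plain C fallback with 64-bit accumulation. */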
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
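				/* The cases fall through on purpose, each adding one more tap to the prediction. */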
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* defined FLAC__CPU_IA32 */

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

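	/* Unlike the _wide variant above, this kernel multiplies and accumulates in
	 * 32 bits (_mm_mullo_epi32) and emits four residuals per loop iteration; as
	 * with the plain C version, the prediction sum is assumed to fit in 32 bits. */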
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
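					/* Broadcast each quantized coefficient to all four dwords so
					 * one multiply covers four consecutive samples. */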
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

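					/* Compute four residuals per iteration; the leftover
					 * data_len % 4 samples are handled by the scalar loop
					 * after these branches. */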
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
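		/* Scalar tail: finishes the remaining data_len % 4 samples (and the whole
		 * block when data_len < 4, since i is still 0 then). */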
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
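				/* Deliberate fallthrough again: one extra tap per case. */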
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
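		/* Orders above 12 use the plain C path throughout, with the same
		 * deliberate-fallthrough switch as above. */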
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */