lpc_intrin_sse41.c : Optimize decoding 24-bit files on 32-bit platforms.
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#if defined FLAC__CPU_IA32 /* unused for x64 */

#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
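
/*
 * Both macros expect the 64-bit prediction sum in the low qword of xmmN and
 * compute data[i] - (sum >> lp_quantization). Only a logical 64-bit shift is
 * available, but that is safe here: arithmetic and logical right shifts agree
 * in their low (64 - lp_quantization) bits, _mm_cvtsi128_si32() keeps only
 * the low 32 of them, and lp_quantization <= 32 is asserted below. A scalar
 * sketch of what each specialized loop computes (cf. the portable
 * implementation in lpc.c):
 *
 *   FLAC__int64 sum = 0;
 *   for(j = 0; j < (int)order; j++)
 *       sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
 *   residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
 */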
52
53 FLAC__SSE_TARGET("sse4.1")
54 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
55 {
56         int i;
57         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
58
59         FLAC__ASSERT(order > 0);
60         FLAC__ASSERT(order <= 32);
61         FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
62
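	/*
	 * The order is resolved through a binary decision tree so each
	 * specialized loop below runs with a fixed order. Inside a loop,
	 * coefficients and samples are shuffled into the even 32-bit lanes of a
	 * register so that _mm_mul_epi32() yields two signed 32x32->64-bit
	 * products per instruction, accumulated with _mm_add_epi64().
	 */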
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

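						// both qwords of xmm7 now hold partial sums; fold
						// the high qword into the low one so the macro can
						// shift and subtract the complete sum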
						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
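		/* orders 13..32 fall back to scalar code; each case below
		 * intentionally falls through so that exactly `order` products are
		 * accumulated, and the signed >> supplies the arithmetic shift that
		 * SSE lacks for 64-bit lanes */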
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

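	/* data[0] is reconstructed before the main loop below begins, so an
	   empty buffer must be rejected before any history is touched */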
	if (!data_len)
		return;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				__m128i qlp[6], dat[6];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));	// 0  0  q[1]  q[0]
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));	// 0  0  q[3]  q[2]
				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));	// 0  0  q[5]  q[4]
				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));	// 0  0  q[7]  q[6]
				qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));	// 0  0  q[9]  q[8]
				if (order == 12)
					qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10));	// 0  0  q[11] q[10]
				else
					qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]);					// 0  0  0     q[10]

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));	// 0  q[0]  0  q[1]
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));	// 0  q[2]  0  q[3]
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));	// 0  q[4]  0  q[5]
				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));	// 0  q[6]  0  q[7]
				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));	// 0  q[8]  0  q[9]
				qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1));	// 0  q[10] 0  q[11]
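
				/* Unlike the residual routine above, the coefficient pairs
				 * are swapped here so the samples can stay in natural order:
				 * the data registers are built with _mm_cvtepu32_epi64(),
				 * whose zero extension is harmless because _mm_mul_epi32()
				 * reads only the low 32 bits of each qword lane. */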

				dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12)));	// ?  d[i-11]  ?  d[i-12]
				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));	// ?  d[i-9]   ?  d[i-10]
				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));	// ?  d[i-7]   ?  d[i-8]
				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));	// ?  d[i-5]   ?  d[i-6]
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));	// ?  d[i-3]   ?  d[i-4]
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));	// ?  d[i-1]   ?  d[i-2]

				summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));	// ?_64  sum_64
				summ = _mm_srl_epi64(summ, cnt);						// ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
				temp = _mm_cvtsi32_si128(residual[0]);					// 0  0  0  r[i]
				temp = _mm_add_epi32(temp, summ);						// ?  ?  ?  d[i]
				data[0] = _mm_cvtsi128_si32(temp);

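				/* Rolling history window: rather than reloading past samples
				 * from memory, each iteration shifts the newest sample in
				 * via _mm_alignr_epi8(), which concatenates two registers
				 * and extracts 16 bytes starting at byte 8, advancing every
				 * lane by one sample. */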
644                                 for(i = 1; i < (int)data_len; i++) {
645                                         dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8);    //  ?  d[i-10] ?  d[i-11]
646                                         dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);    //  ?  d[i-8]  ?  d[i-9]
647                                         dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);    //  ?  d[i-6]  ?  d[i-7]
648                                         dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);    //  ?  d[i-4]  ?  d[i-5]
649                                         dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);    //  ?  d[i-2]  ?  d[i-3]
650                                         dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);    //  ?  d[i  ]  ?  d[i-1]
651
652                                         summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
653                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
654                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
655                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
656                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
657                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
658
659                                         summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));    // ?_64  sum_64
660                                         summ = _mm_srl_epi64(summ, cnt);                                                // ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
661                                         temp = _mm_cvtsi32_si128(residual[i]);                                  // 0  0  0  r[i]
662                                         temp = _mm_add_epi32(temp, summ);                                               // ?  ?  ?  d[i]
663                                         data[i] = _mm_cvtsi128_si32(temp);
664                                 }
665                         }
666                         else { /* order == 9, 10 */
667                                 __m128i qlp[5], dat[5];
668                                 __m128i summ, temp;
669                                 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
670                                 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
671                                 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
672                                 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
673                                 if (order == 10)
674                                         qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
675                                 else
676                                         qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]);
677
678                                 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
679                                 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
680                                 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
681                                 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
682                                 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));
683
684                                 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));
685                                 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
686                                 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
687                                 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
688                                 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
689
690                                 summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
691                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
692                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
693                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
694                                 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
695
696                                 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
697                                 summ = _mm_srl_epi64(summ, cnt);
698                                 temp = _mm_cvtsi32_si128(residual[0]);
699                                 temp = _mm_add_epi32(temp, summ);
700                                 data[0] = _mm_cvtsi128_si32(temp);
701
702                                 for(i = 1; i < (int)data_len; i++) {
703                                         dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);
704                                         dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
705                                         dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
706                                         dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
707                                         dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
708
709                                         summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
710                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
711                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
712                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
713                                         summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
714
715                                         summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
716                                         summ = _mm_srl_epi64(summ, cnt);
717                                         temp = _mm_cvtsi32_si128(residual[i]);
718                                         temp = _mm_add_epi32(temp, summ);
719                                         data[i] = _mm_cvtsi128_si32(temp);
720                                 }
721                         }
722                 }
723                 else if(order > 4) { /* order == 5, 6, 7, 8 */
724                         if(order > 6) { /* order == 7, 8 */
725                                 __m128i qlp[4], dat[4];
726                                 __m128i summ, temp;
727                                 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
728                                 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
729                                 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
730                                 if (order == 8)
731                                         qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
732                                 else
733                                         qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]);
734
735                                 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
736                                 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
737                                 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
738                                 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
739
740                                 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
741                                 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
742                                 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
743                                 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
744
				summ =                     _mm_mul_epi32(dat[3], qlp[3]);
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

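				/* Per-sample loop: slide the history window, form the 64-bit
				 * dot product, fold its two halves together, scale down by
				 * lp_quantization and add the residual to reconstruct data[i]. */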
				for(i = 1; i < (int)data_len; i++) {
					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[3], qlp[3]);
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 5, 6 */
				__m128i qlp[3], dat[3];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				if(order == 6)
					qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
				else
					qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));

				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6)));
				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4)));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2)));

				summ =                     _mm_mul_epi32(dat[2], qlp[2]);
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[2], qlp[2]);
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				__m128i qlp[2], dat[2];
				__m128i summ, temp;
				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
				if(order == 4)
					qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
				else
					qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]);

				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));

				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4)));
				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2)));

				summ =                     _mm_mul_epi32(dat[1], qlp[1]);
				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
				summ = _mm_srl_epi64(summ, cnt);
				temp = _mm_cvtsi32_si128(residual[0]);
				temp = _mm_add_epi32(temp, summ);
				data[0] = _mm_cvtsi128_si32(temp);

				for(i = 1; i < (int)data_len; i++) {
					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);

					summ =                     _mm_mul_epi32(dat[1], qlp[1]);
					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[i]);
					temp = _mm_add_epi32(temp, summ);
					data[i] = _mm_cvtsi128_si32(temp);
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i qlp0, dat0;
					__m128i summ, temp;
					qlp0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff));
					qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFFLE(2,0,3,1));

					dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2)));

					summ = _mm_mul_epi32(dat0, qlp0);

					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[0]);
					temp = _mm_add_epi32(temp, summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						dat0 = _mm_alignr_epi8(temp, dat0, 8);

						summ = _mm_mul_epi32(dat0, qlp0);

						summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_cvtsi32_si128(residual[i]);
						temp = _mm_add_epi32(temp, summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
				else { /* order == 1 */
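					/* Order 1 needs no window registers: temp always holds
					 * the previously reconstructed sample. */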
					__m128i qlp0;
					__m128i summ, temp;
					qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
					temp = _mm_cvtsi32_si128(data[-1]);

					summ = _mm_mul_epi32(temp, qlp0);
					summ = _mm_srl_epi64(summ, cnt);
					temp = _mm_cvtsi32_si128(residual[0]);
					temp = _mm_add_epi32(temp, summ);
					data[0] = _mm_cvtsi128_si32(temp);

					for(i = 1; i < (int)data_len; i++) {
						summ = _mm_mul_epi32(temp, qlp0);
						summ = _mm_srl_epi64(summ, cnt);
						temp = _mm_cvtsi32_si128(residual[i]);
						temp = _mm_add_epi32(temp, summ);
						data[i] = _mm_cvtsi128_si32(temp);
					}
				}
			}
		}
	}
	else { /* order > 12 */
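		/* Orders 13..32 fall back to scalar 64-bit code; the switch
		 * intentionally falls through so each case adds one more tap. */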
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* defined FLAC__CPU_IA32 */

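/* Non-wide version: sums are accumulated in 32 bits, so four residuals
 * are computed per iteration with 32-bit lane arithmetic. */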
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

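					/* Four residuals per iteration: each coefficient was
					 * broadcast to all lanes above and is multiplied against
					 * an unaligned four-sample window of the signal. */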
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
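		/* Handle the remaining data_len % 4 samples (and data_len < 4)
		 * with scalar code; the fall-through switch adds one tap per case. */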
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
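		/* Scalar fallback for orders 13..32, as above but with 32-bit sums. */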
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */