56da80f579906ba1234be4cf98329c2d5019d613
[flac.git] / src / libFLAC / lpc_intrin_sse41.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2013  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40 #include "private/lpc.h"
41 #ifdef FLAC__SSE4_1_SUPPORTED
42
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45
46 #include <smmintrin.h> /* SSE4.1 */
47
/*
 * Result-extraction macros: xmmN holds the 64-bit prediction sum in its low
 * qword; the macros shift it right by lp_quantization and combine the low
 * 32 bits of the quotient with data[i]/residual[i] to produce the residual
 * (encode direction) or the reconstructed sample (decode direction).
 */
48 #ifdef FLAC__CPU_IA32
/* IA32: no 64-bit GP registers, so the shift stays in the XMM domain.
 * SSE has no 64-bit *arithmetic* shift, so the *logical* _mm_srl_epi64 /
 * _mm_srli_epi64 is used instead; this is safe only because the caller
 * asserts lp_quantization <= 32 (see the assert in the function below),
 * which keeps the incorrectly-shifted-in high bits above the low 32 bits
 * that _mm_cvtsi128_si32 extracts. */
49 #define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
50 #define     DATA_RESULT(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
/* "...RESULT1" variants pass lp_quantization directly to _mm_srli_epi64
 * instead of going through the precomputed `cnt` XMM register. */
51 #define RESIDUAL_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
52 #define     DATA_RESULT1(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
53 #else
/* x86-64: move the 64-bit sum to a GP register and use the C `>>` operator,
 * which is a signed (arithmetic) shift here, then truncate to 32 bits. */
54 #define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
55 #define     DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
/* On x86-64 the immediate-count variants have no advantage, so they alias
 * the register-count versions. */
56 #define RESIDUAL_RESULT1(xmmN) RESIDUAL_RESULT(xmmN)
57 #define     DATA_RESULT1(xmmN) DATA_RESULT(xmmN)
58 #endif
59
60 FLAC__SSE_TARGET("sse4.1")
61 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
62 {
63         int i;
64 #ifdef FLAC__CPU_IA32
65         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
66 #endif
67
68         FLAC__ASSERT(order > 0);
69         FLAC__ASSERT(order <= 32);
70         FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_srai_epi64() so we have to use _mm_srli_epi64() */
71
72         if(order <= 12) {
73                 if(order > 8) { /* order == 9, 10, 11, 12 */
74                         if(order > 10) { /* order == 11, 12 */
75                                 if(order == 12) {
76                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
77                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
78                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
79                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
80                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
81                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
82                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
83
84                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
85                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
86                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
87                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
88                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
89                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
90
91                                         for(i = 0; i < (int)data_len; i++) {
92                                                 //sum = 0;
93                                                 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
94                                                 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
95                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
96                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
97                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
98
99                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
100                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
101                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
102                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
103                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
104                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
105
106                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
107                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
108                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
109                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
110                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
111                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
112
113                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
114                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
115                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
116                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
117                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
118                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
119
120                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
121                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
122                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
123                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
124                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
125                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
126
127                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
128                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
129                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
130                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
131                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
132                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
133
134                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
135                                                 RESIDUAL_RESULT1(xmm7);
136                                         }
137                                 }
138                                 else { /* order == 11 */
139                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
140                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
141                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
142                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
143                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
144                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
145                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
146
147                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
148                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
149                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
150                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
151                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
152
153                                         for(i = 0; i < (int)data_len; i++) {
154                                                 //sum = 0;
155                                                 //sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
156                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
157                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
158
159                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
160                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
161                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
162                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
163                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
164                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
165
166                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
167                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
168                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
169                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
170                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
171                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
172
173                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
174                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
175                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
176                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
177                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
178                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
179
180                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
181                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
182                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
183                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
184                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
185                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
186
187                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
188                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
189                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
190                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
191                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
192                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
193
194                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
195                                                 RESIDUAL_RESULT1(xmm7);
196                                         }
197                                 }
198                         }
199                         else { /* order == 9, 10 */
200                                 if(order == 10) {
201                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
202                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
203                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
204                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
205                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
206                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
207
208                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
209                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
210                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
211                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
212                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
213
214                                         for(i = 0; i < (int)data_len; i++) {
215                                                 //sum = 0;
216                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
217                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
218                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
219                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
220                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
221
222                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
223                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
224                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
225                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
226                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
227                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
228
229                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
230                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
231                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
232                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
233                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
234                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
235
236                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
237                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
238                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
239                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
240                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
241                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
242
243                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
244                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
245                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
246                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
247                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
248                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
249
250                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
251                                                 RESIDUAL_RESULT(xmm7);
252                                         }
253                                 }
254                                 else { /* order == 9 */
255                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
256                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
257                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
258                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
259                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
260                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
261
262                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
263                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
264                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
265                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
266
267                                         for(i = 0; i < (int)data_len; i++) {
268                                                 //sum = 0;
269                                                 //sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
270                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
271                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
272
273                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
274                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
275                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
276                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
277                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
278                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
279
280                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
281                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
282                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
283                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
284                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
285                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
286
287                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
288                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
289                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
290                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
291                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
292                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
293
294                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
295                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
296                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
297                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
298                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
299                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
300
301                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
302                                                 RESIDUAL_RESULT(xmm7);
303                                         }
304                                 }
305                         }
306                 }
307                 else if(order > 4) { /* order == 5, 6, 7, 8 */
308                         if(order > 6) { /* order == 7, 8 */
309                                 if(order == 8) {
310                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
311                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
312                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
313                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
314                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
315
316                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
317                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
318                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
319                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
320
321                                         for(i = 0; i < (int)data_len; i++) {
322                                                 //sum = 0;
323                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
324                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
325                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
326                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
327                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
328
329                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
330                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
331                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
332                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
333                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
334                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
335
336                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
337                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
338                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
339                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
340                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
341                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
342
343                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
344                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
345                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
346                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
347                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
348                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
349
350                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
351                                                 RESIDUAL_RESULT(xmm7);
352                                         }
353                                 }
354                                 else { /* order == 7 */
355                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
356                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
357                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
358                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
359                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
360
361                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
362                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
363                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
364
365                                         for(i = 0; i < (int)data_len; i++) {
366                                                 //sum = 0;
367                                                 //sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
368                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
369                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
370
371                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
372                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
373                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
374                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
375                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
376                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
377
378                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
379                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
380                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
381                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
382                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
383                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
384
385                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
386                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
387                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
388                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
389                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
390                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
391
392                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
393                                                 RESIDUAL_RESULT(xmm7);
394                                         }
395                                 }
396                         }
397                         else { /* order == 5, 6 */
398                                 if(order == 6) {
399                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
400                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
401                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
402                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
403
404                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
405                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
406                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
407
408                                         for(i = 0; i < (int)data_len; i++) {
409                                                 //sum = 0;
410                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
411                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
412                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
413                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
414                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
415
416                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
417                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
418                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
419                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
420                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
421                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
422
423                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
424                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
425                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
426                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
427                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
428                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
429
430                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
431                                                 RESIDUAL_RESULT(xmm7);
432                                         }
433                                 }
434                                 else { /* order == 5 */
435                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
436                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
437                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
438                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
439
440                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
441                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
442
443                                         for(i = 0; i < (int)data_len; i++) {
444                                                 //sum = 0;
445                                                 //sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
446                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
447                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
448
449                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
450                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
451                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
452                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
453                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
454                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
455
456                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
457                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
458                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
459                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
460                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
461                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
462
463                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
464                                                 RESIDUAL_RESULT(xmm7);
465                                         }
466                                 }
467                         }
468                 }
469                 else { /* order == 1, 2, 3, 4 */
470                         if(order > 2) { /* order == 3, 4 */
471                                 if(order == 4) {
472                                         __m128i xmm0, xmm1, xmm6, xmm7;
473                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
474                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
475
476                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
477                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
478
479                                         for(i = 0; i < (int)data_len; i++) {
480                                                 //sum = 0;
481                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
482                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
483                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
484                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
485                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
486
487                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
488                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
489                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
490                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
491                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
492                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
493
494                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
495                                                 RESIDUAL_RESULT(xmm7);
496                                         }
497                                 }
498                                 else { /* order == 3 */
499                                         __m128i xmm0, xmm1, xmm6, xmm7;
500                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
501                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
502
503                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
504
505                                         for(i = 0; i < (int)data_len; i++) {
506                                                 //sum = 0;
507                                                 //sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
508                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
509                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
510
511                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
512                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
513                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
514                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
515                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
516                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
517
518                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
519                                                 RESIDUAL_RESULT(xmm7);
520                                         }
521                                 }
522                         }
523                         else { /* order == 1, 2 */
524                                 if(order == 2) {
525                                         __m128i xmm0, xmm7;
526                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
527                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
528
529                                         for(i = 0; i < (int)data_len; i++) {
530                                                 //sum = 0;
531                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
532                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
533                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
534                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
535                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
536
537                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
538                                                 RESIDUAL_RESULT(xmm7);
539                                         }
540                                 }
541                                 else { /* order == 1 */
542                                         __m128i xmm0, xmm7;
543                                         xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
544
545                                         for(i = 0; i < (int)data_len; i++) {
546                                                 //sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
547                                                 xmm7 = _mm_cvtsi32_si128(data[i-1]);
548                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
549                                                 RESIDUAL_RESULT(xmm7);
550                                         }
551                                 }
552                         }
553                 }
554         }
555         else { /* order > 12 */
556                 FLAC__int64 sum;
557                 for(i = 0; i < (int)data_len; i++) {
558                         sum = 0;
559                         switch(order) {
560                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
561                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
562                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
563                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
564                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
565                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
566                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
567                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
568                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
569                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
570                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
571                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
572                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
573                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
574                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
575                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
576                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
577                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
578                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
579                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
580                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
581                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
582                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
583                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
584                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
585                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
586                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
587                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
588                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
589                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
590                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
591                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
592                         }
593                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
594                 }
595         }
596 }
597
598 FLAC__SSE_TARGET("sse4.1")
599 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
600 {
601         int i;
602 #ifdef FLAC__CPU_IA32
603         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
604 #endif
605
606         FLAC__ASSERT(order > 0);
607         FLAC__ASSERT(order <= 32);
608         FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_srai_epi64() so we have to use _mm_srli_epi64() */
609
610         if(order <= 12) {
611                 if(order > 8) { /* order == 9, 10, 11, 12 */
612                         if(order > 10) { /* order == 11, 12 */
613                                 if(order == 12) {
614                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
615                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
616                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
617                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
618                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
619                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
620                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
621
622                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
623                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
624                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
625                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
626                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
627                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
628
629                                         for(i = 0; i < (int)data_len; i++) {
630                                                 //sum = 0;
631                                                 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
632                                                 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
633                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
634                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
635                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
636
637                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
638                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
639                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
640                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
641                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
642                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
643
644                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
645                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
646                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
647                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
648                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
649                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
650
651                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
652                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
653                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
654                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
655                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
656                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
657
658                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
659                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
660                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
661                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
662                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
663                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
664
665                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
666                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
667                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
668                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
669                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
670                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
671
672                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
673                                                 DATA_RESULT1(xmm7);
674                                         }
675                                 }
676                                 else { /* order == 11 */
677                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
678                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
679                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
680                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
681                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
682                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
683                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
684
685                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
686                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
687                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
688                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
689                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
690
691                                         for(i = 0; i < (int)data_len; i++) {
692                                                 //sum = 0;
693                                                 //sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
694                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
695                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
696
697                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
698                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
699                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
700                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
701                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
702                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
703
704                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
705                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
706                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
707                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
708                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
709                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
710
711                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
712                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
713                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
714                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
715                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
716                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
717
718                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
719                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
720                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
721                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
722                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
723                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
724
725                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
726                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
727                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
728                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
729                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
730                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
731
732                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
733                                                 DATA_RESULT1(xmm7);
734                                         }
735                                 }
736                         }
737                         else { /* order == 9, 10 */
738                                 if(order == 10) {
739                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
740                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
741                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
742                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
743                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
744                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
745
746                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
747                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
748                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
749                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
750                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
751
752                                         for(i = 0; i < (int)data_len; i++) {
753                                                 //sum = 0;
754                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
755                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
756                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
757                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
758                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
759
760                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
761                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
762                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
763                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
764                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
765                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
766
767                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
768                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
769                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
770                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
771                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
772                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
773
774                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
775                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
776                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
777                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
778                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
779                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
780
781                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
782                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
783                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
784                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
785                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
786                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
787
788                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
789                                                 DATA_RESULT(xmm7);
790                                         }
791                                 }
792                                 else { /* order == 9 */
793                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
794                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
795                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
796                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
797                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
798                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
799
800                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
801                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
802                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
803                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
804
805                                         for(i = 0; i < (int)data_len; i++) {
806                                                 //sum = 0;
807                                                 //sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
808                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
809                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
810
811                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
812                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
813                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
814                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
815                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
816                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
817
818                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
819                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
820                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
821                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
822                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
823                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
824
825                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
826                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
827                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
828                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
829                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
830                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
831
832                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
833                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
834                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
835                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
836                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
837                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
838
                                                /* fold the two 64-bit lane sums into lane 0 and emit the sample.
                                                   NOTE(review): DATA_RESULT is a macro defined earlier in this
                                                   file; the scalar tail at the bottom of this function suggests it
                                                   applies residual[i] + (sum >> lp_quantization) -- confirm at the
                                                   macro definition. */
839                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
840                                                 DATA_RESULT(xmm7);
841                                         }
842                                 }
843                         }
844                 }
                /* Each unrolled path below keeps 32-bit coefficients and history
                   samples in the even SSE lanes so _mm_mul_epi32 (signed 32x32 ->
                   64-bit multiply of lanes 0 and 2 only) forms exact 64-bit products;
                   two 64-bit partial sums are carried per register and folded with a
                   final shift-and-add before each sample is emitted. */
845                 else if(order > 4) { /* order == 5, 6, 7, 8 */
846                         if(order > 6) { /* order == 7, 8 */
847                                 if(order == 8) {
848                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
849                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
850                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
851                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
852                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
853
                                        /* spread each coefficient pair [c0 c1 0 0] to even lanes: [c0 0 c1 0] */
854                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
855                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
856                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
857                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
858
859                                         for(i = 0; i < (int)data_len; i++) {
860                                                 //sum = 0;
861                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
862                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
                                                /* load [data[i-8] data[i-7]] and swap into even lanes
                                                   ([data[i-7] 0 data[i-8] 0]) so they line up with xmm3 */
863                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
864                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
865                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
866
867                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
868                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
869                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
870                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
871                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
872                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
873
874                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
875                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
876                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
877                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
878                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
879                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
880
881                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
882                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
883                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
884                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
885                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
886                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
887
888                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
889                                                 DATA_RESULT(xmm7);
890                                         }
891                                 }
892                                 else { /* order == 7 */
893                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
894                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
895                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
896                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
                                        /* odd order: the highest (7th) coefficient stays scalar in lane 0 */
897                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
898
899                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
900                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
901                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
902
903                                         for(i = 0; i < (int)data_len; i++) {
904                                                 //sum = 0;
905                                                 //sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
906                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
907                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
908
909                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
910                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
911                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
912                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
913                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
914                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
915
916                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
917                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
918                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
919                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
920                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
921                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
922
923                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
924                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
925                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
926                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
927                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
928                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
929
930                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
931                                                 DATA_RESULT(xmm7);
932                                         }
933                                 }
934                         }
935                         else { /* order == 5, 6 */
936                                 if(order == 6) {
937                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
938                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
939                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
940                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
941
942                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
943                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
944                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
945
946                                         for(i = 0; i < (int)data_len; i++) {
947                                                 //sum = 0;
948                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
949                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
950                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
951                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
952                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
953
954                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
955                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
956                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
957                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
958                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
959                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
960
961                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
962                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
963                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
964                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
965                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
966                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
967
968                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
969                                                 DATA_RESULT(xmm7);
970                                         }
971                                 }
972                                 else { /* order == 5 */
973                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
974                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
975                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
976                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
977
978                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
979                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
980
981                                         for(i = 0; i < (int)data_len; i++) {
982                                                 //sum = 0;
983                                                 //sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
984                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
985                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
986
987                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
988                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
989                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
990                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
991                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
992                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
993
994                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
995                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
996                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
997                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
998                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
999                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
1000
1001                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1002                                                 DATA_RESULT(xmm7);
1003                                         }
1004                                 }
1005                         }
1006                 }
1007                 else { /* order == 1, 2, 3, 4 */
1008                         if(order > 2) { /* order == 3, 4 */
1009                                 if(order == 4) {
1010                                         __m128i xmm0, xmm1, xmm6, xmm7;
1011                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1012                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1013
1014                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1015                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1016
1017                                         for(i = 0; i < (int)data_len; i++) {
1018                                                 //sum = 0;
1019                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
1020                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
1021                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1022                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1023                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
1024
1025                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1026                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1027                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1028                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1029                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
1030                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
1031
1032                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1033                                                 DATA_RESULT(xmm7);
1034                                         }
1035                                 }
1036                                 else { /* order == 3 */
1037                                         __m128i xmm0, xmm1, xmm6, xmm7;
1038                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1039                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1040
1041                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1042
1043                                         for(i = 0; i < (int)data_len; i++) {
1044                                                 //sum = 0;
1045                                                 //sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
1046                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1047                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
1048
1049                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1050                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1051                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1052                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1053                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
1054                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
1055
1056                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1057                                                 DATA_RESULT(xmm7);
1058                                         }
1059                                 }
1060                         }
1061                         else { /* order == 1, 2 */
1062                                 if(order == 2) {
1063                                         __m128i xmm0, xmm7;
1064                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1065                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1066
1067                                         for(i = 0; i < (int)data_len; i++) {
1068                                                 //sum = 0;
1069                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1070                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1071                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1072                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1073                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
1074
1075                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1076                                                 DATA_RESULT(xmm7);
1077                                         }
1078                                 }
1079                                 else { /* order == 1 */
1080                                         __m128i xmm0, xmm7;
1081                                         xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
1082
1083                                         for(i = 0; i < (int)data_len; i++) {
1084                                                 //sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
1085                                                 xmm7 = _mm_cvtsi32_si128(data[i-1]);
1086                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
1087                                                 DATA_RESULT(xmm7);
1088                                         }
1089                                 }
1090                         }
1091                 }
1092         }
1093         else { /* order > 12 */
1094                 FLAC__int64 sum;
1095                 for(i = 0; i < (int)data_len; i++) {
1096                         sum = 0;
                        /* cases intentionally fall through: each order adds one more
                           tap, then continues into the shared 12-tap tail below */
1097                         switch(order) {
1098                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
1099                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
1100                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
1101                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
1102                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
1103                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
1104                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
1105                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
1106                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
1107                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
1108                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
1109                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
1110                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
1111                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
1112                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
1113                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
1114                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
1115                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
1116                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
1117                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
1118                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1119                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1120                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1121                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1122                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1123                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1124                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1125                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1126                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1127                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1128                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1129                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1130                         }
                        /* reconstruct the sample: residual plus the quantized prediction */
1131                         data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1132                 }
1133         }
1134 }
1135
1136 #endif /* FLAC__SSE4_1_SUPPORTED */
1137 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1138 #endif /* FLAC__NO_ASM */
1139 #endif /* FLAC__INTEGER_ONLY_LIBRARY */