Improve x86 intrinsic implementation.
[flac.git] / src / libFLAC / lpc_intrin_sse41.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2013  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #if HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #include "share/compat.h"
38
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
42 #ifdef FLAC__SSE4_SUPPORTED
43
44 #include "FLAC/assert.h"
45 #include "FLAC/format.h"
46 #include "private/lpc.h"
47
48 #include <smmintrin.h> /* SSE4.1 */
49
/*
 * RESIDUAL_RESULT(xmmN): extract the low 64-bit lane of xmmN (the accumulated
 * 64-bit prediction sum), arithmetic-shift it right by lp_quantization, and
 * subtract from data[i] to produce residual[i].
 * DATA_RESULT(xmmN): the inverse operation — add the shifted prediction to
 * residual[i] to reconstruct data[i].
 * Both macros rely on `i`, `data`, `residual` and `lp_quantization` being in
 * scope at the expansion site.
 */
#ifdef FLAC__CPU_IA32
#if defined _MSC_VER || defined __INTEL_COMPILER
/* MSVC/ICC on 32-bit x86: __m128i is a union, so the low 64-bit lane can be
 * read directly via the m128i_i64[0] member — no store needed. */
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(xmmN.m128i_i64[0] >> lp_quantization);
#define     DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(xmmN.m128i_i64[0] >> lp_quantization);
#else
/* GCC-style compilers on 32-bit x86 lack _mm_cvtsi128_si64 (a 64-bit-only
 * intrinsic), so spill the low lane to memory with MOVQ and read it back. */
#define RESIDUAL_RESULT(xmmN) { \
        FLAC__int64 tmp[2]; \
        _mm_storel_epi64((__m128i *)tmp, xmmN); \
        residual[i] = data[i] - (FLAC__int32)(tmp[0] >> lp_quantization); \
        }
#define DATA_RESULT(xmmN) { \
        FLAC__int64 tmp[2]; \
        _mm_storel_epi64((__m128i *)tmp, xmmN); \
        data[i] = residual[i] + (FLAC__int32)(tmp[0] >> lp_quantization); \
        }
#endif
#else
/* x86-64: _mm_cvtsi128_si64 moves the low 64-bit lane straight into a GPR. */
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define     DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#endif
70
71 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
72 {
73         int i;
74
75         FLAC__ASSERT(order > 0);
76         FLAC__ASSERT(order <= 32);
77
78         if(order <= 12) {
79                 if(order > 8) { /* order == 9, 10, 11, 12 */
80                         if(order > 10) { /* order == 11, 12 */
81                                 if(order == 12) {
82                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
83                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
84                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
85                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
86                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
87                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
88                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
89
90                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
91                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
92                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
93                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
94                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
95                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
96
97                                         for(i = 0; i < (int)data_len; i++) {
98                                                 //sum = 0;
99                                                 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
100                                                 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
101                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
102                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
103                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
104
105                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
106                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
107                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
108                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
109                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
110                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
111
112                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
113                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
114                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
115                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
116                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
117                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
118
119                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
120                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
121                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
122                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
123                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
124                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
125
126                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
127                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
128                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
129                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
130                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
131                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
132
133                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
134                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
135                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
136                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
137                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
138                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
139
140                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
141                                                 RESIDUAL_RESULT(xmm7);
142                                         }
143                                 }
144                                 else { /* order == 11 */
145                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
146                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
147                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
148                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
149                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
150                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
151                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
152
153                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
154                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
155                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
156                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
157                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
158
159                                         for(i = 0; i < (int)data_len; i++) {
160                                                 //sum = 0;
161                                                 //sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
162                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
163                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
164
165                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
166                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
167                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
168                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
169                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
170                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
171
172                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
173                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
174                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
175                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
176                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
177                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
178
179                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
180                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
181                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
182                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
183                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
184                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
185
186                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
187                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
188                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
189                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
190                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
191                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
192
193                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
194                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
195                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
196                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
197                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
198                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
199
200                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
201                                                 RESIDUAL_RESULT(xmm7);
202                                         }
203                                 }
204                         }
205                         else { /* order == 9, 10 */
206                                 if(order == 10) {
207                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
208                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
209                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
210                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
211                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
212                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
213
214                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
215                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
216                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
217                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
218                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
219
220                                         for(i = 0; i < (int)data_len; i++) {
221                                                 //sum = 0;
222                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
223                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
224                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
225                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
226                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
227
228                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
229                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
230                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
231                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
232                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
233                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
234
235                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
236                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
237                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
238                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
239                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
240                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
241
242                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
243                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
244                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
245                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
246                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
247                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
248
249                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
250                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
251                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
252                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
253                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
254                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
255
256                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
257                                                 RESIDUAL_RESULT(xmm7);
258                                         }
259                                 }
260                                 else { /* order == 9 */
261                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
262                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
263                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
264                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
265                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
266                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
267
268                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
269                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
270                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
271                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
272
273                                         for(i = 0; i < (int)data_len; i++) {
274                                                 //sum = 0;
275                                                 //sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
276                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
277                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
278
279                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
280                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
281                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
282                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
283                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
284                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
285
286                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
287                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
288                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
289                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
290                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
291                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
292
293                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
294                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
295                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
296                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
297                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
298                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
299
300                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
301                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
302                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
303                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
304                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
305                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
306
307                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
308                                                 RESIDUAL_RESULT(xmm7);
309                                         }
310                                 }
311                         }
312                 }
313                 else if(order > 4) { /* order == 5, 6, 7, 8 */
314                         if(order > 6) { /* order == 7, 8 */
315                                 if(order == 8) {
316                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
317                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
318                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
319                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
320                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
321
322                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
323                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
324                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
325                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
326
327                                         for(i = 0; i < (int)data_len; i++) {
328                                                 //sum = 0;
329                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
330                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
331                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
332                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
333                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
334
335                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
336                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
337                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
338                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
339                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
340                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
341
342                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
343                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
344                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
345                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
346                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
347                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
348
349                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
350                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
351                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
352                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
353                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
354                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
355
356                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
357                                                 RESIDUAL_RESULT(xmm7);
358                                         }
359                                 }
360                                 else { /* order == 7 */
361                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
362                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
363                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
364                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
365                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
366
367                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
368                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
369                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
370
371                                         for(i = 0; i < (int)data_len; i++) {
372                                                 //sum = 0;
373                                                 //sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
374                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
375                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
376
377                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
378                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
379                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
380                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
381                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
382                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
383
384                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
385                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
386                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
387                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
388                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
389                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
390
391                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
392                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
393                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
394                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
395                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
396                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
397
398                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
399                                                 RESIDUAL_RESULT(xmm7);
400                                         }
401                                 }
402                         }
403                         else { /* order == 5, 6 */
404                                 if(order == 6) {
405                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
406                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
407                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
408                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
409
410                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
411                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
412                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
413
414                                         for(i = 0; i < (int)data_len; i++) {
415                                                 //sum = 0;
416                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
417                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
418                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
419                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
420                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
421
422                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
423                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
424                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
425                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
426                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
427                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
428
429                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
430                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
431                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
432                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
433                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
434                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
435
436                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
437                                                 RESIDUAL_RESULT(xmm7);
438                                         }
439                                 }
440                                 else { /* order == 5 */
441                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
442                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
443                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
444                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
445
446                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
447                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
448
449                                         for(i = 0; i < (int)data_len; i++) {
450                                                 //sum = 0;
451                                                 //sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
452                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
453                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
454
455                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
456                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
457                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
458                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
459                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
460                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
461
462                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
463                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
464                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
465                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
466                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
467                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
468
469                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
470                                                 RESIDUAL_RESULT(xmm7);
471                                         }
472                                 }
473                         }
474                 }
475                 else { /* order == 1, 2, 3, 4 */
476                         if(order > 2) { /* order == 3, 4 */
477                                 if(order == 4) {
478                                         __m128i xmm0, xmm1, xmm6, xmm7;
479                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
480                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
481
482                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
483                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
484
485                                         for(i = 0; i < (int)data_len; i++) {
486                                                 //sum = 0;
487                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
488                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
489                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
490                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
491                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
492
493                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
494                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
495                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
496                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
497                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
498                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
499
500                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
501                                                 RESIDUAL_RESULT(xmm7);
502                                         }
503                                 }
504                                 else { /* order == 3 */
505                                         __m128i xmm0, xmm1, xmm6, xmm7;
506                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
507                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
508
509                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
510
511                                         for(i = 0; i < (int)data_len; i++) {
512                                                 //sum = 0;
513                                                 //sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
514                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
515                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
516
517                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
518                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
519                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
520                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
521                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
522                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
523
524                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
525                                                 RESIDUAL_RESULT(xmm7);
526                                         }
527                                 }
528                         }
529                         else { /* order == 1, 2 */
530                                 if(order == 2) {
531                                         __m128i xmm0, xmm7;
532                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
533                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
534
535                                         for(i = 0; i < (int)data_len; i++) {
536                                                 //sum = 0;
537                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
538                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
539                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
540                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
541                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
542
543                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
544                                                 RESIDUAL_RESULT(xmm7);
545                                         }
546                                 }
547                                 else { /* order == 1 */
548                                         for(i = 0; i < (int)data_len; i++)
549                                                 residual[i] = data[i] - (FLAC__int32)((qlp_coeff[0] * (FLAC__int64)data[i-1]) >> lp_quantization);
550                                 }
551                         }
552                 }
553         }
554         else { /* order > 12 */
555                 FLAC__int64 sum;
556                 for(i = 0; i < (int)data_len; i++) {
557                         sum = 0;
558                         switch(order) {
559                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
560                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
561                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
562                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
563                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
564                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
565                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
566                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
567                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
568                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
569                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
570                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
571                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
572                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
573                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
574                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
575                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
576                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
577                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
578                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
579                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
580                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
581                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
582                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
583                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
584                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
585                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
586                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
587                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
588                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
589                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
590                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
591                         }
592                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
593                 }
594         }
595 }
596
597 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
598 {
599         int i;
600
601         FLAC__ASSERT(order > 0);
602         FLAC__ASSERT(order <= 32);
603
604         if(order <= 12) {
605                 if(order > 8) { /* order == 9, 10, 11, 12 */
606                         if(order > 10) { /* order == 11, 12 */
607                                 if(order == 12) {
608                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
609                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
610                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
611                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
612                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
613                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
614                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
615
616                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
617                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
618                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
619                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
620                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
621                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
622
623                                         for(i = 0; i < (int)data_len; i++) {
624                                                 //sum = 0;
625                                                 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
626                                                 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
627                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
628                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
629                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
630
631                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
632                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
633                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
634                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
635                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
636                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
637
638                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
639                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
640                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
641                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
642                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
643                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
644
645                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
646                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
647                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
648                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
649                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
650                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
651
652                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
653                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
654                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
655                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
656                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
657                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
658
659                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
660                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
661                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
662                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
663                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
664                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
665
666                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
667                                                 DATA_RESULT(xmm7);
668                                         }
669                                 }
670                                 else { /* order == 11 */
671                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
672                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
673                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
674                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
675                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
676                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
677                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
678
679                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
680                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
681                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
682                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
683                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
684
685                                         for(i = 0; i < (int)data_len; i++) {
686                                                 //sum = 0;
687                                                 //sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
688                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
689                                                 xmm7 = _mm_mul_epi32(xmm7, xmm5);
690
691                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
692                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
693                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
694                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
695                                                 xmm6 = _mm_mul_epi32(xmm6, xmm4);
696                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
697
698                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
699                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
700                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
701                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
702                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
703                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
704
705                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
706                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
707                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
708                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
709                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
710                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
711
712                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
713                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
714                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
715                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
716                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
717                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
718
719                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
720                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
721                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
722                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
723                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
724                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
725
726                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
727                                                 DATA_RESULT(xmm7);
728                                         }
729                                 }
730                         }
731                         else { /* order == 9, 10 */
732                                 if(order == 10) {
733                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
734                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
735                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
736                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
737                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
738                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
739
740                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
741                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
742                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
743                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
744                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
745
746                                         for(i = 0; i < (int)data_len; i++) {
747                                                 //sum = 0;
748                                                 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
749                                                 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
750                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
751                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
752                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
753
754                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
755                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
756                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
757                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
758                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
759                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
760
761                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
762                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
763                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
764                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
765                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
766                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
767
768                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
769                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
770                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
771                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
772                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
773                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
774
775                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
776                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
777                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
778                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
779                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
780                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
781
782                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
783                                                 DATA_RESULT(xmm7);
784                                         }
785                                 }
786                                 else { /* order == 9 */
787                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
788                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
789                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
790                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
791                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
792                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
793
794                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
795                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
796                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
797                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
798
799                                         for(i = 0; i < (int)data_len; i++) {
800                                                 //sum = 0;
801                                                 //sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
802                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
803                                                 xmm7 = _mm_mul_epi32(xmm7, xmm4);
804
805                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
806                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
807                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
808                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
809                                                 xmm6 = _mm_mul_epi32(xmm6, xmm3);
810                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
811
812                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
813                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
814                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
815                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
816                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
817                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
818
819                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
820                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
821                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
822                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
823                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
824                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
825
826                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
827                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
828                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
829                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
830                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
831                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
832
833                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
834                                                 DATA_RESULT(xmm7);
835                                         }
836                                 }
837                         }
838                 }
839                 else if(order > 4) { /* order == 5, 6, 7, 8 */
840                         if(order > 6) { /* order == 7, 8 */
841                                 if(order == 8) {
842                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
843                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
844                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
845                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
846                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
847
848                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
849                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
850                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
851                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
852
853                                         for(i = 0; i < (int)data_len; i++) {
854                                                 //sum = 0;
855                                                 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
856                                                 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
857                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
858                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
859                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
860
861                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
862                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
863                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
864                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
865                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
866                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
867
868                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
869                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
870                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
871                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
872                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
873                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
874
875                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
876                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
877                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
878                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
879                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
880                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
881
882                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
883                                                 DATA_RESULT(xmm7);
884                                         }
885                                 }
886                                 else { /* order == 7 */
887                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
888                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
889                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
890                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
891                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
892
893                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
894                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
895                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
896
897                                         for(i = 0; i < (int)data_len; i++) {
898                                                 //sum = 0;
899                                                 //sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
900                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
901                                                 xmm7 = _mm_mul_epi32(xmm7, xmm3);
902
903                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
904                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
905                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
906                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
907                                                 xmm6 = _mm_mul_epi32(xmm6, xmm2);
908                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
909
910                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
911                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
912                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
913                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
914                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
915                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
916
917                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
918                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
919                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
920                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
921                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
922                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
923
924                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
925                                                 DATA_RESULT(xmm7);
926                                         }
927                                 }
928                         }
929                         else { /* order == 5, 6 */
930                                 if(order == 6) {
931                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
932                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
933                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
934                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
935
936                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
937                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
938                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
939
940                                         for(i = 0; i < (int)data_len; i++) {
941                                                 //sum = 0;
942                                                 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
943                                                 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
944                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
945                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
946                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
947
948                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
949                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
950                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
951                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
952                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
953                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
954
955                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
956                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
957                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
958                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
959                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
960                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
961
962                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
963                                                 DATA_RESULT(xmm7);
964                                         }
965                                 }
966                                 else { /* order == 5 */
967                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
968                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
969                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
970                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
971
972                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
973                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
974
975                                         for(i = 0; i < (int)data_len; i++) {
976                                                 //sum = 0;
977                                                 //sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
978                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
979                                                 xmm7 = _mm_mul_epi32(xmm7, xmm2);
980
981                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
982                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
983                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
984                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
985                                                 xmm6 = _mm_mul_epi32(xmm6, xmm1);
986                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
987
988                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
989                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
990                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
991                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
992                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
993                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
994
995                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
996                                                 DATA_RESULT(xmm7);
997                                         }
998                                 }
999                         }
1000                 }
1001                 else { /* order == 1, 2, 3, 4 */
1002                         if(order > 2) { /* order == 3, 4 */
1003                                 if(order == 4) {
1004                                         __m128i xmm0, xmm1, xmm6, xmm7;
1005                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1006                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1007
1008                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1009                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1010
1011                                         for(i = 0; i < (int)data_len; i++) {
1012                                                 //sum = 0;
1013                                                 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
1014                                                 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
1015                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1016                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1017                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
1018
1019                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1020                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1021                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1022                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1023                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
1024                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
1025
1026                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1027                                                 DATA_RESULT(xmm7);
1028                                         }
1029                                 }
1030                                 else { /* order == 3 */
1031                                         __m128i xmm0, xmm1, xmm6, xmm7;
1032                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1033                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1034
1035                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1036
1037                                         for(i = 0; i < (int)data_len; i++) {
1038                                                 //sum = 0;
1039                                                 //sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
1040                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1041                                                 xmm7 = _mm_mul_epi32(xmm7, xmm1);
1042
1043                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1044                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1045                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1046                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1047                                                 xmm6 = _mm_mul_epi32(xmm6, xmm0);
1048                                                 xmm7 = _mm_add_epi64(xmm7, xmm6);
1049
1050                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1051                                                 DATA_RESULT(xmm7);
1052                                         }
1053                                 }
1054                         }
1055                         else { /* order == 1, 2 */
1056                                 if(order == 2) {
1057                                         __m128i xmm0, xmm7;
1058                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1059                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1060
1061                                         for(i = 0; i < (int)data_len; i++) {
1062                                                 //sum = 0;
1063                                                 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
1064                                                 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
1065                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1066                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1067                                                 xmm7 = _mm_mul_epi32(xmm7, xmm0);
1068
1069                                                 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
1070                                                 DATA_RESULT(xmm7);
1071                                         }
1072                                 }
1073                                 else { /* order == 1 */
1074                                         for(i = 0; i < (int)data_len; i++)
1075                                                 data[i] = residual[i] + (FLAC__int32)((qlp_coeff[0] * (FLAC__int64)data[i-1]) >> lp_quantization);
1076                                 }
1077                         }
1078                 }
1079         }
1080         else { /* order > 12 */
1081                 FLAC__int64 sum;
1082                 for(i = 0; i < (int)data_len; i++) {
1083                         sum = 0;
1084                         switch(order) {
1085                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
1086                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
1087                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
1088                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
1089                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
1090                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
1091                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
1092                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
1093                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
1094                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
1095                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
1096                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
1097                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
1098                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
1099                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
1100                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
1101                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
1102                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
1103                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
1104                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
1105                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1106                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1107                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1108                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1109                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1110                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1111                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1112                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1113                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1114                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1115                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1116                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1117                         }
1118                         data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
1119                 }
1120         }
1121 }
1122
1123 #endif /* FLAC__SSE4_SUPPORTED */
1124 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1125 #endif /* FLAC__NO_ASM */
1126 #endif /* FLAC__INTEGER_ONLY_LIBRARY */