libFLAC/cpu.c: Get rid of OS_IS_ANDROID function
[flac.git] / src / libFLAC / lpc_intrin_sse2.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2014  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #include "private/cpu.h"
38
39 #ifndef FLAC__INTEGER_ONLY_LIBRARY
40 #ifndef FLAC__NO_ASM
41 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42 #include "private/lpc.h"
43 #ifdef FLAC__SSE2_SUPPORTED
44
45 #include "FLAC/assert.h"
46 #include "FLAC/format.h"
47
48 #include <emmintrin.h> /* SSE2 */
49
/*
 * Single-sample helpers that take the prediction sum from a vector register.
 *
 * Each macro extracts the lowest 32-bit lane of xmmN with
 * _mm_cvtsi128_si32(), shifts it right by `lp_quantization`, and combines it
 * with one sample:
 *
 *   RESIDUAL16_RESULT: curr = next value read through `data` (pointer is
 *                      post-incremented); the value stored through
 *                      `residual` (also post-incremented) is
 *                      curr - (sum >> lp_quantization).
 *   DATA16_RESULT:     the reverse direction: curr = next value read through
 *                      `residual` plus (sum >> lp_quantization), then curr is
 *                      stored through `data`; both pointers are
 *                      post-incremented.
 *
 * These are not hygienic: the expansion site must have `curr`, `data`,
 * `residual` and `lp_quantization` in scope, and `data` / `residual` must be
 * modifiable pointers because the macros advance them.
 *
 * Each expands to TWO statements and already ends in ';', so it must not be
 * used as the unbraced body of an if/else/for.
 *
 * NOTE(review): `>>` is applied to a signed int; for negative sums the result
 * is implementation-defined in C. Presumably an arithmetic shift is expected,
 * matching the _mm_sra_epi32() used by the vector loops - confirm for any new
 * compiler/target.
 */
50 #define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
51 #define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
52
/*
 * Indexed variants of the helpers above: same arithmetic, but they read and
 * write element `i` of `data` / `residual` instead of advancing pointers, and
 * they do not use `curr`. The expansion site must have `i`, `data`,
 * `residual` and `lp_quantization` in scope. Each expands to one statement
 * that already carries its trailing ';'.
 */
53 #define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
54 #define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
55
56 FLAC__SSE_TARGET("sse2")
57 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
58 {
59         int i;
60         FLAC__int32 sum;
61         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
62
63         FLAC__ASSERT(order > 0);
64         FLAC__ASSERT(order <= 32);
65
66         if(order <= 12) {
67                 if(order > 8) {
68                         if(order > 10) {
69                                 if(order == 12) {
70                                         __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
71                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
72                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
73                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
74                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
75                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
76                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
77                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
78                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
79                                         q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
80                                         q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
81                                         q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
82                                         q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
83
84                                         for(i = 0; i < (int)data_len-3; i+=4) {
85                                                 __m128i summ, mull;
86                                                 summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
87                                                 mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
88                                                 mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
89                                                 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
90                                                 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
91                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
92                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
93                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
94                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
95                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
96                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
97                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
98                                                 summ = _mm_sra_epi32(summ, cnt);
99                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
100                                         }
101                                 }
102                                 else { /* order == 11 */
103                                         __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
104                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
105                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
106                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
107                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
108                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
109                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
110                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
111                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
112                                         q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
113                                         q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
114                                         q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
115
116                                         for(i = 0; i < (int)data_len-3; i+=4) {
117                                                 __m128i summ, mull;
118                                                 summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
119                                                 mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
120                                                 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
121                                                 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
122                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
123                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
124                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
125                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
126                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
127                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
128                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
129                                                 summ = _mm_sra_epi32(summ, cnt);
130                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
131                                         }
132                                 }
133                         }
134                         else {
135                                 if(order == 10) {
136                                         __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
137                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
138                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
139                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
140                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
141                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
142                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
143                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
144                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
145                                         q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
146                                         q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
147
148                                         for(i = 0; i < (int)data_len-3; i+=4) {
149                                                 __m128i summ, mull;
150                                                 summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
151                                                 mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
152                                                 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
153                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
154                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
155                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
156                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
157                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
158                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
159                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
160                                                 summ = _mm_sra_epi32(summ, cnt);
161                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
162                                         }
163                                 }
164                                 else { /* order == 9 */
165                                         __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
166                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
167                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
168                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
169                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
170                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
171                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
172                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
173                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
174                                         q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
175
176                                         for(i = 0; i < (int)data_len-3; i+=4) {
177                                                 __m128i summ, mull;
178                                                 summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
179                                                 mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
180                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
181                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
182                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
183                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
184                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
185                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
186                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
187                                                 summ = _mm_sra_epi32(summ, cnt);
188                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
189                                         }
190                                 }
191                         }
192                 }
193                 else if(order > 4) {
194                         if(order > 6) {
195                                 if(order == 8) {
196                                         __m128i q0, q1, q2, q3, q4, q5, q6, q7;
197                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
198                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
199                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
200                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
201                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
202                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
203                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
204                                         q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
205
206                                         for(i = 0; i < (int)data_len-3; i+=4) {
207                                                 __m128i summ, mull;
208                                                 summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
209                                                 mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
210                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
211                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
212                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
213                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
214                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
215                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
216                                                 summ = _mm_sra_epi32(summ, cnt);
217                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
218                                         }
219                                 }
220                                 else { /* order == 7 */
221                                         __m128i q0, q1, q2, q3, q4, q5, q6;
222                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
223                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
224                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
225                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
226                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
227                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
228                                         q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
229
230                                         for(i = 0; i < (int)data_len-3; i+=4) {
231                                                 __m128i summ, mull;
232                                                 summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
233                                                 mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
234                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
235                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
236                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
237                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
238                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
239                                                 summ = _mm_sra_epi32(summ, cnt);
240                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
241                                         }
242                                 }
243                         }
244                         else {
245                                 if(order == 6) {
246                                         __m128i q0, q1, q2, q3, q4, q5;
247                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
248                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
249                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
250                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
251                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
252                                         q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
253
254                                         for(i = 0; i < (int)data_len-3; i+=4) {
255                                                 __m128i summ, mull;
256                                                 summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
257                                                 mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
258                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
259                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
260                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
261                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
262                                                 summ = _mm_sra_epi32(summ, cnt);
263                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
264                                         }
265                                 }
266                                 else { /* order == 5 */
267                                         __m128i q0, q1, q2, q3, q4;
268                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
269                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
270                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
271                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
272                                         q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
273
274                                         for(i = 0; i < (int)data_len-3; i+=4) {
275                                                 __m128i summ, mull;
276                                                 summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
277                                                 mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
278                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
279                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
280                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
281                                                 summ = _mm_sra_epi32(summ, cnt);
282                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
283                                         }
284                                 }
285                         }
286                 }
287                 else {
288                         if(order > 2) {
289                                 if(order == 4) {
290                                         __m128i q0, q1, q2, q3;
291                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
292                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
293                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
294                                         q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
295
296                                         for(i = 0; i < (int)data_len-3; i+=4) {
297                                                 __m128i summ, mull;
298                                                 summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
299                                                 mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
300                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
301                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
302                                                 summ = _mm_sra_epi32(summ, cnt);
303                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
304                                         }
305                                 }
306                                 else { /* order == 3 */
307                                         __m128i q0, q1, q2;
308                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
309                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
310                                         q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
311
312                                         for(i = 0; i < (int)data_len-3; i+=4) {
313                                                 __m128i summ, mull;
314                                                 summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
315                                                 mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
316                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
317                                                 summ = _mm_sra_epi32(summ, cnt);
318                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
319                                         }
320                                 }
321                         }
322                         else {
323                                 if(order == 2) {
324                                         __m128i q0, q1;
325                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
326                                         q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
327
328                                         for(i = 0; i < (int)data_len-3; i+=4) {
329                                                 __m128i summ, mull;
330                                                 summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
331                                                 mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
332                                                 summ = _mm_sra_epi32(summ, cnt);
333                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
334                                         }
335                                 }
336                                 else { /* order == 1 */
337                                         __m128i q0;
338                                         q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
339
340                                         for(i = 0; i < (int)data_len-3; i+=4) {
341                                                 __m128i summ;
342                                                 summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
343                                                 summ = _mm_sra_epi32(summ, cnt);
344                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
345                                         }
346                                 }
347                         }
348                 }
349                 for(; i < (int)data_len; i++) {
350                         sum = 0;
351                         switch(order) {
352                                 case 12: sum += qlp_coeff[11] * data[i-12];
353                                 case 11: sum += qlp_coeff[10] * data[i-11];
354                                 case 10: sum += qlp_coeff[ 9] * data[i-10];
355                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9];
356                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8];
357                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7];
358                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6];
359                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5];
360                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4];
361                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3];
362                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2];
363                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
364                         }
365                         residual[i] = data[i] - (sum >> lp_quantization);
366                 }
367         }
368         else { /* order > 12 */
369                 for(i = 0; i < (int)data_len; i++) {
370                         sum = 0;
371                         switch(order) {
372                                 case 32: sum += qlp_coeff[31] * data[i-32];
373                                 case 31: sum += qlp_coeff[30] * data[i-31];
374                                 case 30: sum += qlp_coeff[29] * data[i-30];
375                                 case 29: sum += qlp_coeff[28] * data[i-29];
376                                 case 28: sum += qlp_coeff[27] * data[i-28];
377                                 case 27: sum += qlp_coeff[26] * data[i-27];
378                                 case 26: sum += qlp_coeff[25] * data[i-26];
379                                 case 25: sum += qlp_coeff[24] * data[i-25];
380                                 case 24: sum += qlp_coeff[23] * data[i-24];
381                                 case 23: sum += qlp_coeff[22] * data[i-23];
382                                 case 22: sum += qlp_coeff[21] * data[i-22];
383                                 case 21: sum += qlp_coeff[20] * data[i-21];
384                                 case 20: sum += qlp_coeff[19] * data[i-20];
385                                 case 19: sum += qlp_coeff[18] * data[i-19];
386                                 case 18: sum += qlp_coeff[17] * data[i-18];
387                                 case 17: sum += qlp_coeff[16] * data[i-17];
388                                 case 16: sum += qlp_coeff[15] * data[i-16];
389                                 case 15: sum += qlp_coeff[14] * data[i-15];
390                                 case 14: sum += qlp_coeff[13] * data[i-14];
391                                 case 13: sum += qlp_coeff[12] * data[i-13];
392                                          sum += qlp_coeff[11] * data[i-12];
393                                          sum += qlp_coeff[10] * data[i-11];
394                                          sum += qlp_coeff[ 9] * data[i-10];
395                                          sum += qlp_coeff[ 8] * data[i- 9];
396                                          sum += qlp_coeff[ 7] * data[i- 8];
397                                          sum += qlp_coeff[ 6] * data[i- 7];
398                                          sum += qlp_coeff[ 5] * data[i- 6];
399                                          sum += qlp_coeff[ 4] * data[i- 5];
400                                          sum += qlp_coeff[ 3] * data[i- 4];
401                                          sum += qlp_coeff[ 2] * data[i- 3];
402                                          sum += qlp_coeff[ 1] * data[i- 2];
403                                          sum += qlp_coeff[ 0] * data[i- 1];
404                         }
405                         residual[i] = data[i] - (sum >> lp_quantization);
406                 }
407         }
408 }
409
410 FLAC__SSE_TARGET("sse2")
411 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
412 {
413         int i;
414
415         FLAC__ASSERT(order > 0);
416         FLAC__ASSERT(order <= 32);
417
418         if(order <= 12) {
419                 if(order > 8) { /* order == 9, 10, 11, 12 */
420                         if(order > 10) { /* order == 11, 12 */
421                                 if(order == 12) {
422                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
423                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
424                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
425                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
426                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
427                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
428                                         xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
429
430                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
431                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
432                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
433                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
434                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
435                                         xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
436
437                                         for(i = 0; i < (int)data_len; i++) {
438                                                 //sum = 0;
439                                                 //sum += qlp_coeff[11] * data[i-12];
440                                                 //sum += qlp_coeff[10] * data[i-11];
441                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
442                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
443                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
444
445                                                 //sum += qlp_coeff[9] * data[i-10];
446                                                 //sum += qlp_coeff[8] * data[i-9];
447                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
448                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
449                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
450                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
451
452                                                 //sum += qlp_coeff[7] * data[i-8];
453                                                 //sum += qlp_coeff[6] * data[i-7];
454                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
455                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
456                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
457                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
458
459                                                 //sum += qlp_coeff[5] * data[i-6];
460                                                 //sum += qlp_coeff[4] * data[i-5];
461                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
462                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
463                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
464                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
465
466                                                 //sum += qlp_coeff[3] * data[i-4];
467                                                 //sum += qlp_coeff[2] * data[i-3];
468                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
469                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
470                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
471                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
472
473                                                 //sum += qlp_coeff[1] * data[i-2];
474                                                 //sum += qlp_coeff[0] * data[i-1];
475                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
476                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
477                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
478                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
479
480                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
481                                                 RESIDUAL32_RESULT(xmm7);
482                                         }
483                                 }
484                                 else { /* order == 11 */
485                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
486                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
487                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
488                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
489                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
490                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
491                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
492
493                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
494                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
495                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
496                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
497                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
498
499                                         for(i = 0; i < (int)data_len; i++) {
500                                                 //sum = 0;
501                                                 //sum  = qlp_coeff[10] * data[i-11];
502                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
503                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
504
505                                                 //sum += qlp_coeff[9] * data[i-10];
506                                                 //sum += qlp_coeff[8] * data[i-9];
507                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
508                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
509                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
510                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
511
512                                                 //sum += qlp_coeff[7] * data[i-8];
513                                                 //sum += qlp_coeff[6] * data[i-7];
514                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
515                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
516                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
517                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
518
519                                                 //sum += qlp_coeff[5] * data[i-6];
520                                                 //sum += qlp_coeff[4] * data[i-5];
521                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
522                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
523                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
524                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
525
526                                                 //sum += qlp_coeff[3] * data[i-4];
527                                                 //sum += qlp_coeff[2] * data[i-3];
528                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
529                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
530                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
531                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
532
533                                                 //sum += qlp_coeff[1] * data[i-2];
534                                                 //sum += qlp_coeff[0] * data[i-1];
535                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
536                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
537                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
538                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
539
540                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
541                                                 RESIDUAL32_RESULT(xmm7);
542                                         }
543                                 }
544                         }
545                         else { /* order == 9, 10 */
546                                 if(order == 10) {
547                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
548                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
549                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
550                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
551                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
552                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
553
554                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
555                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
556                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
557                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
558                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
559
560                                         for(i = 0; i < (int)data_len; i++) {
561                                                 //sum = 0;
562                                                 //sum += qlp_coeff[9] * data[i-10];
563                                                 //sum += qlp_coeff[8] * data[i-9];
564                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
565                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
566                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
567
568                                                 //sum += qlp_coeff[7] * data[i-8];
569                                                 //sum += qlp_coeff[6] * data[i-7];
570                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
571                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
572                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
573                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
574
575                                                 //sum += qlp_coeff[5] * data[i-6];
576                                                 //sum += qlp_coeff[4] * data[i-5];
577                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
578                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
579                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
580                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
581
582                                                 //sum += qlp_coeff[3] * data[i-4];
583                                                 //sum += qlp_coeff[2] * data[i-3];
584                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
585                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
586                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
587                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
588
589                                                 //sum += qlp_coeff[1] * data[i-2];
590                                                 //sum += qlp_coeff[0] * data[i-1];
591                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
592                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
593                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
594                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
595
596                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
597                                                 RESIDUAL32_RESULT(xmm7);
598                                         }
599                                 }
600                                 else { /* order == 9 */
601                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
602                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
603                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
604                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
605                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
606                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
607
608                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
609                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
610                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
611                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
612
613                                         for(i = 0; i < (int)data_len; i++) {
614                                                 //sum = 0;
615                                                 //sum  = qlp_coeff[8] * data[i-9];
616                                                 xmm7 = _mm_cvtsi32_si128(data[i-9]);
617                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
618
619                                                 //sum += qlp_coeff[7] * data[i-8];
620                                                 //sum += qlp_coeff[6] * data[i-7];
621                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
622                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
623                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
624                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
625
626                                                 //sum += qlp_coeff[5] * data[i-6];
627                                                 //sum += qlp_coeff[4] * data[i-5];
628                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
629                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
630                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
631                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
632
633                                                 //sum += qlp_coeff[3] * data[i-4];
634                                                 //sum += qlp_coeff[2] * data[i-3];
635                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
636                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
637                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
638                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
639
640                                                 //sum += qlp_coeff[1] * data[i-2];
641                                                 //sum += qlp_coeff[0] * data[i-1];
642                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
643                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
644                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
645                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
646
647                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
648                                                 RESIDUAL32_RESULT(xmm7);
649                                         }
650                                 }
651                         }
652                 }
653                 else if(order > 4) { /* order == 5, 6, 7, 8 */
654                         if(order > 6) { /* order == 7, 8 */
655                                 if(order == 8) {
656                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
657                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
658                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
659                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
660                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
661
662                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
663                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
664                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
665                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
666
667                                         for(i = 0; i < (int)data_len; i++) {
668                                                 //sum = 0;
669                                                 //sum += qlp_coeff[7] * data[i-8];
670                                                 //sum += qlp_coeff[6] * data[i-7];
671                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
672                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
673                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
674
675                                                 //sum += qlp_coeff[5] * data[i-6];
676                                                 //sum += qlp_coeff[4] * data[i-5];
677                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
678                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
679                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
680                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
681
682                                                 //sum += qlp_coeff[3] * data[i-4];
683                                                 //sum += qlp_coeff[2] * data[i-3];
684                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
685                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
686                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
687                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
688
689                                                 //sum += qlp_coeff[1] * data[i-2];
690                                                 //sum += qlp_coeff[0] * data[i-1];
691                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
692                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
693                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
694                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
695
696                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
697                                                 RESIDUAL32_RESULT(xmm7);
698                                         }
699                                 }
700                                 else { /* order == 7 */
701                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
702                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
703                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
704                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
705                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
706
707                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
708                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
709                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
710
711                                         for(i = 0; i < (int)data_len; i++) {
712                                                 //sum = 0;
713                                                 //sum  = qlp_coeff[6] * data[i-7];
714                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
715                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
716
717                                                 //sum += qlp_coeff[5] * data[i-6];
718                                                 //sum += qlp_coeff[4] * data[i-5];
719                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
720                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
721                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
722                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
723
724                                                 //sum += qlp_coeff[3] * data[i-4];
725                                                 //sum += qlp_coeff[2] * data[i-3];
726                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
727                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
728                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
729                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
730
731                                                 //sum += qlp_coeff[1] * data[i-2];
732                                                 //sum += qlp_coeff[0] * data[i-1];
733                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
734                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
735                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
736                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
737
738                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
739                                                 RESIDUAL32_RESULT(xmm7);
740                                         }
741                                 }
742                         }
743                         else { /* order == 5, 6 */
744                                 if(order == 6) {
745                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
746                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
747                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
748                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
749
750                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
751                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
752                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
753
754                                         for(i = 0; i < (int)data_len; i++) {
755                                                 //sum = 0;
756                                                 //sum += qlp_coeff[5] * data[i-6];
757                                                 //sum += qlp_coeff[4] * data[i-5];
758                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
759                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
760                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
761
762                                                 //sum += qlp_coeff[3] * data[i-4];
763                                                 //sum += qlp_coeff[2] * data[i-3];
764                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
765                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
766                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
767                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
768
769                                                 //sum += qlp_coeff[1] * data[i-2];
770                                                 //sum += qlp_coeff[0] * data[i-1];
771                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
772                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
773                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
774                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
775
776                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
777                                                 RESIDUAL32_RESULT(xmm7);
778                                         }
779                                 }
780                                 else { /* order == 5 */
781                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
782                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
783                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
784                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
785
786                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
787                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
788
789                                         for(i = 0; i < (int)data_len; i++) {
790                                                 //sum = 0;
791                                                 //sum  = qlp_coeff[4] * data[i-5];
792                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
793                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
794
795                                                 //sum += qlp_coeff[3] * data[i-4];
796                                                 //sum += qlp_coeff[2] * data[i-3];
797                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
798                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
799                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
800                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
801
802                                                 //sum += qlp_coeff[1] * data[i-2];
803                                                 //sum += qlp_coeff[0] * data[i-1];
804                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
805                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
806                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
807                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
808
809                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
810                                                 RESIDUAL32_RESULT(xmm7);
811                                         }
812                                 }
813                         }
814                 }
815                 else { /* order == 1, 2, 3, 4 */
816                         if(order > 2) { /* order == 3, 4 */
817                                 if(order == 4) {
818                                         __m128i xmm0, xmm1, xmm6, xmm7;
819                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
820                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
821
822                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
823                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
824
825                                         for(i = 0; i < (int)data_len; i++) {
826                                                 //sum = 0;
827                                                 //sum += qlp_coeff[3] * data[i-4];
828                                                 //sum += qlp_coeff[2] * data[i-3];
829                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
830                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
831                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
832
833                                                 //sum += qlp_coeff[1] * data[i-2];
834                                                 //sum += qlp_coeff[0] * data[i-1];
835                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
836                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
837                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
838                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
839
840                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
841                                                 RESIDUAL32_RESULT(xmm7);
842                                         }
843                                 }
844                                 else { /* order == 3 */
845                                         __m128i xmm0, xmm1, xmm6, xmm7;
846                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
847                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
848
849                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
850
851                                         for(i = 0; i < (int)data_len; i++) {
852                                                 //sum = 0;
853                                                 //sum  = qlp_coeff[2] * data[i-3];
854                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
855                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
856
857                                                 //sum += qlp_coeff[1] * data[i-2];
858                                                 //sum += qlp_coeff[0] * data[i-1];
859                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
860                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
861                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
862                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
863
864                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
865                                                 RESIDUAL32_RESULT(xmm7);
866                                         }
867                                 }
868                         }
869                         else { /* order == 1, 2 */
870                                 if(order == 2) {
871                                         __m128i xmm0, xmm7;
872                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
873                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
874
875                                         for(i = 0; i < (int)data_len; i++) {
876                                                 //sum = 0;
877                                                 //sum += qlp_coeff[1] * data[i-2];
878                                                 //sum += qlp_coeff[0] * data[i-1];
879                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
880                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
881                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
882
883                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
884                                                 RESIDUAL32_RESULT(xmm7);
885                                         }
886                                 }
887                                 else { /* order == 1 */
888                                         for(i = 0; i < (int)data_len; i++)
889                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
890                                 }
891                         }
892                 }
893         }
894         else { /* order > 12 */
895                 FLAC__int32 sum;
896                 for(i = 0; i < (int)data_len; i++) {
897                         sum = 0;
898                         switch(order) {
899                                 case 32: sum += qlp_coeff[31] * data[i-32];
900                                 case 31: sum += qlp_coeff[30] * data[i-31];
901                                 case 30: sum += qlp_coeff[29] * data[i-30];
902                                 case 29: sum += qlp_coeff[28] * data[i-29];
903                                 case 28: sum += qlp_coeff[27] * data[i-28];
904                                 case 27: sum += qlp_coeff[26] * data[i-27];
905                                 case 26: sum += qlp_coeff[25] * data[i-26];
906                                 case 25: sum += qlp_coeff[24] * data[i-25];
907                                 case 24: sum += qlp_coeff[23] * data[i-24];
908                                 case 23: sum += qlp_coeff[22] * data[i-23];
909                                 case 22: sum += qlp_coeff[21] * data[i-22];
910                                 case 21: sum += qlp_coeff[20] * data[i-21];
911                                 case 20: sum += qlp_coeff[19] * data[i-20];
912                                 case 19: sum += qlp_coeff[18] * data[i-19];
913                                 case 18: sum += qlp_coeff[17] * data[i-18];
914                                 case 17: sum += qlp_coeff[16] * data[i-17];
915                                 case 16: sum += qlp_coeff[15] * data[i-16];
916                                 case 15: sum += qlp_coeff[14] * data[i-15];
917                                 case 14: sum += qlp_coeff[13] * data[i-14];
918                                 case 13: sum += qlp_coeff[12] * data[i-13];
919                                          sum += qlp_coeff[11] * data[i-12];
920                                          sum += qlp_coeff[10] * data[i-11];
921                                          sum += qlp_coeff[ 9] * data[i-10];
922                                          sum += qlp_coeff[ 8] * data[i- 9];
923                                          sum += qlp_coeff[ 7] * data[i- 8];
924                                          sum += qlp_coeff[ 6] * data[i- 7];
925                                          sum += qlp_coeff[ 5] * data[i- 6];
926                                          sum += qlp_coeff[ 4] * data[i- 5];
927                                          sum += qlp_coeff[ 3] * data[i- 4];
928                                          sum += qlp_coeff[ 2] * data[i- 3];
929                                          sum += qlp_coeff[ 1] * data[i- 2];
930                                          sum += qlp_coeff[ 0] * data[i- 1];
931                         }
932                         residual[i] = data[i] - (sum >> lp_quantization);
933                 }
934         }
935 }
936
937 #if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */
938
/*
 * SSE2 signal restoration for predictor orders 8..12 using 16-bit multiplies.
 *
 * residual        - passed through to the generic fallback; in the SIMD paths
 *                   it is not referenced directly in this function
 *                   (presumably consumed by DATA16_RESULT, which is defined
 *                   elsewhere in the file -- TODO confirm).
 * data_len        - number of samples to produce; 0 returns immediately.
 * qlp_coeff       - predictor coefficients; 8 (order == 8) or 12 (order > 8)
 *                   elements are loaded regardless of the exact order.
 * order           - predictor order; anything outside 8..12 is delegated to
 *                   FLAC__lpc_restore_signal().
 * lp_quantization - passed to the fallback; not referenced directly in the
 *                   SIMD paths (presumably used by DATA16_RESULT -- TODO
 *                   confirm).
 * data            - history is read at negative indices: data[-8..-1] for
 *                   order == 8, data[-12..-1] for order > 8.
 *
 * Coefficients and history samples are narrowed to 16 bits with
 * _mm_packs_epi32 (signed saturation) so that _mm_madd_epi16 can form eight
 * products per register.
 * NOTE(review): this assumes every sample and coefficient fits in 16 bits, as
 * the "_16" suffix suggests -- confirm against the caller.
 * NOTE(review): for order 9..11 the loads at data-12 and qlp_coeff+8 touch
 * elements beyond the nominal order; their products are zeroed, but the
 * memory must still be readable -- confirm against the caller.
 */
939 FLAC__SSE_TARGET("sse2")
940 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
941 {
            /* Orders outside 8..12 are handled by the generic routine. */
942         if (order < 8 || order > 12) {
943                 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
944                 return;
945         }
            /* Both paths below produce their first sample unconditionally, so an empty request must exit here. */
946         if (data_len == 0)
947                 return;
948
            /* Always true after the guard above. */
949         FLAC__ASSERT(order >= 8);
950         FLAC__ASSERT(order <= 12);
951
952         if(order > 8) { /* order == 9, 10, 11, 12 */
                    /* curr is never assigned in this block; it is presumably set by DATA16_RESULT
                       to the sample just produced -- TODO confirm against the macro definition. */
953                 FLAC__int32 curr;
954                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
955                 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
956                 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
957                 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
958                 switch(order)                                          /* ...and zero them out */
959                 {
                    /* A byte shift left then right by the same amount keeps only the low
                       (order - 8) 32-bit lanes; order == 12 needs no masking, hence no case. */
960                 case 9:
961                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
962                 case 10:
963                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
964                 case 11:
965                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
966                 }
                    /* Narrow to 16-bit lanes: xmm0 = coeff[0..7], xmm1 = coeff[8..11] followed by four zeros. */
967                 xmm2 = _mm_setzero_si128();
968                 xmm0 = _mm_packs_epi32(xmm0, xmm6);
969                 xmm1 = _mm_packs_epi32(xmm1, xmm2);
970
                    /* Load the 12 samples preceding data and reverse each group of four so the
                       newest sample comes first, matching the coefficient order. */
971                 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
972                 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
973                 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
974                 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
975                 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
976                 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
                    /* 16-bit lanes: xmm3 = data[-1..-8], xmm4 = data[-9..-12] followed by four zeros. */
977                 xmm4 = _mm_packs_epi32(xmm4, xmm2);
978                 xmm3 = _mm_packs_epi32(xmm3, xmm5);
979
                    /* Build a copy of the 12 coefficients moved up by one 16-bit lane across the
                       xmm7:xmm2 pair; lane 0 of xmm2 becomes zero and coeff[7] carries into lane 0 of xmm7. */
980                 xmm7 = _mm_slli_si128(xmm1, 2);
981                 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
982                 xmm2 = _mm_slli_si128(xmm0, 2);
983
984                 /* xmm0, xmm1: qlp_coeff
985                         xmm2, xmm7: qlp_coeff << 16 bit
986                         xmm3, xmm4: data */
987
                    /* First sample: multiply-add pairs of 16-bit lanes, then fold the four
                       32-bit partial sums so that lane 0 holds the full dot product. */
988                 xmm5 = _mm_madd_epi16(xmm4, xmm1);
989                 xmm6 = _mm_madd_epi16(xmm3, xmm0);
990                 xmm6 = _mm_add_epi32(xmm6, xmm5);
991                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
992                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
993
994                 DATA16_RESULT(xmm6);
995
996                 data_len--;
997
                    /* If an odd number of samples remains, produce one more here so the main
                       loop can always work in pairs. */
998                 if(data_len % 2) {
                            /* Advance the history by one 16-bit lane: the top lane of xmm3 moves into
                               the bottom of xmm4 and the low 16 bits of curr enter lane 0 of xmm3. */
999                         xmm6 = _mm_srli_si128(xmm3, 14);
1000                         xmm4 = _mm_slli_si128(xmm4, 2);
1001                         xmm3 = _mm_slli_si128(xmm3, 2);
1002                         xmm4 = _mm_or_si128(xmm4, xmm6);
1003                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1004
1005                         xmm5 = _mm_madd_epi16(xmm4, xmm1);
1006                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1007                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1008                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1009                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1010
1011                         DATA16_RESULT(xmm6);
1012
1013                         data_len--;
1014                 }
1015
1016                 while(data_len) { /* data_len is a multiple of 2 */
1017                         /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
                             /* Advance the history by two lanes at once; curr goes into lane 1 and
                                lane 0 stays zero, reserved for the second sample of the pair. */
1018                         xmm6 = _mm_srli_si128(xmm3, 12);
1019                         xmm4 = _mm_slli_si128(xmm4, 4);
1020                         xmm3 = _mm_slli_si128(xmm3, 4);
1021                         xmm4 = _mm_or_si128(xmm4, xmm6);
1022                         xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1023
                             /* First sample of the pair: the history sits one lane high, so use the
                                one-lane-shifted coefficients in xmm7:xmm2. */
1024                         xmm5 = _mm_madd_epi16(xmm4, xmm7);
1025                         xmm6 = _mm_madd_epi16(xmm3, xmm2);
1026                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1027                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1028                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1029
1030                         DATA16_RESULT(xmm6);
1031
                             /* Second sample of the pair: fill the reserved lane 0 with the new curr
                                and use the unshifted coefficients in xmm1:xmm0. */
1032                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1033
1034                         xmm5 = _mm_madd_epi16(xmm4, xmm1);
1035                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1036                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1037                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1038                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1039
1040                         DATA16_RESULT(xmm6);
1041
1042                         data_len-=2;
1043                 }
1044         } /* endif(order > 8) */
1045         else
1046         {
                     /* order == 8: all coefficients and the whole history fit in one register each. */
                     /* curr is presumably set by DATA16_RESULT (defined elsewhere) -- TODO confirm. */
1047                 FLAC__int32 curr;
1048                 __m128i xmm0, xmm1, xmm3, xmm6;
                     /* xmm0 = coeff[0..7] narrowed to 16-bit lanes. */
1049                 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1050                 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1051                 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1052
                     /* xmm3 = data[-1..-8] narrowed to 16-bit lanes, newest sample first. */
1053                 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1054                 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1055                 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1056                 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1057                 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1058
1059                 /* xmm0: qlp_coeff
1060                         xmm3: data */
1061
                     /* First sample: dot product folded into lane 0. */
1062                 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1063                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1064                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1065
1066                 DATA16_RESULT(xmm6);
1067
1068                 data_len--;
1069
                     /* Remaining samples, one per iteration. */
1070                 while(data_len) {
                             /* Drop the oldest sample and insert the low 16 bits of curr as the newest. */
1071                         xmm3 = _mm_slli_si128(xmm3, 2);
1072                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1073
1074                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1075                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1076                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1077
1078                         DATA16_RESULT(xmm6);
1079
1080                         data_len--;
1081                 }
1082         }
1083 }
1084
1085 #endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
1086
1087 #endif /* FLAC__SSE2_SUPPORTED */
1088 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1089 #endif /* FLAC__NO_ASM */
1090 #endif /* FLAC__INTEGER_ONLY_LIBRARY */