lpc_intrin*: Remove unused code.
[flac.git] src/libFLAC/lpc_intrin_sse2.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

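/*
 * For reference, a minimal sketch of the scalar loop that both SIMD paths
 * below vectorize (illustrative only; `j` is a local index, not a variable
 * used in this file):
 *
 *     for(i = 0; i < (int)data_len; i++) {
 *         sum = 0;
 *         for(j = 0; j < (int)order; j++)
 *             sum += qlp_coeff[j] * data[i-j-1];
 *         residual[i] = data[i] - (sum >> lp_quantization);
 *     }
 *
 * The *_RESULT macros above wrap the final step of each SIMD iteration:
 * extract the accumulated prediction from the low dword of an XMM register,
 * apply the quantization shift, and either form the residual (RESIDUAL*) or
 * reconstruct the signal from it (DATA*).
 */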
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;
        FLAC__int32 sum;
        __m128i cnt = _mm_cvtsi32_si128(lp_quantization); /* shift count for _mm_sra_epi32 */

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);

        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                if(order == 12) {
                                        __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
                                        q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
                                        q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
                                        q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
                                        q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
                                        q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

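                                        /*
                                         * 16-bit path: each qN register holds one coefficient, masked
                                         * to its low 16 bits and splatted across all four 32-bit lanes,
                                         * so every lane is the 16-bit pair (0, coeff). Each load below
                                         * brings in four consecutive 32-bit samples; since this variant
                                         * assumes samples and quantized coefficients fit in 16 bits,
                                         * _mm_madd_epi16 produces coeff * data[...] in each lane (the
                                         * upper halves contribute 0 * upper == 0). Accumulating over
                                         * all taps yields four predictions per iteration.
                                         */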
                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
                                                mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 11 */
                                        __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
                                        q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
                                        q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
                                        q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
                                        q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
                                                mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                        else { /* order == 9, 10 */
                                if(order == 10) {
                                        __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
                                        q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
                                        q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
                                        q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
                                                mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 9 */
                                        __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
                                        q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
                                        q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
                                                mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                }
                else if(order > 4) { /* order == 5, 6, 7, 8 */
                        if(order > 6) { /* order == 7, 8 */
                                if(order == 8) {
                                        __m128i q0, q1, q2, q3, q4, q5, q6, q7;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
                                        q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
                                                mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 7 */
                                        __m128i q0, q1, q2, q3, q4, q5, q6;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
                                        q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
                                                mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                        else { /* order == 5, 6 */
                                if(order == 6) {
                                        __m128i q0, q1, q2, q3, q4, q5;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
                                        q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
                                                mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 5 */
                                        __m128i q0, q1, q2, q3, q4;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
                                        q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
                                                mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                }
                else { /* order == 1, 2, 3, 4 */
                        if(order > 2) { /* order == 3, 4 */
                                if(order == 4) {
                                        __m128i q0, q1, q2, q3;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
                                        q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
                                                mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 3 */
                                        __m128i q0, q1, q2;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
                                        q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
                                                mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                        else { /* order == 1, 2 */
                                if(order == 2) {
                                        __m128i q0, q1;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
                                        q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ, mull;
                                                summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
                                                mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                                else { /* order == 1 */
                                        __m128i q0;
                                        q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

                                        for(i = 0; i < (int)data_len-3; i+=4) {
                                                __m128i summ;
                                                summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
                                                summ = _mm_sra_epi32(summ, cnt);
                                                _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
                                        }
                                }
                        }
                }
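                /*
                 * Scalar tail: the vector loops above stop when fewer than four
                 * residuals remain, so the last 0-3 samples are finished with
                 * plain integer math. The switch relies on intentional case
                 * fall-through: entering at `order` accumulates that tap and
                 * every lower-numbered one. The order > 12 branch below uses
                 * the same pattern for entire blocks.
                 */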
                for(; i < (int)data_len; i++) {
                        sum = 0;
                        switch(order) {
                                case 12: sum += qlp_coeff[11] * data[i-12];
                                case 11: sum += qlp_coeff[10] * data[i-11];
                                case 10: sum += qlp_coeff[ 9] * data[i-10];
                                case 9:  sum += qlp_coeff[ 8] * data[i- 9];
                                case 8:  sum += qlp_coeff[ 7] * data[i- 8];
                                case 7:  sum += qlp_coeff[ 6] * data[i- 7];
                                case 6:  sum += qlp_coeff[ 5] * data[i- 6];
                                case 5:  sum += qlp_coeff[ 4] * data[i- 5];
                                case 4:  sum += qlp_coeff[ 3] * data[i- 4];
                                case 3:  sum += qlp_coeff[ 2] * data[i- 3];
                                case 2:  sum += qlp_coeff[ 1] * data[i- 2];
                                case 1:  sum += qlp_coeff[ 0] * data[i- 1];
                        }
                        residual[i] = data[i] - (sum >> lp_quantization);
                }
        }
        else { /* order > 12 */
                for(i = 0; i < (int)data_len; i++) {
                        sum = 0;
                        switch(order) {
                                case 32: sum += qlp_coeff[31] * data[i-32];
                                case 31: sum += qlp_coeff[30] * data[i-31];
                                case 30: sum += qlp_coeff[29] * data[i-30];
                                case 29: sum += qlp_coeff[28] * data[i-29];
                                case 28: sum += qlp_coeff[27] * data[i-28];
                                case 27: sum += qlp_coeff[26] * data[i-27];
                                case 26: sum += qlp_coeff[25] * data[i-26];
                                case 25: sum += qlp_coeff[24] * data[i-25];
                                case 24: sum += qlp_coeff[23] * data[i-24];
                                case 23: sum += qlp_coeff[22] * data[i-23];
                                case 22: sum += qlp_coeff[21] * data[i-22];
                                case 21: sum += qlp_coeff[20] * data[i-21];
                                case 20: sum += qlp_coeff[19] * data[i-20];
                                case 19: sum += qlp_coeff[18] * data[i-19];
                                case 18: sum += qlp_coeff[17] * data[i-18];
                                case 17: sum += qlp_coeff[16] * data[i-17];
                                case 16: sum += qlp_coeff[15] * data[i-16];
                                case 15: sum += qlp_coeff[14] * data[i-15];
                                case 14: sum += qlp_coeff[13] * data[i-14];
                                case 13: sum += qlp_coeff[12] * data[i-13];
                                         sum += qlp_coeff[11] * data[i-12];
                                         sum += qlp_coeff[10] * data[i-11];
                                         sum += qlp_coeff[ 9] * data[i-10];
                                         sum += qlp_coeff[ 8] * data[i- 9];
                                         sum += qlp_coeff[ 7] * data[i- 8];
                                         sum += qlp_coeff[ 6] * data[i- 7];
                                         sum += qlp_coeff[ 5] * data[i- 6];
                                         sum += qlp_coeff[ 4] * data[i- 5];
                                         sum += qlp_coeff[ 3] * data[i- 4];
                                         sum += qlp_coeff[ 2] * data[i- 3];
                                         sum += qlp_coeff[ 1] * data[i- 2];
                                         sum += qlp_coeff[ 0] * data[i- 1];
                        }
                        residual[i] = data[i] - (sum >> lp_quantization);
                }
        }
}

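/*
 * General 32-bit variant. It cannot use _mm_madd_epi16 like the _16 path
 * above, so it multiplies coefficient/sample pairs with _mm_mul_epu32
 * instead; see the note ahead of the first loop. A hedged sketch of how a
 * caller might choose between the two variants (the predicate names are
 * illustrative, not libFLAC's actual dispatch logic):
 *
 *     if(samples_fit_in_16_bits && qlp_coeffs_fit_in_16_bits)
 *         FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(data, data_len, qlp_coeff, order, lp_quantization, residual);
 *     else
 *         FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(data, data_len, qlp_coeff, order, lp_quantization, residual);
 */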
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
        int i;

        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);

        if(order <= 12) {
                if(order > 8) { /* order == 9, 10, 11, 12 */
                        if(order > 10) { /* order == 11, 12 */
                                if(order == 12) {
                                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                                        xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
                                        xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
                                        xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
                                        xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
                                        xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
                                        xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

                                        xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
                                        xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
                                        xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
                                        xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
                                        xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
                                        xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

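                                        /*
                                         * SSE2 has no 32x32->32 _mm_mullo_epi32 (that arrived with
                                         * SSE4.1), so each pair of taps goes through _mm_mul_epu32,
                                         * which multiplies dwords 0 and 2 into two 64-bit products.
                                         * The shuffles place one coefficient/sample pair in dword 0
                                         * and the other in dword 2; only the low dword of each
                                         * product is kept, and the low 32 bits of a product are the
                                         * same for signed and unsigned operands, so the unsigned
                                         * multiply is safe.
                                         */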
                                        for(i = 0; i < (int)data_len; i++) {
                                                //sum = 0;
                                                //sum += qlp_coeff[11] * data[i-12];
                                                //sum += qlp_coeff[10] * data[i-11];
                                                xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
                                                xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
                                                xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

                                                //sum += qlp_coeff[9] * data[i-10];
                                                //sum += qlp_coeff[8] * data[i-9];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm4);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[7] * data[i-8];
                                                //sum += qlp_coeff[6] * data[i-7];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm3);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[5] * data[i-6];
                                                //sum += qlp_coeff[4] * data[i-5];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm2);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[3] * data[i-4];
                                                //sum += qlp_coeff[2] * data[i-3];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm1);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

                                                //sum += qlp_coeff[1] * data[i-2];
                                                //sum += qlp_coeff[0] * data[i-1];
                                                xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
                                                xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
                                                xmm6 = _mm_mul_epu32(xmm6, xmm0);
                                                xmm7 = _mm_add_epi32(xmm7, xmm6);

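                                                /* The two partial sums sit in dwords 0 and 2 of xmm7; the
                                                   8-byte byte-shift brings dword 2 down to dword 0, so the
                                                   add leaves the complete prediction in the low dword for
                                                   RESIDUAL32_RESULT. */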
478                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
479                                                 RESIDUAL32_RESULT(xmm7);
480                                         }
481                                 }
482                                 else { /* order == 11 */
483                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
484                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
485                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
486                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
487                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
488                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
489                                         xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
490
491                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
492                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
493                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
494                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
495                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
496
497                                         for(i = 0; i < (int)data_len; i++) {
498                                                 //sum = 0;
499                                                 //sum  = qlp_coeff[10] * data[i-11];
500                                                 xmm7 = _mm_cvtsi32_si128(data[i-11]);
501                                                 xmm7 = _mm_mul_epu32(xmm7, xmm5);
502
503                                                 //sum += qlp_coeff[9] * data[i-10];
504                                                 //sum += qlp_coeff[8] * data[i-9];
505                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
506                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
507                                                 xmm6 = _mm_mul_epu32(xmm6, xmm4);
508                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
509
510                                                 //sum += qlp_coeff[7] * data[i-8];
511                                                 //sum += qlp_coeff[6] * data[i-7];
512                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
513                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
514                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
515                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
516
517                                                 //sum += qlp_coeff[5] * data[i-6];
518                                                 //sum += qlp_coeff[4] * data[i-5];
519                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
520                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
521                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
522                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
523
524                                                 //sum += qlp_coeff[3] * data[i-4];
525                                                 //sum += qlp_coeff[2] * data[i-3];
526                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
527                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
528                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
529                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
530
531                                                 //sum += qlp_coeff[1] * data[i-2];
532                                                 //sum += qlp_coeff[0] * data[i-1];
533                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
534                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
535                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
536                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
537
538                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
539                                                 RESIDUAL32_RESULT(xmm7);
540                                         }
541                                 }
542                         }
543                         else { /* order == 9, 10 */
544                                 if(order == 10) {
545                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
546                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
547                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
548                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
549                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
550                                         xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
551
552                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
553                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
554                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
555                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
556                                         xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
557
558                                         for(i = 0; i < (int)data_len; i++) {
559                                                 //sum = 0;
560                                                 //sum += qlp_coeff[9] * data[i-10];
561                                                 //sum += qlp_coeff[8] * data[i-9];
562                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
563                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
564                                                 xmm7 = _mm_mul_epu32(xmm7, xmm4);
565
566                                                 //sum += qlp_coeff[7] * data[i-8];
567                                                 //sum += qlp_coeff[6] * data[i-7];
568                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
569                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
570                                                 xmm6 = _mm_mul_epu32(xmm6, xmm3);
571                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
572
573                                                 //sum += qlp_coeff[5] * data[i-6];
574                                                 //sum += qlp_coeff[4] * data[i-5];
575                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
576                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
577                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
578                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
579
580                                                 //sum += qlp_coeff[3] * data[i-4];
581                                                 //sum += qlp_coeff[2] * data[i-3];
582                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
583                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
584                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
585                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
586
587                                                 //sum += qlp_coeff[1] * data[i-2];
588                                                 //sum += qlp_coeff[0] * data[i-1];
589                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
590                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
591                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
592                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
593
594                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
595                                                 RESIDUAL32_RESULT(xmm7);
596                                         }
597                                 }
598                                 else { /* order == 9 */
599                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
600                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
601                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
602                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
603                                         xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
604                                         xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
605
606                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
607                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
608                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
609                                         xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
610
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
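			/* each case deliberately falls through, accumulating every remaining tap */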
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

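/* For reference, every SSE2 branch above evaluates the same prediction as
 * this plain-C sketch (illustrative only and kept out of the build; the
 * helper name is ours, not part of the libFLAC API). As in the kernels,
 * `data` may be indexed before element 0 into the warm-up history:
 */
#if 0
static void residual_from_qlp_sketch(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	unsigned i, j;
	for(i = 0; i < data_len; i++) {
		FLAC__int32 sum = 0;
		for(j = 0; j < order; j++) /* accumulate all `order` taps over the preceding samples */
			sum += qlp_coeff[j] * data[(int)i - (int)(j + 1)];
		residual[i] = data[i] - (sum >> lp_quantization);
	}
}
#endif
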
#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused on x86-64; on IA-32 only built without NASM, since it is no faster than the MMX assembly */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);

	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order)                                          /* ...and zero them out */
		{
		case 9:
			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
		case 10:
			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
		case 11:
			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
			xmm2, xmm7: qlp_coeff << 16 bit
			xmm3, xmm4: data */
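		/* _mm_madd_epi16 multiplies eight 16-bit coeff/data lane pairs and adds
		   adjacent products, leaving four 32-bit partial sums; the two shift/add
		   steps below then fold those into the final sum in lane 0 */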

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

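		/* if an odd number of samples remains, restore one sample here so the
		   main loop below can process exactly two samples per iteration */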
		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}

		while(data_len) { /* data_len is a multiple of 2 */
			/* handling two samples per iteration saves one _mm_slli_si128 per sample, but needs the 16-bit-shifted copy of qlp_coeff kept in xmm2:xmm7 */
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
	else
	{
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

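		/* with order == 8 all coefficients fit in one register after packing
		   to 16 bits, so a single _mm_madd_epi16 per sample covers every tap */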
		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);

		/* xmm0: qlp_coeff
			xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

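/* For reference, the kernel above inverts the residual computation; this
 * plain-C sketch (illustrative only and kept out of the build; the helper
 * name is ours) shows the recurrence being vectorized:
 */
#if 0
static void restore_signal_sketch(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	unsigned i, j;
	for(i = 0; i < data_len; i++) {
		FLAC__int32 sum = 0;
		for(j = 0; j < order; j++) /* same taps as the residual computation... */
			sum += qlp_coeff[j] * data[(int)i - (int)(j + 1)];
		data[i] = residual[i] + (sum >> lp_quantization); /* ...but the residual is added back */
	}
}
#endif
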
#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */