/* src/libFLAC/lpc_intrin_sse2.c */
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

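/*
 * Each SIMD kernel below computes, per sample, the same scalar recurrence
 * as the plain-C fallback loops at the end of each function:
 *
 *     sum = 0;
 *     for(j = 0; j < order; j++)
 *         sum += qlp_coeff[j] * data[i-1-j];
 *     residual[i] = data[i] - (sum >> lp_quantization);
 *
 * RESIDUAL32_RESULT applies the final shift-and-subtract to a sum held in
 * the low dword of an XMM register; DATA32_RESULT is its inverse, used for
 * reconstruction.
 */
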
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization); /* _mm_sra_epi32 takes its shift count from an XMM register */

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

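	/*
	 * This "_16" variant relies on samples and quantized coefficients that
	 * fit in 16 bits, so each product can be formed with _mm_madd_epi16.
	 * The nested ifs below dispatch to a fully unrolled kernel for each
	 * order from 1 to 12; orders above 12 use the plain-C loop at the end.
	 */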
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

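					/*
					 * Each qN holds coefficient N broadcast into the low 16 bits of
					 * all four dwords (the high halves are zero after the 0xffff
					 * mask), so _mm_madd_epi16 against four unaligned samples puts
					 * qlp_coeff[N]*data[i+k-1-N] in lane k. The loop therefore
					 * produces four consecutive residuals per iteration; the same
					 * pattern repeats for every order below.
					 */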
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		for(; i < (int)data_len; i++) {
			sum = 0;
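			/* no breaks: each case intentionally falls through, accumulating every lower-order tap */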
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

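/*
 * Hypothetical usage sketch (caller names are illustrative, not from this
 * file): the encoder passes a pointer just past the `order` warm-up samples,
 * so that data[i-order..i-1] is always readable:
 *
 *     FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(
 *         signal + order, blocksize - order, qlp_coeff, order,
 *         lp_quantization, residual);
 */
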
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

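	/*
	 * General path for full 32-bit samples/coefficients: one residual per
	 * iteration. Coefficients are loaded in pairs and spread into the even
	 * dwords of an XMM register so _mm_mul_epu32 can form two products per
	 * step; only the low dword of each 64-bit product is carried forward.
	 */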
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard the high dword of each product; in two's complement the low 32 bits equal those of the signed product */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

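						/* horizontal reduction: shift the upper partial sum down and add,
						   leaving the complete sum in the low dword for RESIDUAL32_RESULT */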
						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]); /* odd order: the last coefficient has no partner and stays alone in the low dword */

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
695                                         }
696                                 }
697                                 else { /* order == 7 */
698                                         __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
699                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
700                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
701                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
702                                         xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
703
704                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
705                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
706                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
707
708                                         for(i = 0; i < (int)data_len; i++) {
709                                                 //sum = 0;
710                                                 //sum  = qlp_coeff[6] * data[i-7];
711                                                 xmm7 = _mm_cvtsi32_si128(data[i-7]);
712                                                 xmm7 = _mm_mul_epu32(xmm7, xmm3);
713
714                                                 //sum += qlp_coeff[5] * data[i-6];
715                                                 //sum += qlp_coeff[4] * data[i-5];
716                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
717                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
718                                                 xmm6 = _mm_mul_epu32(xmm6, xmm2);
719                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
720
721                                                 //sum += qlp_coeff[3] * data[i-4];
722                                                 //sum += qlp_coeff[2] * data[i-3];
723                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
724                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
725                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
726                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
727
728                                                 //sum += qlp_coeff[1] * data[i-2];
729                                                 //sum += qlp_coeff[0] * data[i-1];
730                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
731                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
732                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
733                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
734
735                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
736                                                 RESIDUAL32_RESULT(xmm7);
737                                         }
738                                 }
739                         }
740                         else { /* order == 5, 6 */
741                                 if(order == 6) {
742                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
743                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
744                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
745                                         xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
746
747                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
748                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
749                                         xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
750
751                                         for(i = 0; i < (int)data_len; i++) {
752                                                 //sum = 0;
753                                                 //sum += qlp_coeff[5] * data[i-6];
754                                                 //sum += qlp_coeff[4] * data[i-5];
755                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
756                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
757                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
758
759                                                 //sum += qlp_coeff[3] * data[i-4];
760                                                 //sum += qlp_coeff[2] * data[i-3];
761                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
762                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
763                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
764                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
765
766                                                 //sum += qlp_coeff[1] * data[i-2];
767                                                 //sum += qlp_coeff[0] * data[i-1];
768                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
769                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
770                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
771                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
772
773                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
774                                                 RESIDUAL32_RESULT(xmm7);
775                                         }
776                                 }
777                                 else { /* order == 5 */
778                                         __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
779                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
780                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
781                                         xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
782
783                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
784                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
785
786                                         for(i = 0; i < (int)data_len; i++) {
787                                                 //sum = 0;
788                                                 //sum  = qlp_coeff[4] * data[i-5];
789                                                 xmm7 = _mm_cvtsi32_si128(data[i-5]);
790                                                 xmm7 = _mm_mul_epu32(xmm7, xmm2);
791
792                                                 //sum += qlp_coeff[3] * data[i-4];
793                                                 //sum += qlp_coeff[2] * data[i-3];
794                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
795                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
796                                                 xmm6 = _mm_mul_epu32(xmm6, xmm1);
797                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
798
799                                                 //sum += qlp_coeff[1] * data[i-2];
800                                                 //sum += qlp_coeff[0] * data[i-1];
801                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
802                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
803                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
804                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
805
806                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
807                                                 RESIDUAL32_RESULT(xmm7);
808                                         }
809                                 }
810                         }
811                 }
812                 else { /* order == 1, 2, 3, 4 */
813                         if(order > 2) { /* order == 3, 4 */
814                                 if(order == 4) {
815                                         __m128i xmm0, xmm1, xmm6, xmm7;
816                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
817                                         xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
818
819                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
820                                         xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
821
822                                         for(i = 0; i < (int)data_len; i++) {
823                                                 //sum = 0;
824                                                 //sum += qlp_coeff[3] * data[i-4];
825                                                 //sum += qlp_coeff[2] * data[i-3];
826                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
827                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
828                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
829
830                                                 //sum += qlp_coeff[1] * data[i-2];
831                                                 //sum += qlp_coeff[0] * data[i-1];
832                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
833                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
834                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
835                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
836
837                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
838                                                 RESIDUAL32_RESULT(xmm7);
839                                         }
840                                 }
841                                 else { /* order == 3 */
842                                         __m128i xmm0, xmm1, xmm6, xmm7;
843                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
844                                         xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
845
846                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
847
848                                         for(i = 0; i < (int)data_len; i++) {
849                                                 //sum = 0;
850                                                 //sum  = qlp_coeff[2] * data[i-3];
851                                                 xmm7 = _mm_cvtsi32_si128(data[i-3]);
852                                                 xmm7 = _mm_mul_epu32(xmm7, xmm1);
853
854                                                 //sum += qlp_coeff[1] * data[i-2];
855                                                 //sum += qlp_coeff[0] * data[i-1];
856                                                 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
857                                                 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
858                                                 xmm6 = _mm_mul_epu32(xmm6, xmm0);
859                                                 xmm7 = _mm_add_epi32(xmm7, xmm6);
860
861                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
862                                                 RESIDUAL32_RESULT(xmm7);
863                                         }
864                                 }
865                         }
866                         else { /* order == 1, 2 */
867                                 if(order == 2) {
868                                         __m128i xmm0, xmm7;
869                                         xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
870                                         xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
871
872                                         for(i = 0; i < (int)data_len; i++) {
873                                                 //sum = 0;
874                                                 //sum += qlp_coeff[1] * data[i-2];
875                                                 //sum += qlp_coeff[0] * data[i-1];
876                                                 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
877                                                 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
878                                                 xmm7 = _mm_mul_epu32(xmm7, xmm0);
879
880                                                 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
881                                                 RESIDUAL32_RESULT(xmm7);
882                                         }
883                                 }
884                                 else { /* order == 1 */
885                                         for(i = 0; i < (int)data_len; i++)
886                                                 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
887                                 }
888                         }
889                 }
890         }
891         else { /* order > 12 */
892                 FLAC__int32 sum;
893                 for(i = 0; i < (int)data_len; i++) {
894                         sum = 0;
895                         switch(order) { /* deliberate fall-through: each case adds one term, then drops into the case below */
896                                 case 32: sum += qlp_coeff[31] * data[i-32];
897                                 case 31: sum += qlp_coeff[30] * data[i-31];
898                                 case 30: sum += qlp_coeff[29] * data[i-30];
899                                 case 29: sum += qlp_coeff[28] * data[i-29];
900                                 case 28: sum += qlp_coeff[27] * data[i-28];
901                                 case 27: sum += qlp_coeff[26] * data[i-27];
902                                 case 26: sum += qlp_coeff[25] * data[i-26];
903                                 case 25: sum += qlp_coeff[24] * data[i-25];
904                                 case 24: sum += qlp_coeff[23] * data[i-24];
905                                 case 23: sum += qlp_coeff[22] * data[i-23];
906                                 case 22: sum += qlp_coeff[21] * data[i-22];
907                                 case 21: sum += qlp_coeff[20] * data[i-21];
908                                 case 20: sum += qlp_coeff[19] * data[i-20];
909                                 case 19: sum += qlp_coeff[18] * data[i-19];
910                                 case 18: sum += qlp_coeff[17] * data[i-18];
911                                 case 17: sum += qlp_coeff[16] * data[i-17];
912                                 case 16: sum += qlp_coeff[15] * data[i-16];
913                                 case 15: sum += qlp_coeff[14] * data[i-15];
914                                 case 14: sum += qlp_coeff[13] * data[i-14];
915                                 case 13: sum += qlp_coeff[12] * data[i-13];
916                                          sum += qlp_coeff[11] * data[i-12];
917                                          sum += qlp_coeff[10] * data[i-11];
918                                          sum += qlp_coeff[ 9] * data[i-10];
919                                          sum += qlp_coeff[ 8] * data[i- 9];
920                                          sum += qlp_coeff[ 7] * data[i- 8];
921                                          sum += qlp_coeff[ 6] * data[i- 7];
922                                          sum += qlp_coeff[ 5] * data[i- 6];
923                                          sum += qlp_coeff[ 4] * data[i- 5];
924                                          sum += qlp_coeff[ 3] * data[i- 4];
925                                          sum += qlp_coeff[ 2] * data[i- 3];
926                                          sum += qlp_coeff[ 1] * data[i- 2];
927                                          sum += qlp_coeff[ 0] * data[i- 1];
928                         }
929                         residual[i] = data[i] - (sum >> lp_quantization);
930                 }
931         }
932 }
933
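/* Worked example (illustrative only): with order == 2, qlp_coeff[] = {3, -1},
 * lp_quantization == 1 and history data[i-2..i-1] = {100, 102}, the prediction
 * is (3*102 + (-1)*100) >> 1 == 206 >> 1 == 103, so the residual routine above
 * stores residual[i] = data[i] - 103, and the restore routine below computes
 * the inverse, data[i] = residual[i] + 103. */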
934 #if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused on x86-64; no faster than the MMX asm version */
935
936 #define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
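/* DATA16_RESULT reconstructs one sample: lane 0 of xmmN holds the full
 * prediction sum, which is scaled down by lp_quantization and added to the
 * next residual; the result is stored to the output and kept in `curr` so
 * the loops below can shift it back into the 16-bit history registers. */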
937
938 FLAC__SSE_TARGET("sse2")
939 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
940 {
941         if (order < 8 || order > 12) {
942                 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
943                 return;
944         }
945         if (data_len == 0)
946                 return;
947
948         FLAC__ASSERT(order >= 8);
949         FLAC__ASSERT(order <= 12);
950
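        /* Both branches below vectorize the scalar IIR recurrence (a sketch,
           matching what FLAC__lpc_restore_signal computes):

               for(i = 0; i < (int)data_len; i++) {
                   FLAC__int32 sum = 0;
                   for(j = 0; j < (int)order; j++)
                       sum += qlp_coeff[j] * data[i-j-1];
                   data[i] = residual[i] + (sum >> lp_quantization);
               }

           keeping the last `order` samples packed as 16-bit lanes, newest
           first, in xmm registers. */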
951         if(order > 8) { /* order == 9, 10, 11, 12 */
952                 FLAC__int32 curr;
953                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
954                 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
955                 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
956                 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
957                 switch(order)                                          /* ...and zero them out */
958                 {
959                 case 9:
960                         xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
961                 case 10:
962                         xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
963                 case 11:
964                         xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
965                 }
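                /* The byte-shift pairs above exploit that shifting a register left by
                   N bytes and then right by the same N bytes clears its top N bytes,
                   so the 1 to 3 lanes read past the end of qlp_coeff are zeroed and
                   only the valid high-order coefficients survive in xmm1. */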
966                 xmm2 = _mm_setzero_si128();
967                 xmm0 = _mm_packs_epi32(xmm0, xmm6);
968                 xmm1 = _mm_packs_epi32(xmm1, xmm2);
969
970                 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
971                 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
972                 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
973                 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
974                 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
975                 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
976                 xmm4 = _mm_packs_epi32(xmm4, xmm2);
977                 xmm3 = _mm_packs_epi32(xmm3, xmm5);
978
979                 xmm7 = _mm_slli_si128(xmm1, 2);
980                 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
981                 xmm2 = _mm_slli_si128(xmm0, 2);
982
983                 /* xmm0, xmm1: qlp_coeff packed to 16-bit lanes
984                    xmm2, xmm7: the same coefficients shifted left by one 16-bit lane
985                    xmm3, xmm4: data history packed to 16-bit lanes, newest sample first */
986
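                /* _mm_madd_epi16 multiplies eight 16-bit lane pairs and sums adjacent
                   products into four 32-bit lanes, i.e. one instruction yields four
                   partial sums of the form q[j]*d[-j-1] + q[j+1]*d[-j-2]; the two
                   shift-and-add steps below then fold those four lanes into lane 0.
                   (Tiny example: lanes (2,1) madd'ed with (3,4) give 2*3 + 1*4 == 10.) */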
987                 xmm5 = _mm_madd_epi16(xmm4, xmm1);
988                 xmm6 = _mm_madd_epi16(xmm3, xmm0);
989                 xmm6 = _mm_add_epi32(xmm6, xmm5);
990                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
991                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
992
993                 DATA16_RESULT(xmm6);
994
995                 data_len--;
996
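                /* From here on each reconstructed sample is shifted into the packed
                   history: the top lane(s) of xmm3 carry over into xmm4, the oldest
                   samples fall off the end, and `curr` is inserted at the newest
                   position via _mm_insert_epi16.  The odd-length prologue below
                   realigns data_len to a multiple of 2 for the main loop, which
                   produces two samples per iteration. */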
997                 if(data_len % 2) {
998                         xmm6 = _mm_srli_si128(xmm3, 14);
999                         xmm4 = _mm_slli_si128(xmm4, 2);
1000                         xmm3 = _mm_slli_si128(xmm3, 2);
1001                         xmm4 = _mm_or_si128(xmm4, xmm6);
1002                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1003
1004                         xmm5 = _mm_madd_epi16(xmm4, xmm1);
1005                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1006                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1007                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1008                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1009
1010                         DATA16_RESULT(xmm6);
1011
1012                         data_len--;
1013                 }
1014
1015                 while(data_len) { /* data_len is a multiple of 2 */
1016                         /* one _mm_slli_si128 less per data element, at the cost of keeping the qlp_coeff copies pre-shifted by one 16-bit lane in xmm2:xmm7 */
1017                         xmm6 = _mm_srli_si128(xmm3, 12);
1018                         xmm4 = _mm_slli_si128(xmm4, 4);
1019                         xmm3 = _mm_slli_si128(xmm3, 4);
1020                         xmm4 = _mm_or_si128(xmm4, xmm6);
1021                         xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1022
1023                         xmm5 = _mm_madd_epi16(xmm4, xmm7);
1024                         xmm6 = _mm_madd_epi16(xmm3, xmm2);
1025                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1026                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1027                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1028
1029                         DATA16_RESULT(xmm6);
1030
1031                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1032
1033                         xmm5 = _mm_madd_epi16(xmm4, xmm1);
1034                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1035                         xmm6 = _mm_add_epi32(xmm6, xmm5);
1036                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1037                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1038
1039                         DATA16_RESULT(xmm6);
1040
1041                         data_len-=2;
1042                 }
1043         } /* endif(order > 8) */
1044         else {
1045                 FLAC__int32 curr;
1046                 __m128i xmm0, xmm1, xmm3, xmm6;
1047                 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1048                 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1049                 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1050
1051                 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1052                 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1053                 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1054                 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1055                 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1056
1057                 /* xmm0: qlp_coeff[0..7] packed to 16-bit lanes
1058                    xmm3: data[i-1..i-8] packed to 16-bit lanes, newest first */
1059
1060                 xmm6 = _mm_madd_epi16(xmm3, xmm0);
1061                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1062                 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1063
1064                 DATA16_RESULT(xmm6);
1065
1066                 data_len--;
1067
1068                 while(data_len) {
1069                         xmm3 = _mm_slli_si128(xmm3, 2);
1070                         xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1071
1072                         xmm6 = _mm_madd_epi16(xmm3, xmm0);
1073                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1074                         xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1075
1076                         DATA16_RESULT(xmm6);
1077
1078                         data_len--;
1079                 }
1080         }
1081 }
1082
1083 #endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
1084
1085 #endif /* FLAC__SSE2_SUPPORTED */
1086 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1087 #endif /* FLAC__NO_ASM */
1088 #endif /* FLAC__INTEGER_ONLY_LIBRARY */