src/libFLAC/lpc_intrin_avx2.c
1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2014  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40 #include "private/lpc.h"
41 #ifdef FLAC__AVX2_SUPPORTED
42
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45
46 #include <immintrin.h> /* AVX2 */
47
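/*
 * Residual computation for the case where the samples and the quantized
 * coefficients fit in 16 bits: each coefficient is masked to its low 16 bits
 * and broadcast into every 32-bit lane, so _mm256_madd_epi16 yields the full
 * 32-bit sample*coefficient product per lane (the zeroed high halves
 * contribute nothing).  The accumulated prediction is shifted right by
 * lp_quantization (held in cnt) and subtracted from the signal.
 */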
48 FLAC__SSE_TARGET("avx2")
49 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
50 {
51         int i;
52         FLAC__int32 sum;
53         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
54
55         FLAC__ASSERT(order > 0);
56         FLAC__ASSERT(order <= 32);
57
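        /* Dispatch on the order to a fully unrolled kernel; each vector loop produces 8 residuals per iteration. */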
58         if(order <= 12) {
59                 if(order > 8) {
60                         if(order > 10) {
61                                 if(order == 12) {
62                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
63                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
64                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
65                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
66                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
67                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
68                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
69                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
70                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
71                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
72                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
73                                         q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
74                                         q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
75
76                                         for(i = 0; i < (int)data_len-7; i+=8) {
77                                                 __m256i summ, mull;
78                                                 summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
79                                                 mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
80                                                 mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
81                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
82                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
83                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
84                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
85                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
86                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
87                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
88                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
89                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
90                                                 summ = _mm256_sra_epi32(summ, cnt);
91                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
92                                         }
93                                 }
94                                 else { /* order == 11 */
95                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
96                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
97                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
98                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
99                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
100                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
101                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
102                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
103                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
104                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
105                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
106                                         q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
107
108                                         for(i = 0; i < (int)data_len-7; i+=8) {
109                                                 __m256i summ, mull;
110                                                 summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
111                                                 mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
112                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
113                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
114                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
115                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
116                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
117                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
118                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
119                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
120                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
121                                                 summ = _mm256_sra_epi32(summ, cnt);
122                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
123                                         }
124                                 }
125                         }
126                         else {
127                                 if(order == 10) {
128                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
129                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
130                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
131                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
132                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
133                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
134                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
135                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
136                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
137                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
138                                         q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
139
140                                         for(i = 0; i < (int)data_len-7; i+=8) {
141                                                 __m256i summ, mull;
142                                                 summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
143                                                 mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
144                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
145                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
146                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
147                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
148                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
149                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
150                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
151                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
152                                                 summ = _mm256_sra_epi32(summ, cnt);
153                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
154                                         }
155                                 }
156                                 else { /* order == 9 */
157                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
158                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
159                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
160                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
161                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
162                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
163                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
164                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
165                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
166                                         q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
167
168                                         for(i = 0; i < (int)data_len-7; i+=8) {
169                                                 __m256i summ, mull;
170                                                 summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
171                                                 mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
172                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
173                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
174                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
175                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
176                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
177                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
178                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
179                                                 summ = _mm256_sra_epi32(summ, cnt);
180                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
181                                         }
182                                 }
183                         }
184                 }
185                 else if(order > 4) {
186                         if(order > 6) {
187                                 if(order == 8) {
188                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
189                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
190                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
191                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
192                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
193                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
194                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
195                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
196                                         q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
197
198                                         for(i = 0; i < (int)data_len-7; i+=8) {
199                                                 __m256i summ, mull;
200                                                 summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
201                                                 mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
202                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
203                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
204                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
205                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
206                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
207                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
208                                                 summ = _mm256_sra_epi32(summ, cnt);
209                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
210                                         }
211                                 }
212                                 else { /* order == 7 */
213                                         __m256i q0, q1, q2, q3, q4, q5, q6;
214                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
215                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
216                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
217                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
218                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
219                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
220                                         q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
221
222                                         for(i = 0; i < (int)data_len-7; i+=8) {
223                                                 __m256i summ, mull;
224                                                 summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
225                                                 mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
226                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
227                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
228                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
229                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
230                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
231                                                 summ = _mm256_sra_epi32(summ, cnt);
232                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
233                                         }
234                                 }
235                         }
236                         else {
237                                 if(order == 6) {
238                                         __m256i q0, q1, q2, q3, q4, q5;
239                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
240                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
241                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
242                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
243                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
244                                         q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
245
246                                         for(i = 0; i < (int)data_len-7; i+=8) {
247                                                 __m256i summ, mull;
248                                                 summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
249                                                 mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
250                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
251                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
252                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
253                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
254                                                 summ = _mm256_sra_epi32(summ, cnt);
255                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
256                                         }
257                                 }
258                                 else { /* order == 5 */
259                                         __m256i q0, q1, q2, q3, q4;
260                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
261                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
262                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
263                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
264                                         q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
265
266                                         for(i = 0; i < (int)data_len-7; i+=8) {
267                                                 __m256i summ, mull;
268                                                 summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
269                                                 mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
270                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
271                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
272                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
273                                                 summ = _mm256_sra_epi32(summ, cnt);
274                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
275                                         }
276                                 }
277                         }
278                 }
279                 else {
280                         if(order > 2) {
281                                 if(order == 4) {
282                                         __m256i q0, q1, q2, q3;
283                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
284                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
285                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
286                                         q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
287
288                                         for(i = 0; i < (int)data_len-7; i+=8) {
289                                                 __m256i summ, mull;
290                                                 summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
291                                                 mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
292                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
293                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
294                                                 summ = _mm256_sra_epi32(summ, cnt);
295                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
296                                         }
297                                 }
298                                 else { /* order == 3 */
299                                         __m256i q0, q1, q2;
300                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
301                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
302                                         q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
303
304                                         for(i = 0; i < (int)data_len-7; i+=8) {
305                                                 __m256i summ, mull;
306                                                 summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
307                                                 mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
308                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
309                                                 summ = _mm256_sra_epi32(summ, cnt);
310                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
311                                         }
312                                 }
313                         }
314                         else {
315                                 if(order == 2) {
316                                         __m256i q0, q1;
317                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
318                                         q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
319
320                                         for(i = 0; i < (int)data_len-7; i+=8) {
321                                                 __m256i summ, mull;
322                                                 summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
323                                                 mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
324                                                 summ = _mm256_sra_epi32(summ, cnt);
325                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
326                                         }
327                                 }
328                                 else { /* order == 1 */
329                                         __m256i q0;
330                                         q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
331
332                                         for(i = 0; i < (int)data_len-7; i+=8) {
333                                                 __m256i summ;
334                                                 summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
335                                                 summ = _mm256_sra_epi32(summ, cnt);
336                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
337                                         }
338                                 }
339                         }
340                 }
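                /* Scalar tail: samples not covered by the 8-wide loop (and the whole buffer when data_len < 8).
                   The switch cases fall through intentionally so that all `order` taps are accumulated. */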
341                 for(; i < (int)data_len; i++) {
342                         sum = 0;
343                         switch(order) {
344                                 case 12: sum += qlp_coeff[11] * data[i-12];
345                                 case 11: sum += qlp_coeff[10] * data[i-11];
346                                 case 10: sum += qlp_coeff[ 9] * data[i-10];
347                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9];
348                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8];
349                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7];
350                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6];
351                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5];
352                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4];
353                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3];
354                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2];
355                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
356                         }
357                         residual[i] = data[i] - (sum >> lp_quantization);
358                 }
359         }
360         else { /* order > 12 */
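                /* Orders above 12 are left to plain scalar code. */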
361                 for(i = 0; i < (int)data_len; i++) {
362                         sum = 0;
363                         switch(order) {
364                                 case 32: sum += qlp_coeff[31] * data[i-32];
365                                 case 31: sum += qlp_coeff[30] * data[i-31];
366                                 case 30: sum += qlp_coeff[29] * data[i-30];
367                                 case 29: sum += qlp_coeff[28] * data[i-29];
368                                 case 28: sum += qlp_coeff[27] * data[i-28];
369                                 case 27: sum += qlp_coeff[26] * data[i-27];
370                                 case 26: sum += qlp_coeff[25] * data[i-26];
371                                 case 25: sum += qlp_coeff[24] * data[i-25];
372                                 case 24: sum += qlp_coeff[23] * data[i-24];
373                                 case 23: sum += qlp_coeff[22] * data[i-23];
374                                 case 22: sum += qlp_coeff[21] * data[i-22];
375                                 case 21: sum += qlp_coeff[20] * data[i-21];
376                                 case 20: sum += qlp_coeff[19] * data[i-20];
377                                 case 19: sum += qlp_coeff[18] * data[i-19];
378                                 case 18: sum += qlp_coeff[17] * data[i-18];
379                                 case 17: sum += qlp_coeff[16] * data[i-17];
380                                 case 16: sum += qlp_coeff[15] * data[i-16];
381                                 case 15: sum += qlp_coeff[14] * data[i-15];
382                                 case 14: sum += qlp_coeff[13] * data[i-14];
383                                 case 13: sum += qlp_coeff[12] * data[i-13];
384                                          sum += qlp_coeff[11] * data[i-12];
385                                          sum += qlp_coeff[10] * data[i-11];
386                                          sum += qlp_coeff[ 9] * data[i-10];
387                                          sum += qlp_coeff[ 8] * data[i- 9];
388                                          sum += qlp_coeff[ 7] * data[i- 8];
389                                          sum += qlp_coeff[ 6] * data[i- 7];
390                                          sum += qlp_coeff[ 5] * data[i- 6];
391                                          sum += qlp_coeff[ 4] * data[i- 5];
392                                          sum += qlp_coeff[ 3] * data[i- 4];
393                                          sum += qlp_coeff[ 2] * data[i- 3];
394                                          sum += qlp_coeff[ 1] * data[i- 2];
395                                          sum += qlp_coeff[ 0] * data[i- 1];
396                         }
397                         residual[i] = data[i] - (sum >> lp_quantization);
398                 }
399         }
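        /* Clear the upper YMM state to avoid AVX/SSE transition penalties in the caller. */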
400         _mm256_zeroupper();
401 }
402
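/*
 * General variant: samples and coefficients may use the full 32-bit range, so
 * the products are formed with _mm256_mullo_epi32 (low 32 bits of the 32x32
 * multiply) instead of the 16-bit madd trick above; the structure is
 * otherwise identical to the function above.
 */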
403 FLAC__SSE_TARGET("avx2")
404 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
405 {
406         int i;
407         FLAC__int32 sum;
408         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
409
410         FLAC__ASSERT(order > 0);
411         FLAC__ASSERT(order <= 32);
412
413         if(order <= 12) {
414                 if(order > 8) {
415                         if(order > 10) {
416                                 if(order == 12) {
417                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
418                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
419                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
420                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
421                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
422                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
423                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
424                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
425                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
426                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
427                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
428                                         q10 = _mm256_set1_epi32(qlp_coeff[10]);
429                                         q11 = _mm256_set1_epi32(qlp_coeff[11]);
430
431                                         for(i = 0; i < (int)data_len-7; i+=8) {
432                                                 __m256i summ, mull;
433                                                 summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
434                                                 mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
435                                                 mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
436                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
437                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
438                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
439                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
440                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
441                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
442                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
443                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
444                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
445                                                 summ = _mm256_sra_epi32(summ, cnt);
446                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
447                                         }
448                                 }
449                                 else { /* order == 11 */
450                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
451                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
452                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
453                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
454                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
455                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
456                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
457                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
458                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
459                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
460                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
461                                         q10 = _mm256_set1_epi32(qlp_coeff[10]);
462
463                                         for(i = 0; i < (int)data_len-7; i+=8) {
464                                                 __m256i summ, mull;
465                                                 summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
466                                                 mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
467                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
468                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
469                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
470                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
471                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
472                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
473                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
474                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
475                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
476                                                 summ = _mm256_sra_epi32(summ, cnt);
477                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
478                                         }
479                                 }
480                         }
481                         else {
482                                 if(order == 10) {
483                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
484                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
485                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
486                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
487                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
488                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
489                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
490                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
491                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
492                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
493                                         q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
494
495                                         for(i = 0; i < (int)data_len-7; i+=8) {
496                                                 __m256i summ, mull;
497                                                 summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
498                                                 mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
499                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
500                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
501                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
502                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
503                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
504                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
505                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
506                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
507                                                 summ = _mm256_sra_epi32(summ, cnt);
508                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
509                                         }
510                                 }
511                                 else { /* order == 9 */
512                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
513                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
514                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
515                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
516                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
517                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
518                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
519                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
520                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
521                                         q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
522
523                                         for(i = 0; i < (int)data_len-7; i+=8) {
524                                                 __m256i summ, mull;
525                                                 summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
526                                                 mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
527                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
528                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
529                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
530                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
531                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
532                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
533                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
534                                                 summ = _mm256_sra_epi32(summ, cnt);
535                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
536                                         }
537                                 }
538                         }
539                 }
540                 else if(order > 4) {
541                         if(order > 6) {
542                                 if(order == 8) {
543                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
544                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
545                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
546                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
547                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
548                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
549                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
550                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
551                                         q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
552
553                                         for(i = 0; i < (int)data_len-7; i+=8) {
554                                                 __m256i summ, mull;
555                                                 summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
556                                                 mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
557                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
558                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
559                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
560                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
561                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
562                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
563                                                 summ = _mm256_sra_epi32(summ, cnt);
564                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
565                                         }
566                                 }
567                                 else { /* order == 7 */
568                                         __m256i q0, q1, q2, q3, q4, q5, q6;
569                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
570                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
571                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
572                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
573                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
574                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
575                                         q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
576
577                                         for(i = 0; i < (int)data_len-7; i+=8) {
578                                                 __m256i summ, mull;
579                                                 summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
580                                                 mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
581                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
582                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
583                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
584                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
585                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
586                                                 summ = _mm256_sra_epi32(summ, cnt);
587                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
588                                         }
589                                 }
590                         }
591                         else {
592                                 if(order == 6) {
593                                         __m256i q0, q1, q2, q3, q4, q5;
594                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
595                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
596                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
597                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
598                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
599                                         q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
600
601                                         for(i = 0; i < (int)data_len-7; i+=8) {
602                                                 __m256i summ, mull;
603                                                 summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
604                                                 mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
605                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
606                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
607                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
608                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
609                                                 summ = _mm256_sra_epi32(summ, cnt);
610                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
611                                         }
612                                 }
613                                 else { /* order == 5 */
614                                         __m256i q0, q1, q2, q3, q4;
615                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
616                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
617                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
618                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
619                                         q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
620
621                                         for(i = 0; i < (int)data_len-7; i+=8) {
622                                                 __m256i summ, mull;
623                                                 summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
624                                                 mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
625                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
626                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
627                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
628                                                 summ = _mm256_sra_epi32(summ, cnt);
629                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
630                                         }
631                                 }
632                         }
633                 }
634                 else {
635                         if(order > 2) {
636                                 if(order == 4) {
637                                         __m256i q0, q1, q2, q3;
638                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
639                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
640                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
641                                         q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
642
643                                         for(i = 0; i < (int)data_len-7; i+=8) {
644                                                 __m256i summ, mull;
645                                                 summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
646                                                 mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
647                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
648                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
649                                                 summ = _mm256_sra_epi32(summ, cnt);
650                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
651                                         }
652                                 }
653                                 else { /* order == 3 */
654                                         __m256i q0, q1, q2;
655                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
656                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
657                                         q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
658
659                                         for(i = 0; i < (int)data_len-7; i+=8) {
660                                                 __m256i summ, mull;
661                                                 summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
662                                                 mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
663                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
664                                                 summ = _mm256_sra_epi32(summ, cnt);
665                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
666                                         }
667                                 }
668                         }
669                         else {
670                                 if(order == 2) {
671                                         __m256i q0, q1;
672                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
673                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
674
675                                         for(i = 0; i < (int)data_len-7; i+=8) {
676                                                 __m256i summ, mull;
677                                                 summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
678                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
679                                                 summ = _mm256_sra_epi32(summ, cnt);
680                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
681                                         }
682                                 }
683                                 else { /* order == 1 */
684                                         __m256i q0;
685                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
686
687                                         for(i = 0; i < (int)data_len-7; i+=8) {
688                                                 __m256i summ;
689                                                 summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
690                                                 summ = _mm256_sra_epi32(summ, cnt);
691                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
692                                         }
693                                 }
694                         }
695                 }
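		/* Compute any remaining samples (and the whole block when data_len < 8) in plain C; the switch cases below intentionally fall through, each one adding a single predictor tap. */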
696                 for(; i < (int)data_len; i++) {
697                         sum = 0;
698                         switch(order) {
699                                 case 12: sum += qlp_coeff[11] * data[i-12];
700                                 case 11: sum += qlp_coeff[10] * data[i-11];
701                                 case 10: sum += qlp_coeff[ 9] * data[i-10];
702                                 case 9:  sum += qlp_coeff[ 8] * data[i- 9];
703                                 case 8:  sum += qlp_coeff[ 7] * data[i- 8];
704                                 case 7:  sum += qlp_coeff[ 6] * data[i- 7];
705                                 case 6:  sum += qlp_coeff[ 5] * data[i- 6];
706                                 case 5:  sum += qlp_coeff[ 4] * data[i- 5];
707                                 case 4:  sum += qlp_coeff[ 3] * data[i- 4];
708                                 case 3:  sum += qlp_coeff[ 2] * data[i- 3];
709                                 case 2:  sum += qlp_coeff[ 1] * data[i- 2];
710                                 case 1:  sum += qlp_coeff[ 0] * data[i- 1];
711                         }
712                         residual[i] = data[i] - (sum >> lp_quantization);
713                 }
714         }
715         else { /* order > 12 */
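		/* No AVX2 path for orders above 12: every sample is computed in scalar code, using the same intentional fall-through pattern. */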
716                 for(i = 0; i < (int)data_len; i++) {
717                         sum = 0;
718                         switch(order) {
719                                 case 32: sum += qlp_coeff[31] * data[i-32];
720                                 case 31: sum += qlp_coeff[30] * data[i-31];
721                                 case 30: sum += qlp_coeff[29] * data[i-30];
722                                 case 29: sum += qlp_coeff[28] * data[i-29];
723                                 case 28: sum += qlp_coeff[27] * data[i-28];
724                                 case 27: sum += qlp_coeff[26] * data[i-27];
725                                 case 26: sum += qlp_coeff[25] * data[i-26];
726                                 case 25: sum += qlp_coeff[24] * data[i-25];
727                                 case 24: sum += qlp_coeff[23] * data[i-24];
728                                 case 23: sum += qlp_coeff[22] * data[i-23];
729                                 case 22: sum += qlp_coeff[21] * data[i-22];
730                                 case 21: sum += qlp_coeff[20] * data[i-21];
731                                 case 20: sum += qlp_coeff[19] * data[i-20];
732                                 case 19: sum += qlp_coeff[18] * data[i-19];
733                                 case 18: sum += qlp_coeff[17] * data[i-18];
734                                 case 17: sum += qlp_coeff[16] * data[i-17];
735                                 case 16: sum += qlp_coeff[15] * data[i-16];
736                                 case 15: sum += qlp_coeff[14] * data[i-15];
737                                 case 14: sum += qlp_coeff[13] * data[i-14];
738                                 case 13: sum += qlp_coeff[12] * data[i-13];
739                                          sum += qlp_coeff[11] * data[i-12];
740                                          sum += qlp_coeff[10] * data[i-11];
741                                          sum += qlp_coeff[ 9] * data[i-10];
742                                          sum += qlp_coeff[ 8] * data[i- 9];
743                                          sum += qlp_coeff[ 7] * data[i- 8];
744                                          sum += qlp_coeff[ 6] * data[i- 7];
745                                          sum += qlp_coeff[ 5] * data[i- 6];
746                                          sum += qlp_coeff[ 4] * data[i- 5];
747                                          sum += qlp_coeff[ 3] * data[i- 4];
748                                          sum += qlp_coeff[ 2] * data[i- 3];
749                                          sum += qlp_coeff[ 1] * data[i- 2];
750                                          sum += qlp_coeff[ 0] * data[i- 1];
751                         }
752                         residual[i] = data[i] - (sum >> lp_quantization);
753                 }
754         }
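	/* Clear the upper halves of the YMM registers to avoid AVX-to-SSE transition penalties in any SSE code that follows. */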
755         _mm256_zeroupper();
756 }
757
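/* Index vector for _mm256_permutevar8x32_epi32: dwords 0, 2, 4 and 6 (the low halves of the four 64-bit sums) are gathered into the lower 128 bits of the result. */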
758 static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
759
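/* Wide variant: same structure as above, but predictions are accumulated in 64-bit lanes for cases where a 32-bit sum could overflow (e.g. 24-bit input). */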
760 FLAC__SSE_TARGET("avx2")
761 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
762 {
763         int i;
764         FLAC__int64 sum;
765         __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
766         __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
767
768         FLAC__ASSERT(order > 0);
769         FLAC__ASSERT(order <= 32);
770 	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64(); this is safe because only the low 32 bits of each shifted 64-bit lane are kept below, and for shift counts up to 32 those bits are the same for logical and arithmetic shifts */
771
772         if(order <= 12) {
773                 if(order > 8) {
774                         if(order > 10) {
775                                 if(order == 12) {
776                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
777                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
778                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
779                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
780                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
781                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
782                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
783                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
784                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
785                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
786                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
787                                         q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
788                                         q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
789
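					/* Four residuals per iteration: each group of four samples is zero-extended to 64-bit lanes and multiplied as signed 32-bit values (_mm256_mul_epi32 reads only the low dword of each lane), with the sums kept in 64 bits. */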
790                                         for(i = 0; i < (int)data_len-3; i+=4) {
791                                                 __m256i summ, mull;
792                                                 summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
793                                                 mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
794                                                 mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
795                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
796                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
797                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
798                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
799                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
800                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
801                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
802                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
803                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
804                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
805                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
806                                         }
807                                 }
808                                 else { /* order == 11 */
809                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
810                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
811                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
812                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
813                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
814                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
815                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
816                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
817                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
818                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
819                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
820                                         q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
821
822                                         for(i = 0; i < (int)data_len-3; i+=4) {
823                                                 __m256i summ, mull;
824                                                 summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
825                                                 mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
826                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
827                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
828                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
829                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
830                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
831                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
832                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
833                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
834                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
835                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
836                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
837                                         }
838                                 }
839                         }
840                         else {
841                                 if(order == 10) {
842                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
843                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
844                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
845                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
846                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
847                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
848                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
849                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
850                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
851                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
852                                         q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
853
854                                         for(i = 0; i < (int)data_len-3; i+=4) {
855                                                 __m256i summ, mull;
856                                                 summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
857                                                 mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
858                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
859                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
860                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
861                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
862                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
863                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
864                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
865                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
866                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
867                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
868                                         }
869                                 }
870                                 else { /* order == 9 */
871                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
872                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
873                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
874                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
875                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
876                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
877                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
878                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
879                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
880                                         q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
881
882                                         for(i = 0; i < (int)data_len-3; i+=4) {
883                                                 __m256i summ, mull;
884                                                 summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
885                                                 mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
886                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
887                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
888                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
889                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
890                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
891                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
892                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
893                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
894                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
895                                         }
896                                 }
897                         }
898                 }
899                 else if(order > 4) {
900                         if(order > 6) {
901                                 if(order == 8) {
902                                         __m256i q0, q1, q2, q3, q4, q5, q6, q7;
903                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
904                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
905                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
906                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
907                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
908                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
909                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
910                                         q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
911
912                                         for(i = 0; i < (int)data_len-3; i+=4) {
913                                                 __m256i summ, mull;
914                                                 summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
915                                                 mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
916                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
917                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
918                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
919                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
920                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
921                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
922                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
923                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
924                                         }
925                                 }
926                                 else { /* order == 7 */
927                                         __m256i q0, q1, q2, q3, q4, q5, q6;
928                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
929                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
930                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
931                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
932                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
933                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
934                                         q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
935
936                                         for(i = 0; i < (int)data_len-3; i+=4) {
937                                                 __m256i summ, mull;
938                                                 summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
939                                                 mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
940                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
941                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
942                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
943                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
944                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
945                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
946                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
947                                         }
948                                 }
949                         }
950                         else {
951                                 if(order == 6) {
952                                         __m256i q0, q1, q2, q3, q4, q5;
953                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
954                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
955                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
956                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
957                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
958                                         q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
959
960                                         for(i = 0; i < (int)data_len-3; i+=4) {
961                                                 __m256i summ, mull;
962                                                 summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
963                                                 mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
964                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
965                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
966                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
967                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
968                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
969                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
970                                         }
971                                 }
972                                 else { /* order == 5 */
973                                         __m256i q0, q1, q2, q3, q4;
974                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
975                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
976                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
977                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
978                                         q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
979
980                                         for(i = 0; i < (int)data_len-3; i+=4) {
981                                                 __m256i summ, mull;
982                                                 summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
983                                                 mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
984                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
985                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
986                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
987                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
988                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
989                                         }
990                                 }
991                         }
992                 }
993                 else {
994                         if(order > 2) {
995                                 if(order == 4) {
996                                         __m256i q0, q1, q2, q3;
997                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
998                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
999                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1000                                         q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
1001
1002                                         for(i = 0; i < (int)data_len-3; i+=4) {
1003                                                 __m256i summ, mull;
1004                                                 summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
1005                                                 mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
1006                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1007                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1008                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1009                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1010                                         }
1011                                 }
1012                                 else { /* order == 3 */
1013                                         __m256i q0, q1, q2;
1014                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1015                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1016                                         q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1017
1018                                         for(i = 0; i < (int)data_len-3; i+=4) {
1019                                                 __m256i summ, mull;
1020                                                 summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
1021                                                 mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1022                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1023                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1024                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1025                                         }
1026                                 }
1027                         }
1028                         else {
1029                                 if(order == 2) {
1030                                         __m256i q0, q1;
1031                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1032                                         q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1033
1034                                         for(i = 0; i < (int)data_len-3; i+=4) {
1035                                                 __m256i summ, mull;
1036                                                 summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
1037                                                 mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1038                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1039                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1040                                         }
1041                                 }
1042                                 else { /* order == 1 */
1043                                         __m256i q0;
1044                                         q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1045
1046                                         for(i = 0; i < (int)data_len-3; i+=4) {
1047                                                 __m256i summ;
1048                                                 summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
1049                                                 summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1050                                                 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1051                                         }
1052                                 }
1053                         }
1054                 }
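		/* Scalar tail for the remaining samples; note the 64-bit accumulator. */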
1055                 for(; i < (int)data_len; i++) {
1056                         sum = 0;
1057                         switch(order) {
1058                                 case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1059                                 case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1060                                 case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1061                                 case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1062                                 case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1063                                 case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1064                                 case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1065                                 case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1066                                 case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1067                                 case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1068                                 case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1069                                 case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1070                         }
1071                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1072                 }
1073         }
1074         else { /* order > 12 */
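		/* Orders above 12 are computed entirely in scalar code, again with a 64-bit accumulator. */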
1075                 for(i = 0; i < (int)data_len; i++) {
1076                         sum = 0;
1077                         switch(order) {
1078                                 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
1079                                 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
1080                                 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
1081                                 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
1082                                 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
1083                                 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
1084                                 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
1085                                 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
1086                                 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
1087                                 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
1088                                 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
1089                                 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
1090                                 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
1091                                 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
1092                                 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
1093                                 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
1094                                 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
1095                                 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
1096                                 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
1097                                 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
1098                                          sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1099                                          sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1100                                          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1101                                          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1102                                          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1103                                          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1104                                          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1105                                          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1106                                          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1107                                          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1108                                          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1109                                          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1110                         }
1111                         residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1112                 }
1113         }
1114         _mm256_zeroupper();
1115 }
1116
1117 #endif /* FLAC__AVX2_SUPPORTED */
1118 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1119 #endif /* FLAC__NO_ASM */
1120 #endif /* FLAC__INTEGER_ONLY_LIBRARY */