src/libFLAC/lpc_intrin_avx2.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2014  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

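/*
 * 16-bit variant: assumes both the samples and the quantized coefficients
 * fit in 16 bits, so one _mm256_madd_epi16 multiplies eight samples by a
 * coefficient at a time (see the note above the inner loop).  Per sample,
 * the scalar equivalent is:
 *
 *   residual[i] = data[i] - ((qlp_coeff[0]*data[i-1] + ...
 *                             + qlp_coeff[order-1]*data[i-order]) >> lp_quantization)
 *
 * Eight residuals are produced per loop iteration; the scalar tail loop
 * below handles the remaining data_len % 8 samples.
 */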
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
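	/* _mm256_sra_epi32 takes its per-vector shift count from an XMM register */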
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);

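					/* Each lane of qN holds the coefficient in its low 16 bits and zero
					 * in its high 16 bits, so _mm256_madd_epi16 against eight 32-bit
					 * samples yields sample*coeff (the high halves contribute zero). */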
					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
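		/* Scalar tail: compute the remaining data_len % 8 residuals.  The
		 * switch cases intentionally fall through so that exactly `order`
		 * taps are accumulated. */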
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
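	/* Orders 13..32 are not vectorized here; the generic scalar code is
	 * used for the whole block. */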
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
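	/* Clear the upper YMM state to avoid AVX/SSE transition penalties. */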
	_mm256_zeroupper();
}

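/*
 * General variant: coefficients and samples are used at full 32-bit
 * precision, so each tap costs one _mm256_mullo_epi32 instead of a packed
 * 16-bit multiply-add.  The order dispatch mirrors the 16-bit routine above.
 */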
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);
					q11 = _mm256_set1_epi32(qlp_coeff[11]);

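					/* Full 32-bit multiplies: eight sample*coeff products per call. */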
					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
666                                                 summ = _mm256_sra_epi32(summ, cnt);
667                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
668                                         }
669                                 }
670                         }
671                         else {
672                                 if(order == 2) {
673                                         __m256i q0, q1;
674                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
675                                         q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
676
677                                         for(i = 0; i < (int)data_len-7; i+=8) {
678                                                 __m256i summ, mull;
679                                                 summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
680                                                 mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
681                                                 summ = _mm256_sra_epi32(summ, cnt);
682                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
683                                         }
684                                 }
685                                 else { /* order == 1 */
686                                         __m256i q0;
687                                         q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
688
689                                         for(i = 0; i < (int)data_len-7; i+=8) {
690                                                 __m256i summ;
691                                                 summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
692                                                 summ = _mm256_sra_epi32(summ, cnt);
693                                                 _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
694                                         }
695                                 }
696                         }
697                 }
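		/* compute any samples left over from the vector loop with scalar code;
		   the switch cases intentionally fall through, adding one predictor tap
		   per order */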
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}
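
/* Both functions in this file compute
       residual[i] = data[i] - (sum >> lp_quantization)
   where sum is the LPC prediction sum(qlp_coeff[j] * data[i-j-1], j = 0..order-1),
   matching the plain C implementations in lpc.c; they differ from each other
   only in accumulator width (32-bit above, 64-bit below). */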

static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };

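/*
 * Wide variant: the predictor sum is accumulated in 64 bits for signals
 * where it may not fit in 32 bits, so four samples are processed per
 * iteration instead of eight.  pack_arr above is the index vector for
 * _mm256_permutevar8x32_epi32(); it gathers the low 32-bit half of each
 * 64-bit lane into the lower 128 bits of the result.
 */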
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
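	/* The logical shift is safe here: for shift counts <= 32 the low 32 bits
	   of _mm256_srl_epi64() and of an arithmetic 64-bit shift are identical,
	   and only those low 32 bits survive the pack below.  Likewise the
	   zero-extending _mm256_cvtepu32_epi64() is harmless for signed data,
	   because _mm256_mul_epi32() reads only the low 32 bits of each lane
	   as a signed value. */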

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
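						/* shift the four 64-bit sums, pack their low halves into
						   the lower 128 bits, and store four 32-bit residuals */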
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
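		/* scalar tail loop, as in the 16-bit version above, but the products
		   are accumulated in a 64-bit sum */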
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */