lpc_intrin_sse2.c : Add RESIDUAL16_RESULT macro.
author: Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 30 Jan 2014 11:17:01 +0000 (22:17 +1100)
committer: Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 30 Jan 2014 11:17:08 +0000 (22:17 +1100)
RESIDUAL16_RESULT is analogous to the existing RESIDUAL_RESULT macro
and simplifies the code a little.

Patch-from: lvqcl <lvqcl.mail@gmail.com>

src/libFLAC/lpc_intrin_sse2.c

index 98d51bd..3ccebd0 100644 (file)
 
 #include <emmintrin.h> /* SSE2 */
 
+#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
+
+#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+
 FLAC__SSE_TARGET("sse2")
 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 {
@@ -59,7 +65,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                FLAC__int32 curr;
                if(order > 8) { /* order == 9, 10, 11, 12 */
 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
-                       /* can be modified to work with order <= 15 but the subset limit is 12 */
                        int r;
                        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
                        xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
@@ -103,8 +108,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                       curr = *data++;
-                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                       RESIDUAL16_RESULT(xmm6);
 
                        data_len--;
                        r = data_len % 2;
@@ -124,8 +128,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                data_len--;
                        }
@@ -146,8 +149,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
 
@@ -159,8 +161,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                data_len-=2;
                        }
@@ -218,8 +219,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                       curr = *data++;
-                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                       RESIDUAL16_RESULT(xmm6);
 
                        data_len--;
                        r = data_len % 4;
@@ -239,8 +239,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                data_len--; r--;
                        }
@@ -261,8 +260,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);
 
@@ -274,8 +272,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);
 
@@ -287,8 +284,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
 
@@ -300,8 +296,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                               curr = *data++;
-                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                               RESIDUAL16_RESULT(xmm6);
 
                                data_len-=4;
                        }
@@ -329,8 +324,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
 
@@ -343,8 +337,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--;
                                        }
@@ -373,8 +366,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
                                        r = data_len % 2;
@@ -388,8 +380,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--;
                                        }
@@ -403,8 +394,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
                                                xmm6 = xmm3;
@@ -412,8 +402,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len-=2;
                                        }
@@ -446,8 +435,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
                                        r = data_len % 3;
@@ -461,8 +449,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--; r--;
                                        }
@@ -476,8 +463,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);
 
@@ -486,8 +472,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
 
@@ -496,8 +481,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len-=3;
                                        }
@@ -530,8 +514,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
                                        r = data_len % 4;
@@ -545,8 +528,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--; r--;
                                        }
@@ -560,8 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 2);
 
@@ -570,8 +551,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 1);
 
@@ -580,8 +560,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
 
@@ -590,8 +569,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len-=4;
                                        }
@@ -617,8 +595,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
 
@@ -630,8 +607,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--;
                                        }
@@ -657,8 +633,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                        xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
                                        r = data_len % 2;
@@ -671,8 +646,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--;
                                        }
@@ -686,8 +660,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_madd_epi16(xmm6, xmm1);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                xmm3 = _mm_insert_epi16(xmm3, curr, 0);
 
@@ -695,8 +668,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
                                                xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len-=2;
                                        }
@@ -720,8 +692,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                        xmm6 = xmm3;
                                        xmm6 = _mm_madd_epi16(xmm6, xmm0);
 
-                                       curr = *data++;
-                                       *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                       RESIDUAL16_RESULT(xmm6);
 
                                        data_len--;
 
@@ -732,8 +703,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
                                                xmm6 = xmm3;
                                                xmm6 = _mm_madd_epi16(xmm6, xmm0);
 
-                                               curr = *data++;
-                                               *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+                                               RESIDUAL16_RESULT(xmm6);
 
                                                data_len--;
                                        }
@@ -787,8 +757,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
        }
 }
 
-#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
-
 FLAC__SSE_TARGET("sse2")
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 {