Replace hadd with shuffle + add [master]
author: lvqcl <lvqcl.mail@gmail.com>
Wed, 19 Sep 2018 17:03:37 +0000 (20:03 +0300)
committer: Erik de Castro Lopo <erikd@mega-nerd.com>
Wed, 19 Sep 2018 21:20:13 +0000 (07:20 +1000)
src/libFLAC/lpc_intrin_sse41.c
src/libFLAC/stream_encoder_intrin_avx2.c
src/libFLAC/stream_encoder_intrin_sse2.c
src/libFLAC/stream_encoder_intrin_ssse3.c

index 96dd20d..4ef3d3e 100644 (file)
@@ -980,8 +980,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
                                summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
                                summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));
 
-                               summ = _mm_hadd_epi32(summ, summ);
-                               summ = _mm_hadd_epi32(summ, summ);
+                               summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+                               summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
 
                                summ = _mm_sra_epi32(summ, cnt);
                                temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1009,8 +1009,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
                        for (i = 0;;) {
                                summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
 
-                               summ = _mm_hadd_epi32(summ, summ);
-                               summ = _mm_hadd_epi32(summ, summ);
+                               summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+                               summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
 
                                summ = _mm_sra_epi32(summ, cnt);
                                temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1079,8 +1079,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
                                summ = _mm_madd_epi16(dat[1], qlp[1]);
                                summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));
 
-                               summ = _mm_hadd_epi32(summ, summ);
-                               summ = _mm_hadd_epi32(summ, summ);
+                               summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+                               summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
 
                                summ = _mm_sra_epi32(summ, cnt);
                                temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
@@ -1109,8 +1109,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
                        for(i = 0;;) {
                                summ = _mm_madd_epi16(dat0, qlp0);
 
-                               summ = _mm_hadd_epi32(summ, summ);
-                               summ = _mm_hadd_epi32(summ, summ);
+                               summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
+                               summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
 
                                summ = _mm_sra_epi32(summ, cnt);
                                temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
index 265e6fe..94bde0e 100644 (file)
@@ -83,8 +83,8 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
                                        sum128 = _mm_add_epi32(sum128, res128);
                                }
 
-                               sum128 = _mm_hadd_epi32(sum128, sum128);
-                               sum128 = _mm_hadd_epi32(sum128, sum128);
+                               sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2)));
+                               sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2)));
                                abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
 /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
 #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
index ed94ec3..44ee4d3 100644 (file)
@@ -97,8 +97,8 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
                                        mm_sum = _mm_add_epi32(mm_sum, mm_res);
                                }
 
-                               mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8));
-                               mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4));
+                               mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
+                               mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
                                abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
 /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
 #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
index b5996f7..d384dc0 100644 (file)
@@ -86,8 +86,8 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
                                        mm_sum = _mm_add_epi32(mm_sum, mm_res);
                                }
 
-                               mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
-                               mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
+                               mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
+                               mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
                                abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
 /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
 #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)