Don't use intrinsics when they are slower.
author: Erik de Castro Lopo <erikd@mega-nerd.com>
Mon, 24 Feb 2014 10:45:32 +0000 (21:45 +1100)
committer: Erik de Castro Lopo <erikd@mega-nerd.com>
Mon, 24 Feb 2014 10:46:05 +0000 (21:46 +1100)
More thorough encoding/decoding tests show that the functions
that use intrinsics are sometimes slower (or not really faster)
than the old plain C functions.

After this patch the encoder and decoder no longer use these new
functions where their usefulness is questionable.

Patch-from: lvqcl <lvqcl.mail@gmail.com>

src/libFLAC/lpc_intrin_sse2.c
src/libFLAC/stream_decoder.c
src/libFLAC/stream_encoder.c

index 2902374..ad9da79 100644 (file)
@@ -1289,6 +1289,10 @@ void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsig
 {
        int i;
        FLAC__int32 sum;
+       if (order < 8) {
+               FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
+               return;
+       }
 
        FLAC__ASSERT(order > 0);
        FLAC__ASSERT(order <= 32);
index cf06398..cd41b5e 100644 (file)
@@ -417,24 +417,17 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
                }
 #endif
 #ifdef FLAC__HAS_X86INTRIN
-# if defined FLAC__SSE2_SUPPORTED && !defined FLAC__HAS_NASM /* not faster than asm MMX code */
+# if defined FLAC__SSE2_SUPPORTED && !defined FLAC__HAS_NASM /* OPT: not faster than ASM/MMX code */
                if(decoder->private_->cpuinfo.ia32.sse2) {
                        decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse2;
                        decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_16_intrin_sse2;
                }
 # endif
-# if defined FLAC__SSE4_1_SUPPORTED && 1 /* faster than asm */
+# if defined FLAC__SSE4_1_SUPPORTED && 1 /* OPT: faster than asm; TODO: more tests */
                if(decoder->private_->cpuinfo.ia32.sse41)
                        decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
 # endif
 #endif
-#elif defined FLAC__CPU_X86_64
-#ifdef FLAC__HAS_X86INTRIN
-# if defined FLAC__SSE2_SUPPORTED
-               decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse2;
-               decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_16_intrin_sse2;
-# endif
-#endif
 #elif defined FLAC__CPU_PPC
                FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
                if(decoder->private_->cpuinfo.ppc.altivec) {
index 343da4d..d6b1084 100644 (file)
@@ -957,7 +957,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
 #    endif
 #    ifdef FLAC__SSE2_SUPPORTED
-               encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
+               /* encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; // OPT: not faster than C; TODO: more tests on different CPUs */
                encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
 #    endif
 #   endif /* FLAC__HAS_X86INTRIN */