Add intrinsics version of two lpc functions.
author Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 10 Apr 2014 20:21:09 +0000 (06:21 +1000)
committer Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 10 Apr 2014 20:21:15 +0000 (06:21 +1000)
Functions:
- FLAC__fixed_compute_best_predictor
- FLAC__fixed_compute_best_predictor_wide

Patch-from: lvqcl <lvqcl.mail@gmail.com>

src/libFLAC/Makefile.am
src/libFLAC/Makefile.lite
src/libFLAC/include/private/fixed.h
src/libFLAC/libFLAC_dynamic.vcproj
src/libFLAC/libFLAC_static.vcproj
src/libFLAC/stream_encoder.c

index 258de40..b6e1f47 100644 (file)
@@ -121,6 +121,8 @@ libFLAC_sources = \
        cpu.c \
        crc.c \
        fixed.c \
+       fixed_intrin_sse2.c \
+       fixed_intrin_ssse3.c \
        float.c \
        format.c \
        lpc.c \
index 4533138..6127cf3 100644 (file)
@@ -85,6 +85,8 @@ SRCS_C = \
        cpu.c \
        crc.c \
        fixed.c \
+       fixed_intrin_sse2.c \
+       fixed_intrin_ssse3.c \
        float.c \
        format.c \
        lpc.c \
index f786f7d..d1b8a86 100644 (file)
@@ -37,6 +37,7 @@
 #include <config.h>
 #endif
 
+#include "private/cpu.h"
 #include "private/float.h"
 #include "FLAC/format.h"
 
  */
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 # ifndef FLAC__NO_ASM
-#  ifdef FLAC__CPU_IA32
-#   ifdef FLAC__HAS_NASM
-unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#   ifdef FLAC__SSE2_SUPPORTED
+unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+#   endif
+#   ifdef FLAC__SSSE3_SUPPORTED
+unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
 #   endif
 #  endif
+#  if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM
+unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+#  endif
 # endif
-unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #else
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
index 693a10c..f3bb464 100644 (file)
                                >\r
                        </File>\r
                        <File\r
+                               RelativePath=".\fixed_intrin_sse2.c"\r
+                               >\r
+                       </File>\r
+                       <File\r
+                               RelativePath=".\fixed_intrin_ssse3.c"\r
+                               >\r
+                       </File>\r
+                       <File\r
                                RelativePath=".\float.c"\r
                                >\r
                        </File>\r
index 4851755..fdc9f73 100644 (file)
                                >\r
                        </File>\r
                        <File\r
+                               RelativePath=".\fixed_intrin_sse2.c"\r
+                               >\r
+                       </File>\r
+                       <File\r
+                               RelativePath=".\fixed_intrin_ssse3.c"\r
+                               >\r
+                       </File>\r
+                       <File\r
                                RelativePath=".\float.c"\r
                                >\r
                        </File>\r
index 2b1765c..3fe0a74 100644 (file)
@@ -348,8 +348,10 @@ typedef struct FLAC__StreamEncoderPrivate {
        void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
        unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+       unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #else
        unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+       unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #endif
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
        void (*local_lpc_compute_autocorrelation)(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
@@ -879,6 +881,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #endif
        encoder->private_->local_precompute_partition_info_sums = precompute_partition_info_sums_;
        encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor;
+       encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide;
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients;
        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide;
@@ -937,6 +940,17 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
                        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
                }
+#     ifdef FLAC__SSSE3_SUPPORTED
+               if (encoder->private_->cpuinfo.ia32.ssse3) {
+                       encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_ssse3;
+                       encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3;
+               }
+               else
+#     endif
+               if (encoder->private_->cpuinfo.ia32.sse2) {
+                       encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
+                       encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
+               }
 #    endif
 #    ifdef FLAC__SSE4_1_SUPPORTED
                if(encoder->private_->cpuinfo.ia32.sse41)
@@ -959,6 +973,15 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #    ifdef FLAC__SSE2_SUPPORTED
                /* encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; // OPT: not faster than C; TODO: more tests on different CPUs */
                encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
+               /* Install the SSE2 pair first, then upgrade BOTH pointers together if SSSE3 is detected: an unbraced `else` before `#endif` would guard only the first SSE2 assignment and leave the wide pointer always reset to SSE2. */
+               encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
+               encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
+#     ifdef FLAC__SSSE3_SUPPORTED
+               if (encoder->private_->cpuinfo.x86_64.ssse3) {
+                       encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_ssse3;
+                       encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3;
+               }
+#     endif
 #    endif
 #   endif /* FLAC__HAS_X86INTRIN */
 #  endif /* FLAC__CPU_... */
@@ -991,7 +1014,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */
        /* finally override based on wide-ness if necessary */
        if(encoder->private_->use_wide_by_block) {
-               encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_wide;
+               encoder->private_->local_fixed_compute_best_predictor = encoder->private_->local_fixed_compute_best_predictor_wide;
        }
 
        /* set state to OK; from here on, errors are fatal and we'll override the state then */