Add FLAC__SSE_SUPPORTED and FLAC__SSE2_SUPPORTED flags.
author Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 30 Jan 2014 10:49:51 +0000 (21:49 +1100)
committer Erik de Castro Lopo <erikd@mega-nerd.com>
Thu, 30 Jan 2014 10:49:55 +0000 (21:49 +1100)
* Allow compiling using GCC w/o SSE support.
* Allow SSE4.1 intrinsic functions to be enabled.

Patch-from: lvqcl <lvqcl.mail@gmail.com>

include/share/compat.h
src/libFLAC/include/private/cpu.h
src/libFLAC/include/private/lpc.h
src/libFLAC/include/private/stream_encoder.h
src/libFLAC/lpc_intrin_sse.c
src/libFLAC/lpc_intrin_sse2.c
src/libFLAC/lpc_intrin_sse41.c
src/libFLAC/stream_decoder.c
src/libFLAC/stream_encoder.c
src/libFLAC/stream_encoder_intrin_sse2.c
src/libFLAC/stream_encoder_intrin_ssse3.c

index a17cc63..6f04f44 100644 (file)
@@ -199,12 +199,4 @@ int flac_snprintf(char *str, size_t size, const char *fmt, ...);
 };
 #endif
 
-/* SSSE3, SSE4 support: MSVS 2008, GCC 4.3 -- currently disabled, Intel Compiler 10.0 */
-#if    ( defined _MSC_VER && _MSC_VER >= 1500 ) \
-    || ( 0 && defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) ) \
-    || ( defined __INTEL_COMPILER && __INTEL_COMPILER >= 1000 )
-#define FLAC__SSSE3_SUPPORTED 1
-#define FLAC__SSE4_SUPPORTED 1
-#endif
-
 #endif /* FLAC__SHARE__COMPAT_H */
index 1bd8bca..07d7fb0 100644 (file)
 #include <config.h>
 #endif
 
+/* SSE intrinsics support by ICC/MSVC/GCC */
+#if defined __INTEL_COMPILER
+  #define FLAC__SSE_TARGET(x)
+  #define FLAC__SSE_SUPPORTED 1
+  #define FLAC__SSE2_SUPPORTED 1
+  #if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+  #endif
+#elif defined _MSC_VER
+  #define FLAC__SSE_TARGET(x)
+  #define FLAC__SSE_SUPPORTED 1
+  #define FLAC__SSE2_SUPPORTED 1
+  #if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+  #endif
+#elif defined __GNUC__
+  #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* since GCC 4.9 -msse.. compiler options aren't necessary */
+    #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
+    #define FLAC__SSE_SUPPORTED 1
+    #define FLAC__SSE2_SUPPORTED 1
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+  #else /* for GCC older than 4.9 */
+    #define FLAC__SSE_TARGET(x)
+    #ifdef __SSE__
+      #define FLAC__SSE_SUPPORTED 1
+    #endif
+    #ifdef __SSE2__
+      #define FLAC__SSE2_SUPPORTED 1
+    #endif
+    #ifdef __SSSE3__
+      #define FLAC__SSSE3_SUPPORTED 1
+    #endif
+    #ifdef __SSE4_1__
+      #define FLAC__SSE4_1_SUPPORTED 1
+    #endif
+  #endif /* GCC version */
+#endif /* compiler version */
+
 typedef enum {
        FLAC__CPUINFO_TYPE_IA32,
        FLAC__CPUINFO_TYPE_X86_64,
index 27760b4..d0872c3 100644 (file)
@@ -37,6 +37,7 @@
 #include <config.h>
 #endif
 
+#include "private/cpu.h"
 #include "private/float.h"
 #include "FLAC/format.h"
 
@@ -80,10 +81,12 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], u
 #    endif
 #  endif
 #  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE_SUPPORTED
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+#    endif
 #  endif
 #endif
 
@@ -156,9 +159,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__
 #    endif
 #  endif
 #  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE2_SUPPORTED
 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
-#    ifdef FLAC__SSE4_SUPPORTED
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 #    endif
 #  endif
@@ -195,9 +200,9 @@ void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], u
 void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #  endif /* FLAC__CPU_IA32 || FLAC__CPU_PPC */
 #  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
-#     ifdef FLAC__SSE4_SUPPORTED
+#    ifdef FLAC__SSE4_1_SUPPORTED
 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
-#     endif
+#    endif
 #  endif
 #endif /* FLAC__NO_ASM */
 
index ee7d978..d26039a 100644 (file)
 #endif
 
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
-#include "share/compat.h"
+#include "private/cpu.h"
 #include "FLAC/format.h"
 
+#ifdef FLAC__SSE2_SUPPORTED
 extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
                        unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
+#endif
 
 #ifdef FLAC__SSSE3_SUPPORTED
 extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
index e8f9f57..23299cc 100644 (file)
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE_SUPPORTED
 
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
-#include "private/lpc.h"
 
 #include <xmmintrin.h> /* SSE */
 
+FLAC__SSE_TARGET("sse")
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 {
        __m128 xmm0, xmm2, xmm5;
@@ -80,6 +82,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[],
        _mm_storeu_ps(autoc, xmm5);
 }
 
+FLAC__SSE_TARGET("sse")
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 {
        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
@@ -125,6 +128,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[],
        _mm_storeu_ps(autoc+4, xmm6);
 }
 
+FLAC__SSE_TARGET("sse")
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 {
        __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@@ -178,6 +182,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[]
        _mm_storeu_ps(autoc+8, xmm7);
 }
 
+FLAC__SSE_TARGET("sse")
 void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
 {
        __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
@@ -241,6 +246,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[]
        _mm_storeu_ps(autoc+12,xmm9);
 }
 
+#endif /* FLAC__SSE_SUPPORTED */
 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
 #endif /* FLAC__NO_ASM */
 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
index 9311151..98d51bd 100644 (file)
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE2_SUPPORTED
 
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
-#include "private/lpc.h"
 
 #include <emmintrin.h> /* SSE2 */
 
+FLAC__SSE_TARGET("sse2")
 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 {
        int i;
@@ -787,6 +789,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
 
 #define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
 
+FLAC__SSE_TARGET("sse2")
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 {
        int i;
@@ -1313,6 +1316,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
        }
 }
 
+#endif /* FLAC__SSE2_SUPPORTED */
 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
 #endif /* FLAC__NO_ASM */
 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
index ea8eb37..97ee9ea 100644 (file)
 #  include <config.h>
 #endif
 
-#include "share/compat.h"
-
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
-#ifdef FLAC__SSE4_SUPPORTED
+#include "private/lpc.h"
+#ifdef FLAC__SSE4_1_SUPPORTED
 
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
-#include "private/lpc.h"
 
 #include <smmintrin.h> /* SSE4.1 */
 
@@ -68,6 +66,7 @@
 #define     DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
 #endif
 
+FLAC__SSE_TARGET("sse4.1")
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 {
        int i;
@@ -594,6 +593,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
        }
 }
 
+FLAC__SSE_TARGET("sse4.1")
 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
 {
        int i;
@@ -1120,7 +1120,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], un
        }
 }
 
-#endif /* FLAC__SSE4_SUPPORTED */
+#endif /* FLAC__SSE4_1_SUPPORTED */
 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
 #endif /* FLAC__NO_ASM */
 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
index d8cd714..633dcdc 100644 (file)
@@ -417,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
                }
 #endif
 #ifdef FLAC__HAS_X86INTRIN
-# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
+# if defined FLAC__SSE4_1_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
                if(decoder->private_->cpuinfo.ia32.sse41)
                        decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
 # endif
index cbf2815..1cd123f 100644 (file)
@@ -920,11 +920,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov;
 #   endif /* FLAC__HAS_NASM */
 #   ifdef FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE2_SUPPORTED
                if(encoder->private_->cpuinfo.ia32.sse2) {
                        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
                        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
                }
-#    ifdef FLAC__SSE4_SUPPORTED
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
                if(encoder->private_->cpuinfo.ia32.sse41)
                        encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41;
 #    endif
@@ -932,6 +934,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #  elif defined FLAC__CPU_X86_64
                FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
 #   ifdef FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE_SUPPORTED
                if(encoder->protected_->max_lpc_order < 4)
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
                else if(encoder->protected_->max_lpc_order < 8)
@@ -940,9 +943,11 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
                else if(encoder->protected_->max_lpc_order < 16)
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
-
+#    endif
+#    ifdef FLAC__SSE2_SUPPORTED
                encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
                encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
+#    endif
 #   endif /* FLAC__HAS_X86INTRIN */
 #  endif /* FLAC__CPU_... */
        }
@@ -956,15 +961,19 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
                else
 #  endif
+#  ifdef FLAC__SSE2_SUPPORTED
                if(encoder->private_->cpuinfo.ia32.sse2)
                        encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
+#  endif
 # elif defined FLAC__CPU_X86_64
 #  ifdef FLAC__SSSE3_SUPPORTED
                if(encoder->private_->cpuinfo.x86_64.ssse3)
                        encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
                else
 #  endif
+#  ifdef FLAC__SSE2_SUPPORTED
                        encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
+#  endif
 # endif /* FLAC__CPU_... */
        }
 #endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */
index ac2e69f..9852175 100644 (file)
 
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/stream_encoder.h"
+#ifdef FLAC__SSE2_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
 #include <emmintrin.h> /* SSE2 */
 #include "FLAC/assert.h"
-#include "private/stream_encoder.h"
 
+FLAC__SSE_TARGET("sse2")
 void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
                unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
 {
@@ -157,5 +159,6 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
        }
 }
 
+#endif /* FLAC__SSE2_SUPPORTED */
 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
 #endif /* FLAC__NO_ASM */
index 35f3b23..7294c55 100644 (file)
 #  include <config.h>
 #endif
 
-#include "share/compat.h"
-
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/stream_encoder.h"
 #ifdef FLAC__SSSE3_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
 #include <tmmintrin.h> /* SSSE3 */
 #include "FLAC/assert.h"
-#include "private/stream_encoder.h"
 
+FLAC__SSE_TARGET("ssse3")
 void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
                unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
 {