Adding AVX config switches
authorRadu Velea <radu.velea@intel.com>
Tue, 27 Oct 2015 10:21:36 +0000 (12:21 +0200)
committerTimothy B. Terriberry <tterribe@xiph.org>
Thu, 5 Nov 2015 09:23:05 +0000 (01:23 -0800)
Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
celt/cpu_support.h
celt/x86/x86_celt_map.c
celt/x86/x86cpu.c
configure.ac
silk/x86/x86_silk_map.c

index db1cb58..133abbf 100644 (file)
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))
 
 #include "x86/x86cpu.h"
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))
 
 #include "x86/x86cpu.h"
-/* We currently support 4 x86 variants:
+/* We currently support 5 x86 variants:
  * arch[0] -> non-sse
  * arch[1] -> sse
  * arch[2] -> sse2
  * arch[3] -> sse4.1
  * arch[0] -> non-sse
  * arch[1] -> sse
  * arch[2] -> sse2
  * arch[3] -> sse4.1
+ * arch[4] -> avx
  */
  */
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
 int opus_select_arch(void);
 
 #else
 int opus_select_arch(void);
 
 #else
index 1ed2acb..8e5e449 100644 (file)
@@ -53,6 +53,7 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
   celt_fir_c,
   celt_fir_c,
   MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
   celt_fir_c,
   celt_fir_c,
   MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_fir)  /* avx  */
 };
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 };
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
@@ -65,6 +66,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
   xcorr_kernel_c,
   xcorr_kernel_c,
   MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
   xcorr_kernel_c,
   xcorr_kernel_c,
   MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
+  MAY_HAVE_SSE4_1(xcorr_kernel)  /* avx  */
 };
 
 #endif
 };
 
 #endif
@@ -81,6 +83,7 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
   celt_inner_prod_c,
   MAY_HAVE_SSE2(celt_inner_prod),
   MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
   celt_inner_prod_c,
   MAY_HAVE_SSE2(celt_inner_prod),
   MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_inner_prod)  /* avx  */
 };
 
 #endif
 };
 
 #endif
@@ -99,6 +102,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
   MAY_HAVE_SSE(xcorr_kernel),
   MAY_HAVE_SSE(xcorr_kernel),
   MAY_HAVE_SSE(xcorr_kernel),
   MAY_HAVE_SSE(xcorr_kernel),
   MAY_HAVE_SSE(xcorr_kernel),
   MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel)
 };
 
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 };
 
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
@@ -110,6 +114,7 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
   MAY_HAVE_SSE(celt_inner_prod),
   MAY_HAVE_SSE(celt_inner_prod),
   MAY_HAVE_SSE(celt_inner_prod),
   MAY_HAVE_SSE(celt_inner_prod),
   MAY_HAVE_SSE(celt_inner_prod),
   MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod)
 };
 
 void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 };
 
 void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
@@ -124,6 +129,7 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
   MAY_HAVE_SSE(dual_inner_prod),
   MAY_HAVE_SSE(dual_inner_prod),
   MAY_HAVE_SSE(dual_inner_prod),
   MAY_HAVE_SSE(dual_inner_prod),
   MAY_HAVE_SSE(dual_inner_prod),
   MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod)
 };
 
 void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
 };
 
 void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
@@ -139,6 +145,7 @@ void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
   MAY_HAVE_SSE(comb_filter_const),
   MAY_HAVE_SSE(comb_filter_const),
   MAY_HAVE_SSE(comb_filter_const),
   MAY_HAVE_SSE(comb_filter_const),
   MAY_HAVE_SSE(comb_filter_const),
   MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const)
 };
 
 
 };
 
 
index f850715..1a73dd1 100644 (file)
@@ -91,6 +91,8 @@ typedef struct CPU_Feature{
     int HW_SSE;
     int HW_SSE2;
     int HW_SSE41;
     int HW_SSE;
     int HW_SSE2;
     int HW_SSE41;
+    /*  SIMD: 256-bit */
+    int HW_AVX;
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -106,11 +108,13 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
+        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
+        cpu_feature->HW_AVX = 0;
     }
 }
 
     }
 }
 
@@ -140,6 +144,12 @@ int opus_select_arch(void)
     }
     arch++;
 
     }
     arch++;
 
+    if (!cpu_feature.HW_AVX)
+    {
+        return arch;
+    }
+    arch++;
+
     return arch;
 }
 
     return arch;
 }
 
index bb838c0..74aa2f4 100644 (file)
@@ -351,10 +351,12 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
 AM_CONDITIONAL([HAVE_SSE], [false])
 AM_CONDITIONAL([HAVE_SSE2], [false])
 AM_CONDITIONAL([HAVE_SSE4_1], [false])
 AM_CONDITIONAL([HAVE_SSE], [false])
 AM_CONDITIONAL([HAVE_SSE2], [false])
 AM_CONDITIONAL([HAVE_SSE4_1], [false])
+AM_CONDITIONAL([HAVE_AVX], [false])
 
 m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse])
 m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2])
 m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1])
 
 m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse])
 m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2])
 m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1])
+m4_define([DEFAULT_X86_AVX_CFLAGS], [-mavx])
 m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon])
 # With GCC on ARM32 softfp architectures (e.g. Android, or older Ubuntu) you need to specify
 # -mfloat-abi=softfp for -mfpu=neon to work.  However, on ARM32 hardfp architectures (e.g. newer Ubuntu),
 m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon])
 # With GCC on ARM32 softfp architectures (e.g. Android, or older Ubuntu) you need to specify
 # -mfloat-abi=softfp for -mfpu=neon to work.  However, on ARM32 hardfp architectures (e.g. newer Ubuntu),
@@ -371,11 +373,13 @@ AS_CASE([$host],
 AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@])
 AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@])
 AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@])
 AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@])
 AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@])
 AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@])
+AC_ARG_VAR([X86_AVX_CFLAGS], [C compiler flags to compile AVX intrinsics @<:@default=]DEFAULT_X86_AVX_CFLAGS[@:>@])
 AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS / DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS[@:>@])
 
 AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], "DEFAULT_X86_SSE_CFLAGS")])
 AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], "DEFAULT_X86_SSE2_CFLAGS")])
 AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], "DEFAULT_X86_SSE4_1_CFLAGS")])
 AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS / DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS[@:>@])
 
 AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], "DEFAULT_X86_SSE_CFLAGS")])
 AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], "DEFAULT_X86_SSE2_CFLAGS")])
 AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], "DEFAULT_X86_SSE4_1_CFLAGS")])
+AS_VAR_SET_IF([X86_AVX_CFLAGS], [], [AS_VAR_SET([X86_AVX_CFLAGS], "DEFAULT_X86_AVX_CFLAGS")])
 AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], ["$RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS"])])
 
 AC_DEFUN([OPUS_PATH_NE10],
 AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], ["$RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS"])])
 
 AC_DEFUN([OPUS_PATH_NE10],
@@ -566,7 +570,24 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
              AC_SUBST([OPUS_X86_SSE4_1_CFLAGS])
           ]
       )
              AC_SUBST([OPUS_X86_SSE4_1_CFLAGS])
           ]
       )
-
+      OPUS_CHECK_INTRINSICS(
+         [AVX],
+         [$X86_AVX_CFLAGS],
+         [OPUS_X86_MAY_HAVE_AVX],
+         [OPUS_X86_PRESUME_AVX],
+         [[#include <immintrin.h>
+         ]],
+         [[
+            static __m256 mtest;
+            mtest = _mm256_setzero_ps();
+         ]]
+      )
+      AS_IF([test x"$OPUS_X86_MAY_HAVE_AVX" = x"1" && test x"$OPUS_X86_PRESUME_AVX" != x"1"],
+          [
+             OPUS_X86_AVX_CFLAGS="$X86_AVX_CFLAGS"
+             AC_SUBST([OPUS_X86_AVX_CFLAGS])
+          ]
+      )
          AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""])
          AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"],
          [
          AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""])
          AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"],
          [
@@ -606,6 +627,19 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
          [
             AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics])
          ])
          [
             AC_MSG_WARN([Compiler does not support SSE4.1 intrinsics])
          ])
+         AS_IF([test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"],
+         [
+            AC_DEFINE([OPUS_X86_MAY_HAVE_AVX], 1, [Compiler supports X86 AVX Intrinsics])
+            intrinsics_support="$intrinsics_support AVX"
+
+            AS_IF([test x"$OPUS_X86_PRESUME_AVX" = x"1"],
+               [AC_DEFINE([OPUS_X86_PRESUME_AVX], 1, [Define if binary requires AVX intrinsics support])],
+               [rtcd_support="$rtcd_support AVX"])
+         ],
+         [
+            AC_MSG_WARN([Compiler does not support AVX intrinsics])
+         ])
+
          AS_IF([test x"$intrinsics_support" = x""],
             [intrinsics_support=no],
             [intrinsics_support="x86$intrinsics_support"]
          AS_IF([test x"$intrinsics_support" = x""],
             [intrinsics_support=no],
             [intrinsics_support="x86$intrinsics_support"]
@@ -672,6 +706,8 @@ AM_CONDITIONAL([HAVE_SSE2],
     [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"])
 AM_CONDITIONAL([HAVE_SSE4_1],
     [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"])
     [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"])
 AM_CONDITIONAL([HAVE_SSE4_1],
     [test x"$OPUS_X86_MAY_HAVE_SSE4_1" = x"1"])
+AM_CONDITIONAL([HAVE_AVX],
+    [test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"])
 
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
 
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
index 6e79675..818841f 100644 (file)
@@ -50,6 +50,7 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_inner_prod16_aligned_64_c,
   silk_inner_prod16_aligned_64_c,
   MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
   silk_inner_prod16_aligned_64_c,
   silk_inner_prod16_aligned_64_c,
   MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */
 };
 
 #endif
 };
 
 #endif
@@ -62,6 +63,7 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_VAD_GetSA_Q8_c,
   silk_VAD_GetSA_Q8_c,
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
   silk_VAD_GetSA_Q8_c,
   silk_VAD_GetSA_Q8_c,
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */
 };
 
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -85,6 +87,7 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_NSQ_c,
   silk_NSQ_c,
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
   silk_NSQ_c,
   silk_NSQ_c,
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */
 };
 
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -104,6 +107,7 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_VQ_WMat_EC_c,
   silk_VQ_WMat_EC_c,
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
   silk_VQ_WMat_EC_c,
   silk_VQ_WMat_EC_c,
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */
 };
 
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -127,6 +131,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_NSQ_del_dec_c,
   silk_NSQ_del_dec_c,
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
   silk_NSQ_del_dec_c,
   silk_NSQ_del_dec_c,
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
 };
 
 #if defined(FIXED_POINT)
 };
 
 #if defined(FIXED_POINT)
@@ -144,6 +149,7 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_warped_LPC_analysis_filter_FIX_c,
   silk_warped_LPC_analysis_filter_FIX_c,
   MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
   silk_warped_LPC_analysis_filter_FIX_c,
   silk_warped_LPC_analysis_filter_FIX_c,
   MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX )  /* avx */
 };
 
 void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };
 
 void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -161,6 +167,7 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
   silk_burg_modified_c,
   silk_burg_modified_c,
   MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
   silk_burg_modified_c,
   silk_burg_modified_c,
   MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_burg_modified )  /* avx */
 };
 
 #endif
 };
 
 #endif