Optimize silk_warped_autocorrelation_FIX() for ARM NEON
[opus.git] / silk / SigProc_FIX.h
index 4be0985..e0c3967 100644 (file)
@@ -35,13 +35,21 @@ extern "C"
 
 /*#define silk_MACRO_COUNT */          /* Used to enable WMOPS counting */
 
-#define SILK_MAX_ORDER_LPC            16            /* max order of the LPC analysis in schur() and k2a() */
+#define SILK_MAX_ORDER_LPC            24            /* max order of the LPC analysis in schur() and k2a() */
 
 #include <string.h>                                 /* for memset(), memcpy(), memmove() */
 #include "typedef.h"
 #include "resampler_structs.h"
 #include "macros.h"
+#include "cpu_support.h"
 
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/SigProc_FIX_sse.h"
+#endif
+
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_inv_pred_gain_arm.h"
+#endif
 
 /********************************************************************/
 /*                    SIGNAL PROCESSING FUNCTIONS                   */
@@ -108,7 +116,8 @@ void silk_LPC_analysis_filter(
     const opus_int16            *in,                /* I    Input signal                                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
     const opus_int32            len,                /* I    Signal length                                               */
-    const opus_int32            d                   /* I    Filter order                                                */
+    const opus_int32            d,                  /* I    Filter order                                                */
+    int                         arch                /* I    Run-time architecture                                       */
 );
 
 /* Chirp (bandwidth expand) LP AR filter */
@@ -127,17 +136,11 @@ void silk_bwexpander_32(
 
 /* Compute inverse of LPC prediction gain, and                           */
 /* test if LPC coefficients are stable (all poles within unit circle)    */
-opus_int32 silk_LPC_inverse_pred_gain(              /* O   Returns inverse prediction gain in energy domain, Q30        */
+opus_int32 silk_LPC_inverse_pred_gain_c(            /* O   Returns inverse prediction gain in energy domain, Q30        */
     const opus_int16            *A_Q12,             /* I   Prediction coefficients, Q12 [order]                         */
     const opus_int              order               /* I   Prediction order                                             */
 );
 
-/* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24(          /* O    Returns inverse prediction gain in energy domain, Q30       */
-    const opus_int32            *A_Q24,             /* I    Prediction coefficients [order]                             */
-    const opus_int              order               /* I    Prediction order                                            */
-);
-
 /* Split signal in two decimated bands using first-order allpass filters */
 void silk_ana_filt_bank_1(
     const opus_int16            *in,                /* I    Input signal [N]                                            */
@@ -147,6 +150,10 @@ void silk_ana_filt_bank_1(
     const opus_int32            N                   /* I    Number of input samples                                     */
 );
 
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+#define silk_LPC_inverse_pred_gain(A_Q12, order, arch)     ((void)(arch), silk_LPC_inverse_pred_gain_c(A_Q12, order))
+#endif /* !defined(OVERRIDE_silk_LPC_inverse_pred_gain) */
+
 /********************************************************************/
 /*                        SCALAR FUNCTIONS                          */
 /********************************************************************/
@@ -266,7 +273,17 @@ void silk_A2NLSF(
 void silk_NLSF2A(
     opus_int16                  *a_Q12,             /* O    monic whitening filter coefficients in Q12,  [ d ]          */
     const opus_int16            *NLSF,              /* I    normalized line spectral frequencies in Q15, [ d ]          */
-    const opus_int              d                   /* I    filter order (should be even)                               */
+    const opus_int              d,                  /* I    filter order (should be even)                               */
+    int                         arch                /* I    Run-time architecture                                       */
+);
+
+/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
+void silk_LPC_fit(
+    opus_int16                  *a_QOUT,            /* O    Output signal                                               */
+    opus_int32                  *a_QIN,             /* I/O  Input signal                                                */
+    const opus_int              QOUT,               /* I    Output Q domain                                             */
+    const opus_int              QIN,                /* I    Input Q domain                                              */
+    const opus_int              d                   /* I    Filter order                                                */
 );
 
 void silk_insertion_sort_increasing(
@@ -303,7 +320,7 @@ void silk_NLSF_VQ_weights_laroia(
 );
 
 /* Compute reflection coefficients from input signal */
-void silk_burg_modified(
+void silk_burg_modified_c(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
     opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
@@ -335,12 +352,15 @@ void silk_scale_vector32_Q26_lshift_18(
 /********************************************************************/
 
 /*    return sum( inVec1[i] * inVec2[i] ) */
+
 opus_int32 silk_inner_prod_aligned(
     const opus_int16 *const     inVec1,             /*    I input vector 1                                              */
     const opus_int16 *const     inVec2,             /*    I input vector 2                                              */
-    const opus_int              len                 /*    I vector lengths                                              */
+    const opus_int              len,                /*    I vector lengths                                              */
+    int                         arch                /*    I Run-time architecture                                       */
 );
 
+
 opus_int32 silk_inner_prod_aligned_scale(
     const opus_int16 *const     inVec1,             /*    I input vector 1                                              */
     const opus_int16 *const     inVec2,             /*    I input vector 2                                              */
@@ -348,7 +368,7 @@ opus_int32 silk_inner_prod_aligned_scale(
     const opus_int              len                 /*    I vector lengths                                              */
 );
 
-opus_int64 silk_inner_prod16_aligned_64(
+opus_int64 silk_inner_prod16_aligned_64_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
@@ -463,8 +483,7 @@ static OPUS_INLINE opus_int32 silk_ROR32( opus_int32 a32, opus_int rot )
 /* Add with saturation for positive input values */
 #define silk_ADD_POS_SAT8(a, b)             ((((a)+(b)) & 0x80)                 ? silk_int8_MAX  : ((a)+(b)))
 #define silk_ADD_POS_SAT16(a, b)            ((((a)+(b)) & 0x8000)               ? silk_int16_MAX : ((a)+(b)))
-#define silk_ADD_POS_SAT32(a, b)            ((((a)+(b)) & 0x80000000)           ? silk_int32_MAX : ((a)+(b)))
-#define silk_ADD_POS_SAT64(a, b)            ((((a)+(b)) & 0x8000000000000000LL) ? silk_int64_MAX : ((a)+(b)))
+#define silk_ADD_POS_SAT32(a, b)            ((((opus_uint32)(a)+(opus_uint32)(b)) & 0x80000000) ? silk_int32_MAX : ((a)+(b)))
 
 #define silk_LSHIFT8(a, shift)              ((opus_int8)((opus_uint8)(a)<<(shift)))         /* shift >= 0, shift < 8  */
 #define silk_LSHIFT16(a, shift)             ((opus_int16)((opus_uint16)(a)<<(shift)))       /* shift >= 0, shift < 16 */
@@ -564,7 +583,9 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 /* Make sure to store the result as the seed for the next call (also in between     */
 /* frames), otherwise result won't be random at all. When only using some of the    */
 /* bits, take the most significant bits by right-shifting.                          */
-#define silk_RAND(seed)                     (silk_MLA_ovflw(907633515, (seed), 196314165))
+#define RAND_MULTIPLIER                     196314165
+#define RAND_INCREMENT                      907633515
+#define silk_RAND(seed)                     (silk_MLA_ovflw((RAND_INCREMENT), (seed), (RAND_MULTIPLIER)))
 
 /*  Add some multiplication functions that can be easily mapped to ARM. */
 
@@ -575,6 +596,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 /* the following seems faster on x86 */
 #define silk_SMMUL(a32, b32)                (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
 
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+    ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#endif /* !defined(OPUS_X86_MAY_HAVE_SSE4_1) */
+
 #include "Inlines.h"
 #include "MacroCount.h"
 #include "MacroDebug.h"