Replaces log2() with celt_log2() and fixes a potential divide-by-zero
[opus.git] / silk / fixed / pitch_analysis_core_FIX.c
index d43f444..b6bc0bb 100644 (file)
@@ -8,11 +8,11 @@ this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
-- Neither the name of Internet Society, IETF or IETF Trust, nor the 
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
 names of specific contributors, may be used to endorse or promote
 products derived from this software without specific prior written
 permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
@@ -34,15 +34,30 @@ POSSIBILITY OF SUCH DAMAGE.
 ********************************************************** */
 #include "SigProc_FIX.h"
 #include "pitch_est_defines.h"
+#include "stack_alloc.h"
 #include "debug.h"
+#include "pitch.h"
 
 #define SCRATCH_SIZE    22
+#define SF_LENGTH_4KHZ  ( PE_SUBFR_LENGTH_MS * 4 )
+#define SF_LENGTH_8KHZ  ( PE_SUBFR_LENGTH_MS * 8 )
+#define MIN_LAG_4KHZ    ( PE_MIN_LAG_MS * 4 )
+#define MIN_LAG_8KHZ    ( PE_MIN_LAG_MS * 8 )
+#define MAX_LAG_4KHZ    ( PE_MAX_LAG_MS * 4 )
+#define MAX_LAG_8KHZ    ( PE_MAX_LAG_MS * 8 - 1 )
+#define CSTRIDE_4KHZ    ( MAX_LAG_4KHZ + 1 - MIN_LAG_4KHZ )
+#define CSTRIDE_8KHZ    ( MAX_LAG_8KHZ + 3 - ( MIN_LAG_8KHZ - 2 ) )
+#define D_COMP_MIN      ( MIN_LAG_8KHZ - 3 )
+#define D_COMP_MAX      ( MAX_LAG_8KHZ + 4 )
+#define D_COMP_STRIDE   ( D_COMP_MAX - D_COMP_MIN )
+
+typedef opus_int32 silk_pe_stage3_vals[ PE_NB_STAGE3_LAGS ];
 
 /************************************************************/
 /* Internally used functions                                */
 /************************************************************/
-void silk_P_Ana_calc_corr_st3(
-    opus_int32        cross_corr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ],/* (O) 3 DIM correlation array */
+static void silk_P_Ana_calc_corr_st3(
+    silk_pe_stage3_vals cross_corr_st3[],              /* O 3 DIM correlation array */
     const opus_int16  frame[],                         /* I vector to correlate         */
     opus_int          start_lag,                       /* I lag offset to search around */
     opus_int          sf_length,                       /* I length of a 5 ms subframe   */
@@ -50,8 +65,8 @@ void silk_P_Ana_calc_corr_st3(
     opus_int          complexity                       /* I Complexity setting          */
 );
 
-void silk_P_Ana_calc_energy_st3(
-    opus_int32        energies_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ],/* (O) 3 DIM energy array */
+static void silk_P_Ana_calc_energy_st3(
+    silk_pe_stage3_vals energies_st3[],                /* O 3 DIM energy array */
     const opus_int16  frame[],                         /* I vector to calc energy in    */
     opus_int          start_lag,                       /* I lag offset to search around */
     opus_int          sf_length,                       /* I length of one 5 ms subframe */
@@ -59,12 +74,6 @@ void silk_P_Ana_calc_energy_st3(
     opus_int          complexity                       /* I Complexity setting          */
 );
 
-opus_int32 silk_P_Ana_find_scaling(
-    const opus_int16  *frame,
-    const opus_int    frame_length,
-    const opus_int    sum_sqr_len
-);
-
 /*************************************************************/
 /*      FIXED POINT CORE PITCH ANALYSIS FUNCTION             */
 /*************************************************************/
@@ -76,36 +85,37 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     opus_int                    *LTPCorr_Q15,       /* I/O  Normalized correlation; input: value from previous frame    */
     opus_int                    prevLag,            /* I    Last lag of previous frame; set to zero is unvoiced         */
     const opus_int32            search_thres1_Q16,  /* I    First stage threshold for lag candidates 0 - 1              */
-    const opus_int              search_thres2_Q15,  /* I    Final threshold for lag candidates 0 - 1                    */
+    const opus_int              search_thres2_Q13,  /* I    Final threshold for lag candidates 0 - 1                    */
     const opus_int              Fs_kHz,             /* I    Sample frequency (kHz)                                      */
     const opus_int              complexity,         /* I    Complexity setting, 0-2, where 2 is highest                 */
     const opus_int              nb_subfr            /* I    number of 5 ms subframes                                    */
 )
 {
-    opus_int16 frame_8kHz[ PE_MAX_FRAME_LENGTH_ST_2 ];
-    opus_int16 frame_4kHz[ PE_MAX_FRAME_LENGTH_ST_1 ];
+    VARDECL( opus_int16, frame_8kHz );
+    VARDECL( opus_int16, frame_4kHz );
     opus_int32 filt_state[ 6 ];
-    opus_int32 scratch_mem[ 3 * PE_MAX_FRAME_LENGTH ];
-    opus_int16 *input_frame_ptr;
+    const opus_int16 *input_frame_ptr;
     opus_int   i, k, d, j;
-    opus_int16 C[ PE_MAX_NB_SUBFR ][ ( PE_MAX_LAG >> 1 ) + 5 ];
+    VARDECL( opus_int16, C );
+    VARDECL( opus_int32, xcorr32 );
     const opus_int16 *target_ptr, *basis_ptr;
     opus_int32 cross_corr, normalizer, energy, shift, energy_basis, energy_target;
     opus_int   d_srch[ PE_D_SRCH_LENGTH ], Cmax, length_d_srch, length_d_comp;
-    opus_int16 d_comp[ ( PE_MAX_LAG >> 1 ) + 5 ];
-    opus_int32 sum, threshold, temp32, lag_counter;
+    VARDECL( opus_int16, d_comp );
+    opus_int32 sum, threshold, lag_counter;
     opus_int   CBimax, CBimax_new, CBimax_old, lag, start_lag, end_lag, lag_new;
     opus_int32 CC[ PE_NB_CBKS_STAGE2_EXT ], CCmax, CCmax_b, CCmax_new_b, CCmax_new;
-    opus_int32 energies_st3[  PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ];
-    opus_int32 crosscorr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ];
-    opus_int   frame_length, frame_length_8kHz, frame_length_4kHz, max_sum_sq_length;
-    opus_int   sf_length, sf_length_8kHz, sf_length_4kHz;
-    opus_int   min_lag, min_lag_8kHz, min_lag_4kHz;
-    opus_int   max_lag, max_lag_8kHz, max_lag_4kHz;
-    opus_int32 contour_bias_Q20, diff, lz, lshift;
+    VARDECL( silk_pe_stage3_vals, energies_st3 );
+    VARDECL( silk_pe_stage3_vals, cross_corr_st3 );
+    opus_int   frame_length, frame_length_8kHz, frame_length_4kHz;
+    opus_int   sf_length;
+    opus_int   min_lag;
+    opus_int   max_lag;
+    opus_int32 contour_bias_Q15, diff;
     opus_int   nb_cbk_search, cbk_size;
-    opus_int32 delta_lag_log2_sqr_Q7, lag_log2_Q7, prevLag_log2_Q7, prev_lag_bias_Q15, corr_thres_Q15;
+    opus_int32 delta_lag_log2_sqr_Q7, lag_log2_Q7, prevLag_log2_Q7, prev_lag_bias_Q13;
     const opus_int8 *Lag_CB_ptr;
+    SAVE_STACK;
     /* Check for valid sampling frequency */
     silk_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 );
 
@@ -114,25 +124,18 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
 
     silk_assert( search_thres1_Q16 >= 0 && search_thres1_Q16 <= (1<<16) );
-    silk_assert( search_thres2_Q15 >= 0 && search_thres2_Q15 <= (1<<15) );
+    silk_assert( search_thres2_Q13 >= 0 && search_thres2_Q13 <= (1<<13) );
 
     /* Set up frame lengths max / min lag for the sampling frequency */
     frame_length      = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * Fs_kHz;
     frame_length_4kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 4;
     frame_length_8kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 8;
     sf_length         = PE_SUBFR_LENGTH_MS * Fs_kHz;
-    sf_length_4kHz    = PE_SUBFR_LENGTH_MS * 4;
-    sf_length_8kHz    = PE_SUBFR_LENGTH_MS * 8;
     min_lag           = PE_MIN_LAG_MS * Fs_kHz;
-    min_lag_4kHz      = PE_MIN_LAG_MS * 4;
-    min_lag_8kHz      = PE_MIN_LAG_MS * 8;
     max_lag           = PE_MAX_LAG_MS * Fs_kHz - 1;
-    max_lag_4kHz      = PE_MAX_LAG_MS * 4;
-    max_lag_8kHz      = PE_MAX_LAG_MS * 8 - 1;
-
-    silk_memset( C, 0, sizeof( opus_int16 ) * nb_subfr * ( ( PE_MAX_LAG >> 1 ) + 5) );
 
     /* Resample from input sampled at Fs_kHz to 8 kHz */
+    ALLOC( frame_8kHz, frame_length_8kHz, opus_int16 );
     if( Fs_kHz == 16 ) {
         silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) );
         silk_resampler_down2( filt_state, frame_8kHz, frame, frame_length );
@@ -146,6 +149,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
 
     /* Decimate again to 4 kHz */
     silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) );/* Set state to zero */
+    ALLOC( frame_4kHz, frame_length_4kHz, opus_int16 );
     silk_resampler_down2( filt_state, frame_4kHz, frame_8kHz, frame_length_8kHz );
 
     /* Low-pass filter */
@@ -159,9 +163,9 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     *******************************************************************************/
 
     /* Inner product is calculated with different lengths, so scale for the worst case */
-    max_sum_sq_length = silk_max_32( sf_length_8kHz, silk_LSHIFT( sf_length_4kHz, 2 ) );
-    shift = silk_P_Ana_find_scaling( frame_4kHz, frame_length_4kHz, max_sum_sq_length );
+    silk_sum_sqr_shift( &energy, &shift, frame_4kHz, frame_length_4kHz );
     if( shift > 0 ) {
+        shift = silk_RSHIFT( shift, 1 );
         for( i = 0; i < frame_length_4kHz; i++ ) {
             frame_4kHz[ i ] = silk_RSHIFT( frame_4kHz[ i ], shift );
         }
@@ -170,94 +174,93 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     /******************************************************************************
     * FIRST STAGE, operating in 4 khz
     ******************************************************************************/
-    target_ptr = &frame_4kHz[ silk_LSHIFT( sf_length_4kHz, 2 ) ];
+    ALLOC( C, nb_subfr * CSTRIDE_8KHZ, opus_int16 );
+    ALLOC( xcorr32, MAX_LAG_4KHZ-MIN_LAG_4KHZ+1, opus_int32 );
+    silk_memset( C, 0, (nb_subfr >> 1) * CSTRIDE_4KHZ * sizeof( opus_int16 ) );
+    target_ptr = &frame_4kHz[ silk_LSHIFT( SF_LENGTH_4KHZ, 2 ) ];
     for( k = 0; k < nb_subfr >> 1; k++ ) {
         /* Check that we are within range of the array */
         silk_assert( target_ptr >= frame_4kHz );
-        silk_assert( target_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
+        silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
 
-        basis_ptr = target_ptr - min_lag_4kHz;
+        basis_ptr = target_ptr - MIN_LAG_4KHZ;
 
         /* Check that we are within range of the array */
         silk_assert( basis_ptr >= frame_4kHz );
-        silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
+        silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
+
+        celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1 );
 
         /* Calculate first vector products before loop */
-        cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, sf_length_8kHz );
-        normalizer = silk_inner_prod_aligned( basis_ptr,  basis_ptr, sf_length_8kHz );
-        normalizer = silk_ADD_SAT32( normalizer, silk_SMULBB( sf_length_8kHz, 4000 ) );
+        cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
+        normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ );
+        normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr,  basis_ptr, SF_LENGTH_8KHZ ) );
+        normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) );
 
-        temp32 = silk_DIV32( cross_corr, silk_SQRT_APPROX( normalizer ) + 1 );
-        C[ k ][ min_lag_4kHz ] = (opus_int16)silk_SAT16( temp32 );        /* Q0 */
+        matrix_ptr( C, k, 0, CSTRIDE_4KHZ ) =
+            (opus_int16)silk_DIV32_varQ( cross_corr, normalizer, 13 + 1 );                      /* Q13 */
 
         /* From now on normalizer is computed recursively */
-        for( d = min_lag_4kHz + 1; d <= max_lag_4kHz; d++ ) {
+        for( d = MIN_LAG_4KHZ + 1; d <= MAX_LAG_4KHZ; d++ ) {
             basis_ptr--;
 
             /* Check that we are within range of the array */
             silk_assert( basis_ptr >= frame_4kHz );
-            silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
+            silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
 
-            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, sf_length_8kHz );
+            cross_corr = xcorr32[ MAX_LAG_4KHZ - d ];
 
             /* Add contribution of new sample and remove contribution from oldest sample */
-            normalizer +=
+            normalizer = silk_ADD32( normalizer,
                 silk_SMULBB( basis_ptr[ 0 ], basis_ptr[ 0 ] ) -
-                silk_SMULBB( basis_ptr[ sf_length_8kHz ], basis_ptr[ sf_length_8kHz ] );
+                silk_SMULBB( basis_ptr[ SF_LENGTH_8KHZ ], basis_ptr[ SF_LENGTH_8KHZ ] ) );
 
-            temp32 = silk_DIV32( cross_corr, silk_SQRT_APPROX( normalizer ) + 1 );
-            C[ k ][ d ] = (opus_int16)silk_SAT16( temp32 );                        /* Q0 */
+            matrix_ptr( C, k, d - MIN_LAG_4KHZ, CSTRIDE_4KHZ) =
+                (opus_int16)silk_DIV32_varQ( cross_corr, normalizer, 13 + 1 );                  /* Q13 */
         }
         /* Update target pointer */
-        target_ptr += sf_length_8kHz;
+        target_ptr += SF_LENGTH_8KHZ;
     }
 
     /* Combine two subframes into single correlation measure and apply short-lag bias */
     if( nb_subfr == PE_MAX_NB_SUBFR ) {
-        for( i = max_lag_4kHz; i >= min_lag_4kHz; i-- ) {
-            sum = (opus_int32)C[ 0 ][ i ] + (opus_int32)C[ 1 ][ i ];                /* Q0 */
-            silk_assert( silk_RSHIFT( sum, 1 ) == silk_SAT16( silk_RSHIFT( sum, 1 ) ) );
-            sum = silk_RSHIFT( sum, 1 );                                           /* Q-1 */
-            silk_assert( silk_LSHIFT( (opus_int32)-i, 4 ) == silk_SAT16( silk_LSHIFT( (opus_int32)-i, 4 ) ) );
-            sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) );                    /* Q-1 */
-            silk_assert( sum == silk_SAT16( sum ) );
-            C[ 0 ][ i ] = (opus_int16)sum;                                         /* Q-1 */
+        for( i = MAX_LAG_4KHZ; i >= MIN_LAG_4KHZ; i-- ) {
+            sum = (opus_int32)matrix_ptr( C, 0, i - MIN_LAG_4KHZ, CSTRIDE_4KHZ )
+                + (opus_int32)matrix_ptr( C, 1, i - MIN_LAG_4KHZ, CSTRIDE_4KHZ );               /* Q14 */
+            sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) );                                /* Q14 */
+            C[ i - MIN_LAG_4KHZ ] = (opus_int16)sum;                                            /* Q14 */
         }
     } else {
         /* Only short-lag bias */
-        for( i = max_lag_4kHz; i >= min_lag_4kHz; i-- ) {
-            sum = (opus_int32)C[ 0 ][ i ];
-            sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) );                    /* Q-1 */
-            C[ 0 ][ i ] = (opus_int16)sum;                                         /* Q-1 */
+        for( i = MAX_LAG_4KHZ; i >= MIN_LAG_4KHZ; i-- ) {
+            sum = silk_LSHIFT( (opus_int32)C[ i - MIN_LAG_4KHZ ], 1 );                          /* Q14 */
+            sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) );                                /* Q14 */
+            C[ i - MIN_LAG_4KHZ ] = (opus_int16)sum;                                            /* Q14 */
         }
     }
 
     /* Sort */
     length_d_srch = silk_ADD_LSHIFT32( 4, complexity, 1 );
     silk_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH );
-    silk_insertion_sort_decreasing_int16( &C[ 0 ][ min_lag_4kHz ], d_srch, max_lag_4kHz - min_lag_4kHz + 1, length_d_srch );
+    silk_insertion_sort_decreasing_int16( C, d_srch, CSTRIDE_4KHZ,
+                                          length_d_srch );
 
     /* Escape if correlation is very low already here */
-    target_ptr = &frame_4kHz[ silk_SMULBB( sf_length_4kHz, nb_subfr ) ];
-    energy = silk_inner_prod_aligned( target_ptr, target_ptr, silk_LSHIFT( sf_length_4kHz, 2 ) );
-    energy = silk_ADD_SAT32( energy, 1000 );                                  /* Q0 */
-    Cmax = (opus_int)C[ 0 ][ min_lag_4kHz ];                                  /* Q-1 */
-    threshold = silk_SMULBB( Cmax, Cmax );                                    /* Q-2 */
-
-    /* Compare in Q-2 domain */
-    if( silk_RSHIFT( energy, 4 + 2 ) > threshold ) {
+    Cmax = (opus_int)C[ 0 ];                                                    /* Q14 */
+    if( Cmax < SILK_FIX_CONST( 0.2, 14 ) ) {
         silk_memset( pitch_out, 0, nb_subfr * sizeof( opus_int ) );
         *LTPCorr_Q15  = 0;
         *lagIndex     = 0;
         *contourIndex = 0;
+        RESTORE_STACK;
         return 1;
     }
 
     threshold = silk_SMULWB( search_thres1_Q16, Cmax );
     for( i = 0; i < length_d_srch; i++ ) {
         /* Convert to 8 kHz indices for the sorted correlation that exceeds the threshold */
-        if( C[ 0 ][ min_lag_4kHz + i ] > threshold ) {
-            d_srch[ i ] = silk_LSHIFT( d_srch[ i ] + min_lag_4kHz, 1 );
+        if( C[ i ] > threshold ) {
+            d_srch[ i ] = silk_LSHIFT( d_srch[ i ] + MIN_LAG_4KHZ, 1 );
         } else {
             length_d_srch = i;
             break;
@@ -265,34 +268,37 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     }
     silk_assert( length_d_srch > 0 );
 
-    for( i = min_lag_8kHz - 5; i < max_lag_8kHz + 5; i++ ) {
-        d_comp[ i ] = 0;
+    ALLOC( d_comp, D_COMP_STRIDE, opus_int16 );
+    for( i = D_COMP_MIN; i < D_COMP_MAX; i++ ) {
+        d_comp[ i - D_COMP_MIN ] = 0;
     }
     for( i = 0; i < length_d_srch; i++ ) {
-        d_comp[ d_srch[ i ] ] = 1;
+        d_comp[ d_srch[ i ] - D_COMP_MIN ] = 1;
     }
 
     /* Convolution */
-    for( i = max_lag_8kHz + 3; i >= min_lag_8kHz; i-- ) {
-        d_comp[ i ] += d_comp[ i - 1 ] + d_comp[ i - 2 ];
+    for( i = D_COMP_MAX - 1; i >= MIN_LAG_8KHZ; i-- ) {
+        d_comp[ i - D_COMP_MIN ] +=
+            d_comp[ i - 1 - D_COMP_MIN ] + d_comp[ i - 2 - D_COMP_MIN ];
     }
 
     length_d_srch = 0;
-    for( i = min_lag_8kHz; i < max_lag_8kHz + 1; i++ ) {
-        if( d_comp[ i + 1 ] > 0 ) {
+    for( i = MIN_LAG_8KHZ; i < MAX_LAG_8KHZ + 1; i++ ) {
+        if( d_comp[ i + 1 - D_COMP_MIN ] > 0 ) {
             d_srch[ length_d_srch ] = i;
             length_d_srch++;
         }
     }
 
     /* Convolution */
-    for( i = max_lag_8kHz + 3; i >= min_lag_8kHz; i-- ) {
-        d_comp[ i ] += d_comp[ i - 1 ] + d_comp[ i - 2 ] + d_comp[ i - 3 ];
+    for( i = D_COMP_MAX - 1; i >= MIN_LAG_8KHZ; i-- ) {
+        d_comp[ i - D_COMP_MIN ] += d_comp[ i - 1 - D_COMP_MIN ]
+            + d_comp[ i - 2 - D_COMP_MIN ] + d_comp[ i - 3 - D_COMP_MIN ];
     }
 
     length_d_comp = 0;
-    for( i = min_lag_8kHz; i < max_lag_8kHz + 4; i++ ) {
-        if( d_comp[ i ] > 0 ) {
+    for( i = MIN_LAG_8KHZ; i < D_COMP_MAX; i++ ) {
+        if( d_comp[ i - D_COMP_MIN ] > 0 ) {
             d_comp[ length_d_comp ] = i - 2;
             length_d_comp++;
         }
@@ -306,8 +312,9 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     ** Scale signal down to avoid correlations measures from overflowing
     *******************************************************************************/
     /* find scaling as max scaling for each subframe */
-    shift = silk_P_Ana_find_scaling( frame_8kHz, frame_length_8kHz, sf_length_8kHz );
+    silk_sum_sqr_shift( &energy, &shift, frame_8kHz, frame_length_8kHz );
     if( shift > 0 ) {
+        shift = silk_RSHIFT( shift, 1 );
         for( i = 0; i < frame_length_8kHz; i++ ) {
             frame_8kHz[ i ] = silk_RSHIFT( frame_8kHz[ i ], shift );
         }
@@ -316,43 +323,37 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     /*********************************************************************************
     * Find energy of each subframe projected onto its history, for a range of delays
     *********************************************************************************/
-    silk_memset( C, 0, PE_MAX_NB_SUBFR * ( ( PE_MAX_LAG >> 1 ) + 5 ) * sizeof( opus_int16 ) );
+    silk_memset( C, 0, nb_subfr * CSTRIDE_8KHZ * sizeof( opus_int16 ) );
 
     target_ptr = &frame_8kHz[ PE_LTP_MEM_LENGTH_MS * 8 ];
     for( k = 0; k < nb_subfr; k++ ) {
 
         /* Check that we are within range of the array */
         silk_assert( target_ptr >= frame_8kHz );
-        silk_assert( target_ptr + sf_length_8kHz <= frame_8kHz + frame_length_8kHz );
+        silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
 
-        energy_target = silk_inner_prod_aligned( target_ptr, target_ptr, sf_length_8kHz );
+        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ ), 1 );
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
 
             /* Check that we are within range of the array */
             silk_assert( basis_ptr >= frame_8kHz );
-            silk_assert( basis_ptr + sf_length_8kHz <= frame_8kHz + frame_length_8kHz );
+            silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
 
-            cross_corr   = silk_inner_prod_aligned( target_ptr, basis_ptr, sf_length_8kHz );
-            energy_basis = silk_inner_prod_aligned( basis_ptr,  basis_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ );
             if( cross_corr > 0 ) {
-                energy = silk_max( energy_target, energy_basis ); /* Find max to make sure first division < 1.0 */
-                lz = silk_CLZ32( cross_corr );
-                lshift = silk_LIMIT_32( lz - 1, 0, 15 );
-                temp32 = silk_DIV32( silk_LSHIFT( cross_corr, lshift ), silk_RSHIFT( energy, 15 - lshift ) + 1 ); /* Q15 */
-                silk_assert( temp32 == silk_SAT16( temp32 ) );
-                temp32 = silk_SMULWB( cross_corr, temp32 ); /* Q(-1), cc * ( cc / max(b, t) ) */
-                temp32 = silk_ADD_SAT32( temp32, temp32 );  /* Q(0) */
-                lz = silk_CLZ32( temp32 );
-                lshift = silk_LIMIT_32( lz - 1, 0, 15 );
-                energy = silk_min( energy_target, energy_basis );
-                C[ k ][ d ] = silk_DIV32( silk_LSHIFT( temp32, lshift ), silk_RSHIFT( energy, 15 - lshift ) + 1 ); /* Q15*/
+                energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ );
+                matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) =
+                    (opus_int16)silk_DIV32_varQ( cross_corr,
+                                                 silk_ADD32( energy_target,
+                                                             energy_basis ),
+                                                 13 + 1 );                                      /* Q13 */
             } else {
-                C[ k ][ d ] = 0;
+                matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) = 0;
             }
         }
-        target_ptr += sf_length_8kHz;
+        target_ptr += SF_LENGTH_8KHZ;
     }
 
     /* search over lag range and lags codebook */
@@ -374,7 +375,7 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
     } else {
         prevLag_log2_Q7 = 0;
     }
-    silk_assert( search_thres2_Q15 == silk_SAT16( search_thres2_Q15 ) );
+    silk_assert( search_thres2_Q13 == silk_SAT16( search_thres2_Q13 ) );
     /* Set up stage 2 codebook based on number of subframes */
     if( nb_subfr == PE_MAX_NB_SUBFR ) {
         cbk_size   = PE_NB_CBKS_STAGE2_EXT;
@@ -385,12 +386,10 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         } else {
             nb_cbk_search = PE_NB_CBKS_STAGE2;
         }
-        corr_thres_Q15 = silk_RSHIFT( silk_SMULBB( search_thres2_Q15, search_thres2_Q15 ), 13 );
     } else {
         cbk_size       = PE_NB_CBKS_STAGE2_10MS;
         Lag_CB_ptr     = &silk_CB_lags_stage2_10_ms[ 0 ][ 0 ];
         nb_cbk_search  = PE_NB_CBKS_STAGE2_10MS;
-        corr_thres_Q15 = silk_RSHIFT( silk_SMULBB( search_thres2_Q15, search_thres2_Q15 ), 14 );
     }
 
     for( k = 0; k < length_d_srch; k++ ) {
@@ -398,8 +397,13 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         for( j = 0; j < nb_cbk_search; j++ ) {
             CC[ j ] = 0;
             for( i = 0; i < nb_subfr; i++ ) {
+                opus_int d_subfr;
                 /* Try all codebooks */
-                CC[ j ] = CC[ j ] + (opus_int32)C[ i ][ d + matrix_ptr( Lag_CB_ptr, i, j, cbk_size )];
+                d_subfr = d + matrix_ptr( Lag_CB_ptr, i, j, cbk_size );
+                CC[ j ] = CC[ j ]
+                    + (opus_int32)matrix_ptr( C, i,
+                                              d_subfr - ( MIN_LAG_8KHZ - 2 ),
+                                              CSTRIDE_8KHZ );
             }
         }
         /* Find best codebook */
@@ -413,25 +417,25 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         }
 
         /* Bias towards shorter lags */
-        lag_log2_Q7 = silk_lin2log( (opus_int32)d ); /* Q7 */
+        lag_log2_Q7 = silk_lin2log( d ); /* Q7 */
         silk_assert( lag_log2_Q7 == silk_SAT16( lag_log2_Q7 ) );
-        silk_assert( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 15 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 15 ) ) );
-        CCmax_new_b = CCmax_new - silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 15 ), lag_log2_Q7 ), 7 ); /* Q15 */
+        silk_assert( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ) ) );
+        CCmax_new_b = CCmax_new - silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ), lag_log2_Q7 ), 7 ); /* Q13 */
 
         /* Bias towards previous lag */
-        silk_assert( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 15 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 15 ) ) );
+        silk_assert( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ) ) );
         if( prevLag > 0 ) {
             delta_lag_log2_sqr_Q7 = lag_log2_Q7 - prevLag_log2_Q7;
             silk_assert( delta_lag_log2_sqr_Q7 == silk_SAT16( delta_lag_log2_sqr_Q7 ) );
             delta_lag_log2_sqr_Q7 = silk_RSHIFT( silk_SMULBB( delta_lag_log2_sqr_Q7, delta_lag_log2_sqr_Q7 ), 7 );
-            prev_lag_bias_Q15 = silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 15 ), *LTPCorr_Q15 ), 15 ); /* Q15 */
-            prev_lag_bias_Q15 = silk_DIV32( silk_MUL( prev_lag_bias_Q15, delta_lag_log2_sqr_Q7 ), delta_lag_log2_sqr_Q7 + ( 1 << 6 ) );
-            CCmax_new_b -= prev_lag_bias_Q15; /* Q15 */
+            prev_lag_bias_Q13 = silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ), *LTPCorr_Q15 ), 15 ); /* Q13 */
+            prev_lag_bias_Q13 = silk_DIV32( silk_MUL( prev_lag_bias_Q13, delta_lag_log2_sqr_Q7 ), delta_lag_log2_sqr_Q7 + SILK_FIX_CONST( 0.5, 7 ) );
+            CCmax_new_b -= prev_lag_bias_Q13; /* Q13 */
         }
 
         if( CCmax_new_b > CCmax_b                                   &&  /* Find maximum biased correlation                  */
-            CCmax_new > corr_thres_Q15                              &&  /* Correlation needs to be high enough to be voiced */
-            silk_CB_lags_stage2[ 0 ][ CBimax_new ] <= min_lag_8kHz      /* Lag must be in range                             */
+            CCmax_new > silk_SMULBB( nb_subfr, search_thres2_Q13 )  &&  /* Correlation needs to be high enough to be voiced */
+            silk_CB_lags_stage2[ 0 ][ CBimax_new ] <= MIN_LAG_8KHZ      /* Lag must be in range                             */
          ) {
             CCmax_b = CCmax_new_b;
             CCmax   = CCmax_new;
@@ -446,24 +450,31 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         *LTPCorr_Q15  = 0;
         *lagIndex     = 0;
         *contourIndex = 0;
+        RESTORE_STACK;
         return 1;
     }
 
+    /* Output normalized correlation */
+    *LTPCorr_Q15 = (opus_int)silk_LSHIFT( silk_DIV32_16( CCmax, nb_subfr ), 2 );
+    silk_assert( *LTPCorr_Q15 >= 0 );
+
     if( Fs_kHz > 8 ) {
+        VARDECL( opus_int16, scratch_mem );
         /***************************************************************************/
         /* Scale input signal down to avoid correlations measures from overflowing */
         /***************************************************************************/
         /* find scaling as max scaling for each subframe */
-        shift = silk_P_Ana_find_scaling( frame, frame_length, sf_length );
+        silk_sum_sqr_shift( &energy, &shift, frame, frame_length );
+        ALLOC( scratch_mem, shift > 0 ? frame_length : 0, opus_int16 );
         if( shift > 0 ) {
             /* Move signal to scratch mem because the input signal should be unchanged */
-            /* Reuse the 32 bit scratch mem vector, use a 16 bit pointer from now */
-            input_frame_ptr = (opus_int16*)scratch_mem;
+            shift = silk_RSHIFT( shift, 1 );
             for( i = 0; i < frame_length; i++ ) {
-                input_frame_ptr[ i ] = silk_RSHIFT( frame[ i ], shift );
+                scratch_mem[ i ] = silk_RSHIFT( frame[ i ], shift );
             }
+            input_frame_ptr = scratch_mem;
         } else {
-            input_frame_ptr = (opus_int16*)frame;
+            input_frame_ptr = frame;
         }
 
         /* Search in original signal */
@@ -483,22 +494,13 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         start_lag = silk_max_int( lag - 2, min_lag );
         end_lag   = silk_min_int( lag + 2, max_lag );
         lag_new   = lag;                                    /* to avoid undefined lag */
-        CBimax    = 0;                                        /* to avoid undefined lag */
-        silk_assert( silk_LSHIFT( CCmax, 13 ) >= 0 );
-        *LTPCorr_Q15 = (opus_int)silk_SQRT_APPROX( silk_LSHIFT( CCmax, 13 ) ); /* Output normalized correlation */
+        CBimax    = 0;                                      /* to avoid undefined lag */
 
         CCmax = silk_int32_MIN;
         /* pitch lags according to second stage */
         for( k = 0; k < nb_subfr; k++ ) {
             pitch_out[ k ] = lag + 2 * silk_CB_lags_stage2[ k ][ CBimax_old ];
         }
-        /* Calculate the correlations and energies needed in stage 3 */
-        silk_P_Ana_calc_corr_st3(  crosscorr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
-        silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
-
-        lag_counter = 0;
-        silk_assert( lag == silk_SAT16( lag ) );
-        contour_bias_Q20 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 20 ), lag );
 
         /* Set up codebook parameters according to complexity setting and frame length */
         if( nb_subfr == PE_MAX_NB_SUBFR ) {
@@ -510,41 +512,43 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
             cbk_size        = PE_NB_CBKS_STAGE3_10MS;
             Lag_CB_ptr      = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
         }
+
+        /* Calculate the correlations and energies needed in stage 3 */
+        ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
+        ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
+        silk_P_Ana_calc_corr_st3(  cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
+        silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
+
+        lag_counter = 0;
+        silk_assert( lag == silk_SAT16( lag ) );
+        contour_bias_Q15 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 15 ), lag );
+
+        target_ptr = &input_frame_ptr[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ];
+        energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length ), 1 );
         for( d = start_lag; d <= end_lag; d++ ) {
             for( j = 0; j < nb_cbk_search; j++ ) {
                 cross_corr = 0;
-                energy     = 0;
+                energy     = energy_target;
                 for( k = 0; k < nb_subfr; k++ ) {
-                    silk_assert( PE_MAX_NB_SUBFR == 4 );
-                    energy     += silk_RSHIFT( energies_st3[  k ][ j ][ lag_counter ], 2 ); /* use mean, to avoid overflow */
+                    cross_corr = silk_ADD32( cross_corr,
+                        matrix_ptr( cross_corr_st3, k, j,
+                                    nb_cbk_search )[ lag_counter ] );
+                    energy     = silk_ADD32( energy,
+                        matrix_ptr( energies_st3, k, j,
+                                    nb_cbk_search )[ lag_counter ] );
                     silk_assert( energy >= 0 );
-                    cross_corr += silk_RSHIFT( crosscorr_st3[ k ][ j ][ lag_counter ], 2 ); /* use mean, to avoid overflow */
                 }
                 if( cross_corr > 0 ) {
-                    /* Divide cross_corr / energy and get result in Q15 */
-                    lz = silk_CLZ32( cross_corr );
-                    /* Divide with result in Q13, cross_corr could be larger than energy */
-                    lshift = silk_LIMIT_32( lz - 1, 0, 13 );
-                    CCmax_new = silk_DIV32( silk_LSHIFT( cross_corr, lshift ), silk_RSHIFT( energy, 13 - lshift ) + 1 );
-                    CCmax_new = silk_SAT16( CCmax_new );
-                    CCmax_new = silk_SMULWB( cross_corr, CCmax_new );
-                    /* Saturate */
-                    if( CCmax_new > silk_RSHIFT( silk_int32_MAX, 3 ) ) {
-                        CCmax_new = silk_int32_MAX;
-                    } else {
-                        CCmax_new = silk_LSHIFT( CCmax_new, 3 );
-                    }
+                    CCmax_new = silk_DIV32_varQ( cross_corr, energy, 13 + 1 );          /* Q13 */
                     /* Reduce depending on flatness of contour */
-                    diff = silk_int16_MAX - silk_RSHIFT( silk_MUL( contour_bias_Q20, j ), 5 ); /* Q20 -> Q15 */
+                    diff = silk_int16_MAX - silk_MUL( contour_bias_Q15, j );            /* Q15 */
                     silk_assert( diff == silk_SAT16( diff ) );
-                    CCmax_new = silk_LSHIFT( silk_SMULWB( CCmax_new, diff ), 1 );
+                    CCmax_new = silk_SMULWB( CCmax_new, diff );                         /* Q14 */
                 } else {
                     CCmax_new = 0;
                 }
 
-                if( CCmax_new > CCmax                                               &&
-                   ( d + silk_CB_lags_stage3[ 0 ][ j ] ) <= max_lag
-                   ) {
+                if( CCmax_new > CCmax && ( d + silk_CB_lags_stage3[ 0 ][ j ] ) <= max_lag ) {
                     CCmax   = CCmax_new;
                     lag_new = d;
                     CBimax  = j;
@@ -560,27 +564,35 @@ opus_int silk_pitch_analysis_core(                  /* O    Voicing estimate: 0
         *lagIndex = (opus_int16)( lag_new - min_lag);
         *contourIndex = (opus_int8)CBimax;
     } else {        /* Fs_kHz == 8 */
-        /* Save Lags and correlation */
-        CCmax = silk_max( CCmax, 0 );
-        *LTPCorr_Q15 = (opus_int)silk_SQRT_APPROX( silk_LSHIFT( CCmax, 13 ) ); /* Output normalized correlation */
+        /* Save Lags */
         for( k = 0; k < nb_subfr; k++ ) {
             pitch_out[ k ] = lag + matrix_ptr( Lag_CB_ptr, k, CBimax, cbk_size );
-            pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], min_lag_8kHz, PE_MAX_LAG_MS * Fs_kHz );
+            pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], MIN_LAG_8KHZ, PE_MAX_LAG_MS * 8 );
         }
-        *lagIndex = (opus_int16)( lag - min_lag_8kHz );
+        *lagIndex = (opus_int16)( lag - MIN_LAG_8KHZ );
         *contourIndex = (opus_int8)CBimax;
     }
     silk_assert( *lagIndex >= 0 );
     /* return as voiced */
+    RESTORE_STACK;
     return 0;
 }
 
-/*************************************************************************/
-/* Calculates the correlations used in stage 3 search. In order to cover */
-/* the whole lag codebook for all the searched offset lags (lag +- 2),   */
-/*************************************************************************/
-void silk_P_Ana_calc_corr_st3(
-    opus_int32        cross_corr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ],/* (O) 3 DIM correlation array */
+/***********************************************************************
+ * Calculates the correlations used in stage 3 search. In order to cover
+ * the whole lag codebook for all the searched offset lags (lag +- 2),
+ * the following correlations are needed in each sub frame:
+ *
+ * sf1: lag range [-8,...,7] total 16 correlations
+ * sf2: lag range [-4,...,4] total 9 correlations
+ * sf3: lag range [-3,....4] total 8 correltions
+ * sf4: lag range [-6,....8] total 15 correlations
+ *
+ * In total 48 correlations. The direct implementation computed in worst
+ * case 4*12*5 = 240 correlations, but more likely around 120.
+ ***********************************************************************/
+static void silk_P_Ana_calc_corr_st3(
+    silk_pe_stage3_vals cross_corr_st3[],              /* O 3 DIM correlation array */
     const opus_int16  frame[],                         /* I vector to correlate         */
     opus_int          start_lag,                       /* I lag offset to search around */
     opus_int          sf_length,                       /* I length of a 5 ms subframe   */
@@ -588,12 +600,13 @@ void silk_P_Ana_calc_corr_st3(
     opus_int          complexity                       /* I Complexity setting          */
 )
 {
-    const opus_int16 *target_ptr, *basis_ptr;
-    opus_int32 cross_corr;
+    const opus_int16 *target_ptr;
     opus_int   i, j, k, lag_counter, lag_low, lag_high;
     opus_int   nb_cbk_search, delta, idx, cbk_size;
-    opus_int32 scratch_mem[ SCRATCH_SIZE ];
+    VARDECL( opus_int32, scratch_mem );
+    VARDECL( opus_int32, xcorr32 );
     const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
+    SAVE_STACK;
 
     silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
     silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
@@ -610,6 +623,8 @@ void silk_P_Ana_calc_corr_st3(
         nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
         cbk_size      = PE_NB_CBKS_STAGE3_10MS;
     }
+    ALLOC( scratch_mem, SCRATCH_SIZE, opus_int32 );
+    ALLOC( xcorr32, SCRATCH_SIZE, opus_int32 );
 
     target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; /* Pointer to middle of frame */
     for( k = 0; k < nb_subfr; k++ ) {
@@ -618,11 +633,11 @@ void silk_P_Ana_calc_corr_st3(
         /* Calculate the correlations for each subframe */
         lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );
         lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
+        silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
+        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1 );
         for( j = lag_low; j <= lag_high; j++ ) {
-            basis_ptr = target_ptr - ( start_lag + j );
-            cross_corr = silk_inner_prod_aligned( (opus_int16*)target_ptr, (opus_int16*)basis_ptr, sf_length );
             silk_assert( lag_counter < SCRATCH_SIZE );
-            scratch_mem[ lag_counter ] = cross_corr;
+            scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ];
             lag_counter++;
         }
 
@@ -634,32 +649,35 @@ void silk_P_Ana_calc_corr_st3(
             for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) {
                 silk_assert( idx + j < SCRATCH_SIZE );
                 silk_assert( idx + j < lag_counter );
-                cross_corr_st3[ k ][ i ][ j ] = scratch_mem[ idx + j ];
+                matrix_ptr( cross_corr_st3, k, i, nb_cbk_search )[ j ] =
+                    scratch_mem[ idx + j ];
             }
         }
         target_ptr += sf_length;
     }
+    RESTORE_STACK;
 }
 
 /********************************************************************/
 /* Calculate the energies for first two subframes. The energies are */
 /* calculated recursively.                                          */
 /********************************************************************/
-void silk_P_Ana_calc_energy_st3(
-    opus_int32        energies_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ],/* (O) 3 DIM energy array */
-    const opus_int16  frame[],                         /* I vector to calc energy in    */
-    opus_int          start_lag,                       /* I lag offset to search around */
-    opus_int          sf_length,                       /* I length of one 5 ms subframe */
-    opus_int          nb_subfr,                     /* I number of subframes         */
-    opus_int          complexity                       /* I Complexity setting          */
+static void silk_P_Ana_calc_energy_st3(
+    silk_pe_stage3_vals energies_st3[],                 /* O 3 DIM energy array */
+    const opus_int16  frame[],                          /* I vector to calc energy in    */
+    opus_int          start_lag,                        /* I lag offset to search around */
+    opus_int          sf_length,                        /* I length of one 5 ms subframe */
+    opus_int          nb_subfr,                         /* I number of subframes         */
+    opus_int          complexity                        /* I Complexity setting          */
 )
 {
     const opus_int16 *target_ptr, *basis_ptr;
     opus_int32 energy;
     opus_int   k, i, j, lag_counter;
     opus_int   nb_cbk_search, delta, idx, cbk_size, lag_diff;
-    opus_int32 scratch_mem[ SCRATCH_SIZE ];
+    VARDECL( opus_int32, scratch_mem );
     const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
+    SAVE_STACK;
 
     silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
     silk_assert( complexity <= SILK_PE_MAX_COMPLEX );
@@ -676,6 +694,8 @@ void silk_P_Ana_calc_energy_st3(
         nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
         cbk_size      = PE_NB_CBKS_STAGE3_10MS;
     }
+    ALLOC( scratch_mem, SCRATCH_SIZE, opus_int32 );
+
     target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ];
     for( k = 0; k < nb_subfr; k++ ) {
         lag_counter = 0;
@@ -709,37 +729,13 @@ void silk_P_Ana_calc_energy_st3(
             for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) {
                 silk_assert( idx + j < SCRATCH_SIZE );
                 silk_assert( idx + j < lag_counter );
-                energies_st3[ k ][ i ][ j ] = scratch_mem[ idx + j ];
-                silk_assert( energies_st3[ k ][ i ][ j ] >= 0 );
+                matrix_ptr( energies_st3, k, i, nb_cbk_search )[ j ] =
+                    scratch_mem[ idx + j ];
+                silk_assert(
+                    matrix_ptr( energies_st3, k, i, nb_cbk_search )[ j ] >= 0 );
             }
         }
         target_ptr += sf_length;
     }
-}
-
-opus_int32 silk_P_Ana_find_scaling(
-    const opus_int16  *frame,
-    const opus_int    frame_length,
-    const opus_int    sum_sqr_len
-)
-{
-    opus_int32 nbits, x_max;
-
-    x_max = silk_int16_array_maxabs( frame, frame_length );
-
-    if( x_max < silk_int16_MAX ) {
-        /* Number of bits needed for the sum of the squares */
-        nbits = 32 - silk_CLZ32( silk_SMULBB( x_max, x_max ) );
-    } else {
-        /* Here we don't know if x_max should have been silk_int16_MAX + 1, so we expect the worst case */
-        nbits = 30;
-    }
-    nbits += 17 - silk_CLZ16( sum_sqr_len );
-
-    /* Without a guarantee of saturation, we need to keep the 31st bit free */
-    if( nbits < 31 ) {
-        return 0;
-    } else {
-        return( nbits - 30 );
-    }
+    RESTORE_STACK;
 }