Adds an analysis function to control VBR
author Jean-Marc Valin <jmvalin@jmvalin.ca>
Mon, 7 Nov 2011 04:27:16 +0000 (23:27 -0500)
committer Jean-Marc Valin <jmvalin@jmvalin.ca>
Fri, 13 Jul 2012 18:50:34 +0000 (14:50 -0400)
Conflicts:

src/opus_encoder.c

celt/celt.c
celt/celt.h
src/analysis.c [new file with mode: 0644]
src/opus_encoder.c

diff --git a/celt/celt.c b/celt/celt.c
index d20b025..97d9edc 100644 (file)
@@ -177,6 +177,8 @@ struct OpusCustomEncoder {
    int prefilter_tapset_old;
 #endif
    int consec_transient;
+   int frame_tonality;
+   int tonality_slope;
 
    opus_val32 preemph_memE[2];
    opus_val32 preemph_memD[2];
@@ -699,6 +701,9 @@ static int tf_analysis(const CELTMode *m, int len, int C, int isTransient,
    return tf_select;
 }
 
+extern int boost_band[2];
+extern float boost_amount[2];
+
 static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
 {
    int curr, i;
@@ -790,7 +795,7 @@ static void init_caps(const CELTMode *m,int *cap,int LM,int C)
 }
 
 static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
-      const opus_val16 *bandLogE, int end, int LM, int C, int N0)
+      const opus_val16 *bandLogE, int end, int LM, int C, int N0, float tonality_slope)
 {
    int i;
    opus_val32 diff=0;
@@ -831,6 +836,7 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       result of a bug in the loop above */
    diff /= 2*C*(end-1);
    /*printf("%f\n", diff);*/
+#if 1
    if (diff > QCONST16(2.f, DB_SHIFT))
       trim_index--;
    if (diff > QCONST16(8.f, DB_SHIFT))
@@ -839,11 +845,23 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       trim_index++;
    if (diff < -QCONST16(10.f, DB_SHIFT))
       trim_index++;
-
+#endif
+#if 0
+   if (tonality_slope > .15)
+      trim_index--;
+   if (tonality_slope > .3)
+      trim_index--;
+   if (tonality_slope < -.15)
+      trim_index++;
+   if (tonality_slope < -.3)
+      trim_index++;
+#endif
+   //printf("%f\n", tonality_slope);
    if (trim_index<0)
       trim_index = 0;
    if (trim_index>10)
       trim_index = 10;
+   //printf("%f %d\n", tonality_slope, trim_index);
 #ifdef FUZZING
    trim_index = rand()%11;
 #endif
@@ -1291,6 +1309,14 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
          st->spread_decision = spreading_decision(st->mode, X,
                &st->tonal_average, st->spread_decision, &st->hf_average,
                &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
+         /*if (st->frame_tonality > .7*32768)
+            st->spread_decision = SPREAD_NONE;
+         else if (st->frame_tonality > .3*32768)
+            st->spread_decision = SPREAD_LIGHT;
+         else if (st->frame_tonality > .1*32768)
+            st->spread_decision = SPREAD_NORMAL;
+         else
+            st->spread_decision = SPREAD_AGGRESSIVE;*/
       }
       ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
    }
@@ -1336,6 +1362,18 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
 #endif
       }
    }
+   if (0)
+   {
+      if (boost_amount[0]>.2)
+         offsets[boost_band[0]]+=2;
+      if (boost_amount[0]>.4)
+         offsets[boost_band[0]]+=2;
+      if (boost_amount[1]>.2)
+         offsets[boost_band[1]]+=2;
+      if (boost_amount[1]>.4)
+         offsets[boost_band[1]]+=2;
+      //printf("%f %f\n", boost_amount[0], boost_amount[1]);
+   }
    dynalloc_logp = 6;
    total_bits<<=BITRES;
    total_boost = 0;
@@ -1374,18 +1412,48 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
    if (tell+(6<<BITRES) <= total_bits - total_boost)
    {
       alloc_trim = alloc_trim_analysis(st->mode, X, bandLogE,
-            st->end, LM, C, N);
+            st->end, LM, C, N, st->tonality_slope/16384.);
       ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
       tell = ec_tell_frac(enc);
    }
 
+   if (C==2)
+   {
+      int effectiveRate;
+
+      /* Always use MS for 2.5 ms frames until we can do a better analysis */
+      if (LM!=0)
+         dual_stereo = stereo_analysis(st->mode, X, LM, N);
+
+      /* Account for coarse energy */
+      effectiveRate = (8*effectiveBytes - 80)>>LM;
+
+      /* effectiveRate in kb/s */
+      effectiveRate = 2*effectiveRate/5;
+      if (effectiveRate<35)
+         intensity = 8;
+      else if (effectiveRate<50)
+         intensity = 12;
+      else if (effectiveRate<68)
+         intensity = 16;
+      else if (effectiveRate<84)
+         intensity = 18;
+      else if (effectiveRate<102)
+         intensity = 19;
+      else if (effectiveRate<130)
+         intensity = 20;
+      else
+         intensity = 100;
+      intensity = IMIN(st->end,IMAX(st->start, intensity));
+   }
+
    /* Variable bitrate */
    if (vbr_rate>0)
    {
      opus_val16 alpha;
      opus_int32 delta;
      /* The target rate in 8th bits per frame */
-     opus_int32 target;
+     opus_int32 target, new_target;
      opus_int32 min_allowed;
      int lm_diff = st->mode->maxLM - LM;
 
@@ -1397,14 +1465,30 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
         target += (st->vbr_offset>>lm_diff);
 
 #ifdef FIXED_POINT
-     target = SHL32(MULT16_32_Q15(target, SUB16(tf_estimate, QCONST16(0.05, 14))),1);
+     new_target = SHL32(MULT16_32_Q15(target, SUB16(tf_estimate, QCONST16(0.05, 14))),1);
 #else
-     target *= tf_estimate-.05;
+     new_target = target*(tf_estimate-.05);
 #endif
+     if (1) {
+        int tonal_target;
+        float tonal;
+        int coded_bins;
+        int coded_bands;
+        tonal = st->frame_tonality/32768.;
+        tonal -= .06;
+        coded_bands = st->lastCodedBands ? st->lastCodedBands : st->mode->nbEBands;
+        //coded_bands = IMIN(coded_bands, st->mode->nbEBands-1);
+        coded_bins = st->mode->eBands[coded_bands]<<LM;
+        if (C==2)
+           coded_bins += st->mode->eBands[IMIN(intensity, coded_bands)]<<LM;
+        tonal_target = target + (coded_bins<<BITRES)*1.55*tonal;
+        new_target = IMAX(tonal_target,new_target);
+     }
+
      /* The current offset is removed from the target and the space used
         so far is added*/
-     target=target+tell;
-
+     target=new_target+tell;
+     //printf("%d\n", target);
      /* In VBR mode the frame size must not be reduced so much that it would
          result in the encoder running out of bits.
         The margin of 2 bytes ensures that none of the bust-prevention logic
@@ -1464,35 +1548,6 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
      /* This moves the raw bits to take into account the new compressed size */
      ec_enc_shrink(enc, nbCompressedBytes);
    }
-   if (C==2)
-   {
-      int effectiveRate;
-
-      /* Always use MS for 2.5 ms frames until we can do a better analysis */
-      if (LM!=0)
-         dual_stereo = stereo_analysis(st->mode, X, LM, N);
-
-      /* Account for coarse energy */
-      effectiveRate = (8*effectiveBytes - 80)>>LM;
-
-      /* effectiveRate in kb/s */
-      effectiveRate = 2*effectiveRate/5;
-      if (effectiveRate<35)
-         intensity = 8;
-      else if (effectiveRate<50)
-         intensity = 12;
-      else if (effectiveRate<68)
-         intensity = 16;
-      else if (effectiveRate<84)
-         intensity = 18;
-      else if (effectiveRate<102)
-         intensity = 19;
-      else if (effectiveRate<130)
-         intensity = 20;
-      else
-         intensity = 100;
-      intensity = IMIN(st->end,IMAX(st->start, intensity));
-   }
 
    /* Bit allocation */
    ALLOC(fine_quant, st->mode->nbEBands, int);
@@ -1862,6 +1917,18 @@ int opus_custom_encoder_ctl(CELTEncoder * restrict st, int request, ...)
          st->signalling = value;
       }
       break;
+      case CELT_SET_TONALITY_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->frame_tonality = value;
+      }
+      break;
+      case CELT_SET_TONALITY_SLOPE_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->tonality_slope = value;
+      }
+      break;
       case CELT_GET_MODE_REQUEST:
       {
          const CELTMode ** value = va_arg(ap, const CELTMode**);
diff --git a/celt/celt.h b/celt/celt.h
index da4464c..553670c 100644 (file)
@@ -86,6 +86,10 @@ extern "C" {
 #define CELT_SET_SIGNALLING_REQUEST    10016
 #define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x)
 
+#define CELT_SET_TONALITY_REQUEST    10018
+#define CELT_SET_TONALITY(x) CELT_SET_TONALITY_REQUEST, __opus_check_int(x)
+#define CELT_SET_TONALITY_SLOPE_REQUEST    10020
+#define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x)
 
 
 /* Encoder stuff */
diff --git a/src/analysis.c b/src/analysis.c
new file mode 100644 (file)
index 0000000..21a4a10
--- /dev/null
@@ -0,0 +1,213 @@
+/* Copyright (c) 2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "kiss_fft.h"
+#include "celt.h"
+#include "modes.h"
+#include "arch.h"
+#include "quant_bands.h"
+#include <stdio.h>
+
+#define NB_FRAMES 8
+
+#define NB_TBANDS 17
+static const int tbands[NB_TBANDS+1] = { /* FFT-bin edges of the analysis bands: band b spans bins tbands[b]..tbands[b+1]-1 */
+      4, 6, 8, 10, 12, 14, 16, 20, 24, 32, 40, 48, 56, 68, 80, 96, 120, 156
+};
+
+typedef struct { /* History carried from one tonality_analysis() call to the next */
+   float angle[240]; /* per-bin phase, in cycles (atan2/2pi), of the second half-frame of the previous call */
+   float d_angle[240]; /* per-bin phase increment (second half-frame minus first) of the previous call */
+   float d2_angle[240]; /* per-bin wrapped second phase difference, raised to the 4th power, of the previous call */
+   float prev_band_tonality[NB_TBANDS]; /* band tonalities computed by the previous call */
+   float prev_tonality; /* frame tonality returned by the previous call */
+   float E[NB_FRAMES][NB_TBANDS]; /* band energies; one row is overwritten per call, all rows are summed for the stationarity measure */
+   int E_count; /* row of E[] written by the current call; advanced modulo NB_FRAMES at the end of each call */
+} TonalityAnalysisState;
+
+int boost_band[2]; /* band indices picked by tonality_analysis(); [0] pairs with the larger tracked value, [1] with the runner-up */
+float boost_amount[2]; /* value tracked for boost_band[k]: set from the band tonality, then reduced by frame_tonality+.2 and zeroed unless the band exceeds both neighbours by .15; not reset between calls */
+
+/* Estimates how tonal the input is from the regularity of its spectral phase.
+   Reads 720 samples per channel from x (when C!=1 the input is taken as two
+   interleaved channels that are summed), updates the history in *tonal and
+   the boost_band[]/boost_amount[] globals, and writes the tonality slope
+   across bands to *tslope. Returns the frame tonality: the mean over bands
+   7..NB_TBANDS-1, never less than 0.8 times the previous return value. */
+float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, const opus_val16 *x, int C, float *tslope)
+{
+    int i, b, k;
+    const CELTMode *mode;
+    const kiss_fft_state *kfft;
+    kiss_fft_cpx in[480], out[480];
+    int N = 480, N2=240;
+    float * restrict A = tonal->angle;
+    float * restrict dA = tonal->d_angle;
+    float * restrict d2A = tonal->d2_angle;
+    float tonality[240];
+    float band_tonality[NB_TBANDS];
+    float frame_tonality;
+    const float pi4 = M_PI*M_PI*M_PI*M_PI;
+    float slope=0;
+    celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
+
+    kfft = mode->mdct.kfft[0];
+    /* Pack two overlapping raised-cosine-windowed frames into one complex FFT:
+       samples 0..479 feed the real part, samples 240..719 the imaginary part. */
+    if (C==1)
+    {
+       for (i=0;i<N2;i++)
+       {
+          float w = .5-.5*cos(M_PI*(i+1)/N2);
+          in[i].r = MULT16_16(w, x[i]);
+          in[i].i = MULT16_16(w, x[N-N2+i]);
+          in[N-i-1].r = MULT16_16(w, x[N-i-1]);
+          in[N-i-1].i = MULT16_16(w, x[2*N-N2-i-1]);
+       }
+    } else {
+       for (i=0;i<N2;i++)
+       {
+          float w = .5-.5*cos(M_PI*(i+1)/N2);
+          in[i].r = MULT16_16(w, x[2*i]+x[2*i+1]);
+          in[i].i = MULT16_16(w, x[2*(N-N2+i)]+x[2*(N-N2+i)+1]);
+          in[N-i-1].r = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]);
+          in[N-i-1].i = MULT16_16(w, x[2*(2*N-N2-i-1)]+x[2*(2*N-N2-i-1)+1]);
+       }
+    }
+    opus_fft(kfft, in, out);
+
+    for (i=1;i<N2;i++)
+    {
+       float X1r, X2r, X1i, X2i;
+       float angle, d_angle, d2_angle;
+       float angle2, d_angle2, d2_angle2;
+       float mod1, mod2, avg_mod;
+       /* Separate the packed FFT into the spectra of the first (X1) and second
+          (X2) frame; the common scale factor does not change the phase. */
+       X1r = out[i].r+out[N-i].r;
+       X1i = out[i].i-out[N-i].i;
+       X2r = out[i].i+out[N-i].i;
+       X2i = out[N-i].r-out[i].r;
+       /* Phases are in cycles (radians/2pi); A[] and dA[] hold the phase and
+          the phase increment stored by the previous call. */
+       angle = (.5/M_PI)*atan2(X1i, X1r);
+       d_angle = angle - A[i];
+       d2_angle = d_angle - dA[i];
+
+       angle2 = (.5/M_PI)*atan2(X2i, X2r);
+       d_angle2 = angle2 - angle;
+       d2_angle2 = d_angle2 - d_angle;
+       /* Wrap each second-order phase difference to [-.5,.5] and raise it to
+          the fourth power. */
+       mod1 = d2_angle - floor(.5+d2_angle);
+       mod1 *= mod1;
+       mod1 *= mod1;
+       mod2 = d2_angle2 - floor(.5+d2_angle2);
+       mod2 *= mod2;
+       mod2 *= mod2;
+       /* d2A[] holds mod2 from the previous call; a small average gives ~1. */
+       avg_mod = .25*(d2A[i]+2*mod1+mod2);
+       tonality[i] = 1./(1+40*16*pi4*avg_mod)-.015;
+
+       A[i] = angle2;
+       dA[i] = d_angle2;
+       d2A[i] = mod2;
+    }
+
+    frame_tonality = 0;
+    for (b=0;b<NB_TBANDS;b++)
+    {
+       float E=0, tE=0;
+       float L1, L2;
+       float stationarity;
+       for (i=tbands[b];i<tbands[b+1];i++)
+       {
+          float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
+                     + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
+          E += binE;
+          tE += binE*tonality[i];
+       }
+       tonal->E[tonal->E_count][b] = E;
+       L1=L2=0;
+       for (i=0;i<NB_FRAMES;i++)
+       {
+          L1 += sqrt(tonal->E[i][b]);
+          L2 += tonal->E[i][b];
+       }
+
+       /* The ratio is largest when all NB_FRAMES stored energies are equal. */
+       stationarity = MIN16(0.99,L1/sqrt(EPSILON+NB_FRAMES*L2));
+       stationarity *= stationarity;
+       stationarity *= stationarity;
+       /* Energy-weighted tonality, floored by the decayed previous value. */
+       band_tonality[b] = MAX16(tE/(1e-15+E), stationarity*tonal->prev_band_tonality[b]);
+       if (b>=7)
+          frame_tonality += band_tonality[b];
+       slope += band_tonality[b]*(b-8);
+       /* Keep the two largest values among bands 7..NB_TBANDS-2; boost_amount[]
+          is not reset here, so values left by earlier calls still compete. */
+       if (band_tonality[b] > boost_amount[1] && b>=7 && b < NB_TBANDS-1)
+       {
+          if (band_tonality[b] > boost_amount[0])
+          {
+             boost_amount[1] = boost_amount[0];
+             boost_band[1] = boost_band[0];
+             boost_amount[0] = band_tonality[b];
+             boost_band[0] = b;
+          } else {
+             boost_amount[1] = band_tonality[b];
+             boost_band[1] = b;
+          }
+       }
+       tonal->prev_band_tonality[b] = band_tonality[b];
+    }
+    frame_tonality /= NB_TBANDS-7;
+    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8);
+    tonal->prev_tonality = frame_tonality;
+    boost_amount[0] -= frame_tonality+.2;
+    boost_amount[1] -= frame_tonality+.2;
+    /* Keep a boost only if its band beats both neighbours by .15; an index with
+       no neighbours in band_tonality[] (boost_band[] starts at 0) is cleared too. */
+    for (k=0;k<2;k++)
+    {
+       int bb = boost_band[k];
+       if (bb < 1 || bb >= NB_TBANDS-1
+             || band_tonality[bb] < band_tonality[bb+1]+.15
+             || band_tonality[bb] < band_tonality[bb-1]+.15)
+          boost_amount[k]=0;
+    }
+
+    slope /= 8*8;
+    *tslope = slope;
+
+    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
+    return frame_tonality;
+}
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index a7622f9..af54605 100644 (file)
@@ -40,6 +40,7 @@
 #include "arch.h"
 #include "opus_private.h"
 #include "os_support.h"
+#include "analysis.c"
 
 #include "tuning_parameters.h"
 #ifdef FIXED_POINT
@@ -101,7 +102,7 @@ static const opus_int32 mono_music_bandwidth_thresholds[8] = {
         14000, 1000, /* MB not allowed */
         18000, 2000, /* MB<->WB */
         24000, 2000, /* WB<->SWB */
-        33000, 2000, /* SWB<->FB */
+        31000, 2000, /* SWB<->FB */
 };
 static const opus_int32 stereo_voice_bandwidth_thresholds[8] = {
         11000, 1000, /* NB<->MB */
@@ -472,6 +473,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
     opus_int32 max_rate;
     int curr_bandwidth;
     opus_int32 max_data_bytes;
+    int extra_buffer, total_buffer;
     VARDECL(opus_val16, tmp_prefill);
 
     ALLOC_STACK;
@@ -497,7 +499,11 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
        delay_compensation = 0;
     else
        delay_compensation = st->delay_compensation;
-
+    if (1)
+    {
+       total_buffer = IMAX(240, delay_compensation);
+    }
+    extra_buffer = total_buffer-delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
 
     frame_rate = st->Fs/frame_size;
@@ -823,9 +829,9 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
 
     ec_enc_init(&enc, data, max_data_bytes-1);
 
-    ALLOC(pcm_buf, (delay_compensation+frame_size)*st->channels, opus_val16);
-    for (i=0;i<delay_compensation*st->channels;i++)
-       pcm_buf[i] = st->delay_buffer[(st->encoder_buffer-delay_compensation)*st->channels+i];
+    ALLOC(pcm_buf, (total_buffer+frame_size)*st->channels, opus_val16);
+    for (i=0;i<total_buffer*st->channels;i++)
+       pcm_buf[i] = st->delay_buffer[(st->encoder_buffer-total_buffer)*st->channels+i];
 
     if (st->mode == MODE_CELT_ONLY)
        hp_freq_smth1 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 );
@@ -840,12 +846,20 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
 
     if (st->application == OPUS_APPLICATION_VOIP)
     {
-       hp_cutoff(pcm, cutoff_Hz, &pcm_buf[delay_compensation*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
+       hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
     } else {
        for (i=0;i<frame_size*st->channels;i++)
-          pcm_buf[delay_compensation*st->channels + i] = pcm[i];
+          pcm_buf[total_buffer*st->channels + i] = pcm[i];
     }
 
+    static TonalityAnalysisState tonal;
+    float tonality;
+    float tonality_slope;
+    tonality_analysis(&tonal, celt_enc, pcm_buf, st->channels, &tonality_slope);
+    tonality = tonality_analysis(&tonal, celt_enc, pcm_buf+(st->Fs/100)*st->channels, st->channels, &tonality_slope);
+    celt_encoder_ctl(celt_enc, CELT_SET_TONALITY(32768*tonality));
+    celt_encoder_ctl(celt_enc, CELT_SET_TONALITY_SLOPE(16384*tonality_slope));
+
     /* SILK processing */
     if (st->mode != MODE_CELT_ONLY)
     {
@@ -951,10 +965,10 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
         }
 
 #ifdef FIXED_POINT
-        pcm_silk = pcm_buf+delay_compensation*st->channels;
+        pcm_silk = pcm_buf+total_buffer*st->channels;
 #else
         for (i=0;i<frame_size*st->channels;i++)
-            pcm_silk[i] = FLOAT2INT16(pcm_buf[delay_compensation*st->channels + i]);
+            pcm_silk[i] = FLOAT2INT16(pcm_buf[total_buffer*st->channels + i]);
 #endif
         ret = silk_Encode( silk_enc, &st->silk_mode, pcm_silk, frame_size, &enc, &nBytes, 0 );
         if( ret ) {
@@ -1055,13 +1069,13 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
     if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0)
     {
        for (i=0;i<st->channels*st->Fs/400;i++)
-          tmp_prefill[i] = st->delay_buffer[(st->encoder_buffer-st->delay_compensation-st->Fs/400)*st->channels + i];
+          tmp_prefill[i] = st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i];
     }
 
-    for (i=0;i<st->channels*(st->encoder_buffer-(frame_size+delay_compensation));i++)
+    for (i=0;i<st->channels*(st->encoder_buffer-(frame_size+total_buffer));i++)
         st->delay_buffer[i] = st->delay_buffer[i+st->channels*frame_size];
     for (;i<st->encoder_buffer*st->channels;i++)
-        st->delay_buffer[i] = pcm_buf[(frame_size+delay_compensation-st->encoder_buffer)*st->channels+i];
+        st->delay_buffer[i] = pcm_buf[(frame_size+total_buffer-st->encoder_buffer)*st->channels+i];
 
 
     if (st->mode != MODE_HYBRID || st->stream_channels==1)
@@ -1082,7 +1096,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
             g1 *= (1./16384);
             g2 *= (1./16384);
 #endif
-            stereo_fade(pcm_buf, pcm_buf, g1, g2, celt_mode->overlap,
+            stereo_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, g1, g2, celt_mode->overlap,
                   frame_size, st->channels, celt_mode->window, st->Fs);
             st->hybrid_stereo_width_Q14 = st->silk_mode.stereoWidth_Q14;
         }
@@ -1134,7 +1148,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
         int err;
         celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0));
         celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0));
-        err = celt_encode_with_ec(celt_enc, pcm_buf, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL);
+        err = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL);
         if (err < 0)
         {
            RESTORE_STACK;
@@ -1160,7 +1174,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
         /* If false, we already busted the budget and we'll end up with a "PLC packet" */
         if (ec_tell(&enc) <= 8*nb_compr_bytes)
         {
-           ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc);
+           ret = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, frame_size, NULL, nb_compr_bytes, &enc);
            if (ret < 0)
            {
               RESTORE_STACK;
@@ -1183,9 +1197,9 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
         celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0));
 
         /* NOTE: We could speed this up slightly (at the expense of code size) by just adding a function that prefills the buffer */
-        celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2-N4), N4, dummy, 2, NULL);
+        celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2-N4), N4, dummy, 2, NULL);
 
-        err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL);
+        err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL);
         if (err < 0)
         {
            RESTORE_STACK;