Cleans up the most ugly parts of the analysis code
author Jean-Marc Valin <jmvalin@jmvalin.ca>
Mon, 14 Nov 2011 09:58:29 +0000 (17:58 +0800)
committer Jean-Marc Valin <jmvalin@jmvalin.ca>
Fri, 13 Jul 2012 18:50:34 +0000 (14:50 -0400)
Should be mostly usable now.

celt/celt.c
celt/celt.h
src/analysis.c
src/opus_encoder.c

index 97d9edc..7285e97 100644 (file)
@@ -177,8 +177,7 @@ struct OpusCustomEncoder {
    int prefilter_tapset_old;
 #endif
    int consec_transient;
-   int frame_tonality;
-   int tonality_slope;
+   AnalysisInfo analysis;
 
    opus_val32 preemph_memE[2];
    opus_val32 preemph_memD[2];
@@ -701,9 +700,6 @@ static int tf_analysis(const CELTMode *m, int len, int C, int isTransient,
    return tf_select;
 }
 
-extern int boost_band[2];
-extern float boost_amount[2];
-
 static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
 {
    int curr, i;
@@ -795,7 +791,7 @@ static void init_caps(const CELTMode *m,int *cap,int LM,int C)
 }
 
 static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
-      const opus_val16 *bandLogE, int end, int LM, int C, int N0, float tonality_slope)
+      const opus_val16 *bandLogE, int end, int LM, int C, int N0, AnalysisInfo *analysis)
 {
    int i;
    opus_val32 diff=0;
@@ -836,7 +832,6 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       result of a bug in the loop above */
    diff /= 2*C*(end-1);
    /*printf("%f\n", diff);*/
-#if 1
    if (diff > QCONST16(2.f, DB_SHIFT))
       trim_index--;
    if (diff > QCONST16(8.f, DB_SHIFT))
@@ -845,23 +840,23 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       trim_index++;
    if (diff < -QCONST16(10.f, DB_SHIFT))
       trim_index++;
+#ifndef FIXED_POINT
+   if (0 && analysis->valid)
+   {
+      if (analysis->tonality_slope > .15)
+         trim_index--;
+      if (analysis->tonality_slope > .3)
+         trim_index--;
+      if (analysis->tonality_slope < -.15)
+         trim_index++;
+      if (analysis->tonality_slope < -.3)
+         trim_index++;
+   }
 #endif
-#if 0
-   if (tonality_slope > .15)
-      trim_index--;
-   if (tonality_slope > .3)
-      trim_index--;
-   if (tonality_slope < -.15)
-      trim_index++;
-   if (tonality_slope < -.3)
-      trim_index++;
-#endif
-   //printf("%f\n", tonality_slope);
    if (trim_index<0)
       trim_index = 0;
    if (trim_index>10)
       trim_index = 10;
-   //printf("%f %d\n", tonality_slope, trim_index);
 #ifdef FUZZING
    trim_index = rand()%11;
 #endif
@@ -1309,6 +1304,7 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
          st->spread_decision = spreading_decision(st->mode, X,
                &st->tonal_average, st->spread_decision, &st->hf_average,
                &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
+         /*printf("%f %d\n", st->analysis.tonality_slope, st->tapset_decision);*/
          /*if (st->frame_tonality > .7*32768)
             st->spread_decision = SPREAD_NONE;
          else if (st->frame_tonality > .3*32768)
@@ -1362,18 +1358,19 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
 #endif
       }
    }
-   if (0)
+#ifndef FIXED_POINT
+   if (0 && st->analysis.valid)
    {
-      if (boost_amount[0]>.2)
-         offsets[boost_band[0]]+=2;
-      if (boost_amount[0]>.4)
-         offsets[boost_band[0]]+=2;
-      if (boost_amount[1]>.2)
-         offsets[boost_band[1]]+=2;
-      if (boost_amount[1]>.4)
-         offsets[boost_band[1]]+=2;
-      //printf("%f %f\n", boost_amount[0], boost_amount[1]);
+      if (st->analysis.boost_amount[0]>.2)
+         offsets[st->analysis.boost_band[0]]+=2;
+      if (st->analysis.boost_amount[0]>.4)
+         offsets[st->analysis.boost_band[0]]+=2;
+      if (st->analysis.boost_amount[1]>.2)
+         offsets[st->analysis.boost_band[1]]+=2;
+      if (st->analysis.boost_amount[1]>.4)
+         offsets[st->analysis.boost_band[1]]+=2;
    }
+#endif
    dynalloc_logp = 6;
    total_bits<<=BITRES;
    total_boost = 0;
@@ -1412,7 +1409,7 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
    if (tell+(6<<BITRES) <= total_bits - total_boost)
    {
       alloc_trim = alloc_trim_analysis(st->mode, X, bandLogE,
-            st->end, LM, C, N, st->tonality_slope/16384.);
+            st->end, LM, C, N, &st->analysis);
       ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
       tell = ec_tell_frac(enc);
    }
@@ -1455,7 +1452,13 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
      /* The target rate in 8th bits per frame */
      opus_int32 target, new_target;
      opus_int32 min_allowed;
+     int coded_bins;
+     int coded_bands;
      int lm_diff = st->mode->maxLM - LM;
+     coded_bands = st->lastCodedBands ? st->lastCodedBands : st->mode->nbEBands;
+     coded_bins = st->mode->eBands[coded_bands]<<LM;
+     if (C==2)
+        coded_bins += st->mode->eBands[IMIN(intensity, coded_bands)]<<LM;
 
      /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms.
         The CELT allocator will just not be able to use more than that anyway. */
@@ -1464,31 +1467,31 @@ int celt_encode_with_ec(CELTEncoder * restrict st, const opus_val16 * pcm, int f
      if (st->constrained_vbr)
         target += (st->vbr_offset>>lm_diff);
 
+#ifndef FIXED_POINT
+     if (st->analysis.valid && st->analysis.activity<.4)
+        target -= (coded_bins<<BITRES)*2*(.4-st->analysis.activity);
+#endif
+
 #ifdef FIXED_POINT
      new_target = SHL32(MULT16_32_Q15(target, SUB16(tf_estimate, QCONST16(0.05, 14))),1);
 #else
      new_target = target*(tf_estimate-.05);
 #endif
-     if (1) {
+
+#ifndef FIXED_POINT
+     if (st->analysis.valid) {
         int tonal_target;
         float tonal;
-        int coded_bins;
-        int coded_bands;
-        tonal = st->frame_tonality/32768.;
+        tonal = st->analysis.tonality;
         tonal -= .06;
-        coded_bands = st->lastCodedBands ? st->lastCodedBands : st->mode->nbEBands;
-        //coded_bands = IMIN(coded_bands, st->mode->nbEBands-1);
-        coded_bins = st->mode->eBands[coded_bands]<<LM;
-        if (C==2)
-           coded_bins += st->mode->eBands[IMIN(intensity, coded_bands)]<<LM;
         tonal_target = target + (coded_bins<<BITRES)*1.55*tonal;
         new_target = IMAX(tonal_target,new_target);
      }
+#endif
 
      /* The current offset is removed from the target and the space used
         so far is added*/
      target=new_target+tell;
-     //printf("%d\n", target);
      /* In VBR mode the frame size must not be reduced so much that it would
          result in the encoder running out of bits.
         The margin of 2 bytes ensures that none of the bust-prevention logic
@@ -1917,16 +1920,11 @@ int opus_custom_encoder_ctl(CELTEncoder * restrict st, int request, ...)
          st->signalling = value;
       }
       break;
-      case CELT_SET_TONALITY_REQUEST:
+      case CELT_SET_ANALYSIS_REQUEST:
       {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->frame_tonality = value;
-      }
-      break;
-      case CELT_SET_TONALITY_SLOPE_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->tonality_slope = value;
+         AnalysisInfo *info = va_arg(ap, AnalysisInfo *);
+         if (info)
+            OPUS_COPY(&st->analysis, info, 1);
       }
       break;
       case CELT_GET_MODE_REQUEST:
index 553670c..54bca44 100644 (file)
@@ -50,7 +50,18 @@ extern "C" {
 #define CELTDecoder OpusCustomDecoder
 #define CELTMode OpusCustomMode
 
-#define _celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+typedef struct {
+   int valid;
+   opus_val16 tonality;
+   opus_val16 tonality_slope;
+   opus_val16 activity;
+   int boost_band[2];
+   opus_val16 boost_amount[2];
+}AnalysisInfo;
+
+#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+
+#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr)))
 
 /* Encoder/decoder Requests */
 
@@ -81,7 +92,7 @@ extern "C" {
 
 #define CELT_GET_MODE_REQUEST    10015
 /** Get the CELTMode used by an encoder or decoder */
-#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, _celt_check_mode_ptr_ptr(x)
+#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, __celt_check_mode_ptr_ptr(x)
 
 #define CELT_SET_SIGNALLING_REQUEST    10016
 #define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x)
@@ -91,6 +102,9 @@ extern "C" {
 #define CELT_SET_TONALITY_SLOPE_REQUEST    10020
 #define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x)
 
+#define CELT_SET_ANALYSIS_REQUEST    10022
+#define CELT_SET_ANALYSIS(x) CELT_SET_ANALYSIS_REQUEST, __celt_check_analysis_ptr(x)
+
 
 /* Encoder stuff */
 
index 21a4a10..49517f6 100644 (file)
@@ -50,35 +50,33 @@ typedef struct {
    float prev_band_tonality[NB_TBANDS];
    float prev_tonality;
    float E[NB_FRAMES][NB_TBANDS];
+   float lowE[NB_TBANDS], highE[NB_TBANDS];
    int E_count;
+   int count;
 } TonalityAnalysisState;
 
-int boost_band[2];
-float boost_amount[2];
-
-float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, const opus_val16 *x, int C, float *tslope)
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int C)
 {
     int i, b;
     const CELTMode *mode;
     const kiss_fft_state *kfft;
     kiss_fft_cpx in[480], out[480];
-    const opus_val16 *window;
-    int overlap = 240;
     int N = 480, N2=240;
     float * restrict A = tonal->angle;
     float * restrict dA = tonal->d_angle;
     float * restrict d2A = tonal->d2_angle;
     float tonality[240];
+    float noisiness[240];
     float band_tonality[NB_TBANDS];
     float frame_tonality;
+    float frame_noisiness;
     const float pi4 = M_PI*M_PI*M_PI*M_PI;
     float slope=0;
-    float max_tonality=-1;
-    int max_band=0;
+    float frame_stationarity;
+    float relativeE;
     celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
 
     kfft = mode->mdct.kfft[0];
-    window = mode->window;
     if (C==1)
     {
        for (i=0;i<N2;i++)
@@ -111,7 +109,7 @@ float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, con
        X1i = out[i].i-out[N-i].i;
        X2r = out[i].i+out[N-i].i;
        X2i = out[N-i].r-out[i].r;
-       //printf("%f\n", X1r);
+
        angle = (.5/M_PI)*atan2(X1i, X1r);
        d_angle = angle - A[i];
        d2_angle = d_angle - dA[i];
@@ -119,14 +117,14 @@ float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, con
        angle2 = (.5/M_PI)*atan2(X2i, X2r);
        d_angle2 = angle2 - angle;
        d2_angle2 = d_angle2 - d_angle;
-       //printf("%f ", angle2);
 
-       //printf("%f ", d2_angle);
        mod1 = d2_angle - floor(.5+d2_angle);
-       //printf("%f ", mod1);
+       noisiness[i] = fabs(mod1);
        mod1 *= mod1;
        mod1 *= mod1;
+
        mod2 = d2_angle2 - floor(.5+d2_angle2);
+       noisiness[i] += fabs(mod2);
        mod2 *= mod2;
        mod2 *= mod2;
 
@@ -139,9 +137,23 @@ float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, con
     }
 
     frame_tonality = 0;
+    info->activity = 0;
+    frame_noisiness = 0;
+    frame_stationarity = 0;
+    if (!tonal->count)
+    {
+       for (b=0;b<NB_TBANDS;b++)
+       {
+          tonal->lowE[b] = 1e10;
+          tonal->highE[b] = -1e10;
+       }
+    }
+    relativeE = 0;
+    info->boost_amount[0]=info->boost_amount[1]=0;
+    info->boost_band[0]=info->boost_band[1]=0;
     for (b=0;b<NB_TBANDS;b++)
     {
-       float E=0, tE=0;
+       float E=0, tE=0, nE=0, logE;
        float L1, L2;
        float stationarity;
        for (i=tbands[b];i<tbands[b+1];i++)
@@ -150,8 +162,21 @@ float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, con
                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
           E += binE;
           tE += binE*tonality[i];
+          nE += binE*2*(.5-noisiness[i]);
        }
        tonal->E[tonal->E_count][b] = E;
+       frame_noisiness += nE/(1e-15+E);
+
+       logE = log(E+EPSILON);
+       tonal->lowE[b] = MIN32(logE, tonal->lowE[b]+.01);
+       tonal->highE[b] = MAX32(logE, tonal->highE[b]-.1);
+       if (tonal->highE[b] < tonal->lowE[b]+1)
+       {
+          tonal->highE[b]+=.5;
+          tonal->lowE[b]-=.5;
+       }
+       relativeE += (logE-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
+
        L1=L2=0;
        for (i=0;i<NB_FRAMES;i++)
        {
@@ -162,52 +187,54 @@ float tonality_analysis(TonalityAnalysisState *tonal, CELTEncoder *celt_enc, con
        stationarity = MIN16(0.99,L1/sqrt(EPSILON+NB_FRAMES*L2));
        stationarity *= stationarity;
        stationarity *= stationarity;
-       //fprintf(stderr, "%f %f %f\n", L1, L2, stationarity);
-       //fprintf(stderr, "%f %f\n", tE, E);
-       //fprintf(stderr, "%f %f\n", stationarity, );
-       //band_tonality[b] = tE/(1e-15+E);
-       band_tonality[b] = MAX16(tE/(1e-15+E), stationarity*tonal->prev_band_tonality[b]);
-       //if (band_tonality[b]>1)
-       //   printf("%f %f %f\n", L1, L2, stationarity);
-       //fprintf(stdout, "%f ", band_tonality[b]);
+       frame_stationarity += stationarity;
+       /*band_tonality[b] = tE/(1e-15+E)*/;
+       band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
        if (b>=7)
           frame_tonality += band_tonality[b];
        slope += band_tonality[b]*(b-8);
-       if (band_tonality[b] > boost_amount[1] && b>=7 && b < NB_TBANDS-1)
+       if (band_tonality[b] > info->boost_amount[1] && b>=7 && b < NB_TBANDS-1)
        {
-          if (band_tonality[b] > boost_amount[0])
+          if (band_tonality[b] > info->boost_amount[0])
           {
-             boost_amount[1] = boost_amount[0];
-             boost_band[1] = boost_band[0];
-             boost_amount[0] = band_tonality[b];
-             boost_band[0] = b;
+             info->boost_amount[1] = info->boost_amount[0];
+             info->boost_band[1] = info->boost_band[0];
+             info->boost_amount[0] = band_tonality[b];
+             info->boost_band[0] = b;
           } else {
-             boost_amount[1] = band_tonality[b];
-             boost_band[1] = b;
+             info->boost_amount[1] = band_tonality[b];
+             info->boost_band[1] = b;
           }
        }
        tonal->prev_band_tonality[b] = band_tonality[b];
     }
+    frame_stationarity /= NB_TBANDS;
+    relativeE /= NB_TBANDS;
+    if (tonal->count<10)
+       relativeE = .5;
+    frame_noisiness /= NB_TBANDS;
+#if 1
+    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
+#else
+    info->activity = .5*(1+frame_noisiness-frame_stationarity);
+#endif
     frame_tonality /= NB_TBANDS-7;
     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8);
-    //fprintf(stdout, "%f\n", frame_tonality);
     tonal->prev_tonality = frame_tonality;
-    boost_amount[0] -= frame_tonality+.2;
-    boost_amount[1] -= frame_tonality+.2;
-    if (band_tonality[boost_band[0]] < band_tonality[boost_band[0]+1]+.15
-        || band_tonality[boost_band[0]] < band_tonality[boost_band[0]-1]+.15)
-       boost_amount[0]=0;
-    if (band_tonality[boost_band[1]] < band_tonality[boost_band[1]+1]+.15
-        || band_tonality[boost_band[1]] < band_tonality[boost_band[1]-1]+.15)
-       boost_amount[1]=0;
-
-    //boost_band = 16;
-    //boost_amount = .6;
-    //printf("%d %f %f\n", max_band, max_tonality, frame_tonality);
+    info->boost_amount[0] -= frame_tonality+.2;
+    info->boost_amount[1] -= frame_tonality+.2;
+    if (band_tonality[info->boost_band[0]] < band_tonality[info->boost_band[0]+1]+.15
+        || band_tonality[info->boost_band[0]] < band_tonality[info->boost_band[0]-1]+.15)
+       info->boost_amount[0]=0;
+    if (band_tonality[info->boost_band[1]] < band_tonality[info->boost_band[1]+1]+.15
+        || band_tonality[info->boost_band[1]] < band_tonality[info->boost_band[1]-1]+.15)
+       info->boost_amount[1]=0;
+
     slope /= 8*8;
-    *tslope = slope;
-    //fprintf(stdout, "%f %f\n", frame_tonality, slope);
+    info->tonality_slope = slope;
 
     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
-    return frame_tonality;
+    tonal->count++;
+    info->tonality = frame_tonality;
+    info->valid = 1;
 }
index af54605..197c687 100644 (file)
@@ -85,7 +85,9 @@ struct OpusEncoder {
     /* Sampling rate (at the API level) */
     int          first;
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
-
+#ifndef FIXED_POINT
+    TonalityAnalysisState analysis;
+#endif
     opus_uint32  rangeFinal;
 };
 
@@ -102,7 +104,7 @@ static const opus_int32 mono_music_bandwidth_thresholds[8] = {
         14000, 1000, /* MB not allowed */
         18000, 2000, /* MB<->WB */
         24000, 2000, /* WB<->SWB */
-        31000, 2000, /* SWB<->FB */
+        33000, 2000, /* SWB<->FB */
 };
 static const opus_int32 stereo_voice_bandwidth_thresholds[8] = {
         11000, 1000, /* NB<->MB */
@@ -474,6 +476,10 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
     int curr_bandwidth;
     opus_int32 max_data_bytes;
     int extra_buffer, total_buffer;
+    int perform_analysis=0;
+#ifndef FIXED_POINT
+    AnalysisInfo analysis_info;
+#endif
     VARDECL(opus_val16, tmp_prefill);
 
     ALLOC_STACK;
@@ -495,13 +501,18 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
     silk_enc = (char*)st+st->silk_enc_offset;
     celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset);
 
+#ifndef FIXED_POINT
+    perform_analysis = st->silk_mode.complexity >= 7 && frame_size >= st->Fs/100 && st->Fs==48000;
+#endif
     if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
        delay_compensation = 0;
     else
        delay_compensation = st->delay_compensation;
-    if (1)
+    if (perform_analysis)
     {
-       total_buffer = IMAX(240, delay_compensation);
+       total_buffer = IMAX(st->Fs/200, delay_compensation);
+    } else {
+       total_buffer = delay_compensation;
     }
     extra_buffer = total_buffer-delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
@@ -852,13 +863,17 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
           pcm_buf[total_buffer*st->channels + i] = pcm[i];
     }
 
-    static TonalityAnalysisState tonal;
-    float tonality;
-    float tonality_slope;
-    tonality_analysis(&tonal, celt_enc, pcm_buf, st->channels, &tonality_slope);
-    tonality = tonality_analysis(&tonal, celt_enc, pcm_buf+(st->Fs/100)*st->channels, st->channels, &tonality_slope);
-    celt_encoder_ctl(celt_enc, CELT_SET_TONALITY(32768*tonality));
-    celt_encoder_ctl(celt_enc, CELT_SET_TONALITY_SLOPE(16384*tonality_slope));
+#ifndef FIXED_POINT
+    if (perform_analysis)
+    {
+       int nb_analysis_frames;
+       nb_analysis_frames = frame_size/(st->Fs/100);
+       for (i=0;i<nb_analysis_frames;i++)
+          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm_buf+i*(st->Fs/100)*st->channels, st->channels);
+    } else {
+       analysis_info.valid = 0;
+    }
+#endif
 
     /* SILK processing */
     if (st->mode != MODE_CELT_ONLY)
@@ -1174,6 +1189,10 @@ opus_int32 opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_s
         /* If false, we already busted the budget and we'll end up with a "PLC packet" */
         if (ec_tell(&enc) <= 8*nb_compr_bytes)
         {
+#ifndef FIXED_POINT
+           if (perform_analysis)
+              celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info));
+#endif
            ret = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, frame_size, NULL, nb_compr_bytes, &enc);
            if (ret < 0)
            {