Revisit surround rate allocation
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 extern const MLP net;
43
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47
48 static const float dct_table[128] = {
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66
67 static const float analysis_window[240] = {
68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99
100 static const int tbands[NB_TBANDS+1] = {
101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107
108 /*static const float tweight[NB_TBANDS+1] = {
109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111
112 #define NB_TONAL_SKIP_BANDS 9
113
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
118 static inline float fast_atan2f(float y, float x) {
119    float x2, y2;
120    /* Should avoid underflow on the values we'll get */
121    if (ABS16(x)+ABS16(y)<1e-9f)
122    {
123       x*=1e12f;
124       y*=1e12f;
125    }
126    x2 = x*x;
127    y2 = y*y;
128    if(x2<y2){
129       float den = (y2 + cB*x2) * (y2 + cC*x2);
130       if (den!=0)
131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132       else
133          return (y<0 ? -cE : cE);
134    }else{
135       float den = (x2 + cB*y2) * (x2 + cC*y2);
136       if (den!=0)
137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138       else
139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140    }
141 }
142
143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145    int pos;
146    int curr_lookahead;
147    float psum;
148    int i;
149
150    pos = tonal->read_pos;
151    curr_lookahead = tonal->write_pos-tonal->read_pos;
152    if (curr_lookahead<0)
153       curr_lookahead += DETECT_SIZE;
154
155    if (len > 480 && pos != tonal->write_pos)
156    {
157       pos++;
158       if (pos==DETECT_SIZE)
159          pos=0;
160    }
161    if (pos == tonal->write_pos)
162       pos--;
163    if (pos<0)
164       pos = DETECT_SIZE-1;
165    OPUS_COPY(info_out, &tonal->info[pos], 1);
166    tonal->read_subframe += len/120;
167    while (tonal->read_subframe>=4)
168    {
169       tonal->read_subframe -= 4;
170       tonal->read_pos++;
171    }
172    if (tonal->read_pos>=DETECT_SIZE)
173       tonal->read_pos-=DETECT_SIZE;
174
175    /* Compensate for the delay in the features themselves.
176       FIXME: Need a better estimate the 10 I just made up */
177    curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179    psum=0;
180    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
181       psum += tonal->pmusic[i];
182    for (;i<DETECT_SIZE;i++)
183       psum += tonal->pspeech[i];
184    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
185    /*printf("%f %f\n", psum, info_out->music_prob);*/
186
187    info_out->music_prob = psum;
188 }
189
190 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
191 {
192     int i, b;
193     const kiss_fft_state *kfft;
194     VARDECL(kiss_fft_cpx, in);
195     VARDECL(kiss_fft_cpx, out);
196     int N = 480, N2=240;
197     float * OPUS_RESTRICT A = tonal->angle;
198     float * OPUS_RESTRICT dA = tonal->d_angle;
199     float * OPUS_RESTRICT d2A = tonal->d2_angle;
200     VARDECL(float, tonality);
201     VARDECL(float, noisiness);
202     float band_tonality[NB_TBANDS];
203     float logE[NB_TBANDS];
204     float BFCC[8];
205     float features[25];
206     float frame_tonality;
207     float max_frame_tonality;
208     /*float tw_sum=0;*/
209     float frame_noisiness;
210     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
211     float slope=0;
212     float frame_stationarity;
213     float relativeE;
214     float frame_probs[2];
215     float alpha, alphaE, alphaE2;
216     float frame_loudness;
217     float bandwidth_mask;
218     int bandwidth=0;
219     float maxE = 0;
220     float noise_floor;
221     int remaining;
222     AnalysisInfo *info;
223     SAVE_STACK;
224
225     tonal->last_transition++;
226     alpha = 1.f/IMIN(20, 1+tonal->count);
227     alphaE = 1.f/IMIN(50, 1+tonal->count);
228     alphaE2 = 1.f/IMIN(6000, 1+tonal->count);
229
230     if (tonal->count<4)
231        tonal->music_prob = .5;
232     kfft = celt_mode->mdct.kfft[0];
233     if (tonal->count==0)
234        tonal->mem_fill = 240;
235     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
236     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
237     {
238        tonal->mem_fill += len;
239        /* Don't have enough to update the analysis */
240        RESTORE_STACK;
241        return;
242     }
243     info = &tonal->info[tonal->write_pos++];
244     if (tonal->write_pos>=DETECT_SIZE)
245        tonal->write_pos-=DETECT_SIZE;
246
247     ALLOC(in, 480, kiss_fft_cpx);
248     ALLOC(out, 480, kiss_fft_cpx);
249     ALLOC(tonality, 240, float);
250     ALLOC(noisiness, 240, float);
251     for (i=0;i<N2;i++)
252     {
253        float w = analysis_window[i];
254        in[i].r = MULT16_16(w, tonal->inmem[i]);
255        in[i].i = MULT16_16(w, tonal->inmem[N2+i]);
256        in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]);
257        in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]);
258     }
259     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
260     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
261     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
262     tonal->mem_fill = 240 + remaining;
263     opus_fft(kfft, in, out);
264
265     for (i=1;i<N2;i++)
266     {
267        float X1r, X2r, X1i, X2i;
268        float angle, d_angle, d2_angle;
269        float angle2, d_angle2, d2_angle2;
270        float mod1, mod2, avg_mod;
271        X1r = out[i].r+out[N-i].r;
272        X1i = out[i].i-out[N-i].i;
273        X2r = out[i].i+out[N-i].i;
274        X2i = out[N-i].r-out[i].r;
275
276        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
277        d_angle = angle - A[i];
278        d2_angle = d_angle - dA[i];
279
280        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
281        d_angle2 = angle2 - angle;
282        d2_angle2 = d_angle2 - d_angle;
283
284        mod1 = d2_angle - (float)floor(.5+d2_angle);
285        noisiness[i] = ABS16(mod1);
286        mod1 *= mod1;
287        mod1 *= mod1;
288
289        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
290        noisiness[i] += ABS16(mod2);
291        mod2 *= mod2;
292        mod2 *= mod2;
293
294        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
295        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
296
297        A[i] = angle2;
298        dA[i] = d_angle2;
299        d2A[i] = mod2;
300     }
301
302     frame_tonality = 0;
303     max_frame_tonality = 0;
304     /*tw_sum = 0;*/
305     info->activity = 0;
306     frame_noisiness = 0;
307     frame_stationarity = 0;
308     if (!tonal->count)
309     {
310        for (b=0;b<NB_TBANDS;b++)
311        {
312           tonal->lowE[b] = 1e10;
313           tonal->highE[b] = -1e10;
314        }
315     }
316     relativeE = 0;
317     frame_loudness = 0;
318     bandwidth_mask = 0;
319     for (b=0;b<NB_TBANDS;b++)
320     {
321        float E=0, tE=0, nE=0;
322        float L1, L2;
323        float stationarity;
324        for (i=tbands[b];i<tbands[b+1];i++)
325        {
326           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
327                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
328           E += binE;
329           tE += binE*tonality[i];
330           nE += binE*2.f*(.5f-noisiness[i]);
331        }
332        tonal->E[tonal->E_count][b] = E;
333        frame_noisiness += nE/(1e-15f+E);
334
335        frame_loudness += celt_sqrt(E+1e-10f);
336        logE[b] = (float)log(E+1e-10f);
337        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
338        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
339        if (tonal->highE[b] < tonal->lowE[b]+1.f)
340        {
341           tonal->highE[b]+=.5f;
342           tonal->lowE[b]-=.5f;
343        }
344        relativeE += (logE[b]-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
345
346        L1=L2=0;
347        for (i=0;i<NB_FRAMES;i++)
348        {
349           L1 += celt_sqrt(tonal->E[i][b]);
350           L2 += tonal->E[i][b];
351        }
352
353        stationarity = MIN16(0.99f,L1/celt_sqrt(EPSILON+NB_FRAMES*L2));
354        stationarity *= stationarity;
355        stationarity *= stationarity;
356        frame_stationarity += stationarity;
357        /*band_tonality[b] = tE/(1e-15+E)*/;
358        band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
359 #if 0
360        if (b>=NB_TONAL_SKIP_BANDS)
361        {
362           frame_tonality += tweight[b]*band_tonality[b];
363           tw_sum += tweight[b];
364        }
365 #else
366        frame_tonality += band_tonality[b];
367        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
368           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
369 #endif
370        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
371        slope += band_tonality[b]*(b-8);
372        /*printf("%f %f ", band_tonality[b], stationarity);*/
373        tonal->prev_band_tonality[b] = band_tonality[b];
374     }
375
376     bandwidth_mask = 0;
377     bandwidth = 0;
378     for (b=0;b<NB_TOT_BANDS;b++)
379        maxE = MAX32(maxE, tonal->meanE[b]);
380     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
381     noise_floor *= noise_floor;
382     for (b=0;b<NB_TOT_BANDS;b++)
383     {
384        float E=0;
385        int band_start, band_end;
386        /* Keep a margin of 300 Hz for aliasing */
387        band_start = extra_bands[b];
388        band_end = extra_bands[b+1];
389        for (i=band_start;i<band_end;i++)
390        {
391           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
392                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
393           E += binE;
394        }
395        E /= (band_end-band_start);
396        maxE = MAX32(maxE, E);
397        if (tonal->count>2)
398        {
399           tonal->meanE[b] = (1-alphaE2)*tonal->meanE[b] + alphaE2*E;
400        } else {
401           tonal->meanE[b] = E;
402        }
403        E = MAX32(E, tonal->meanE[b]);
404        /* 13 dB slope for spreading function */
405        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
406        /* Checks if band looks like stationary noise or if it's below a (trivial) masking curve */
407        if (E>.1*bandwidth_mask && E*1e10f > maxE && E > noise_floor)
408           bandwidth = b;
409     }
410     if (tonal->count<=2)
411        bandwidth = 20;
412     frame_loudness = 20*(float)log10(frame_loudness);
413     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
414     tonal->lowECount *= (1-alphaE);
415     if (frame_loudness < tonal->Etracker-30)
416        tonal->lowECount += alphaE;
417
418     for (i=0;i<8;i++)
419     {
420        float sum=0;
421        for (b=0;b<16;b++)
422           sum += dct_table[i*16+b]*logE[b];
423        BFCC[i] = sum;
424     }
425
426     frame_stationarity /= NB_TBANDS;
427     relativeE /= NB_TBANDS;
428     if (tonal->count<10)
429        relativeE = .5;
430     frame_noisiness /= NB_TBANDS;
431 #if 1
432     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
433 #else
434     info->activity = .5*(1+frame_noisiness-frame_stationarity);
435 #endif
436     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
437     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
438     tonal->prev_tonality = frame_tonality;
439
440     slope /= 8*8;
441     info->tonality_slope = slope;
442
443     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
444     tonal->count++;
445     info->tonality = frame_tonality;
446
447     for (i=0;i<4;i++)
448        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
449
450     for (i=0;i<4;i++)
451        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
452
453     for (i=0;i<4;i++)
454         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
455     for (i=0;i<3;i++)
456         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
457
458     if (tonal->count > 5)
459     {
460        for (i=0;i<9;i++)
461           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
462     }
463
464     for (i=0;i<8;i++)
465     {
466        tonal->mem[i+24] = tonal->mem[i+16];
467        tonal->mem[i+16] = tonal->mem[i+8];
468        tonal->mem[i+8] = tonal->mem[i];
469        tonal->mem[i] = BFCC[i];
470     }
471     for (i=0;i<9;i++)
472        features[11+i] = celt_sqrt(tonal->std[i]);
473     features[20] = info->tonality;
474     features[21] = info->activity;
475     features[22] = frame_stationarity;
476     features[23] = info->tonality_slope;
477     features[24] = tonal->lowECount;
478
479 #ifndef FIXED_POINT
480     mlp_process(&net, features, frame_probs);
481     frame_probs[0] = .5f*(frame_probs[0]+1);
482     /* Curve fitting between the MLP probability and the actual probability */
483     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
484     frame_probs[1] = .5*frame_probs[1]+.5;
485     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5;
486
487     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
488     {
489        float tau, beta;
490        float p0, p1;
491        float s0, m0;
492        float psum;
493        float speech0;
494        float music0;
495
496        /* One transition every 3 minutes */
497        tau = .00005f*frame_probs[1];
498        beta = .05f;
499        if (1) {
500           /* Adapt beta based on how "unexpected" the new prob is */
501           float p, q;
502           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
503           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
504           beta = .01+.05*ABS16(p-q)/(p*(1-q)+q*(1-p));
505        }
506        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
507        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
508        p0 *= (float)pow(1-frame_probs[0], beta);
509        p1 *= (float)pow(frame_probs[0], beta);
510        tonal->music_prob = p1/(p0+p1);
511        info->music_prob = tonal->music_prob;
512
513        psum=1e-20;
514        speech0 = (float)pow(1-frame_probs[0], beta);
515        music0  = (float)pow(frame_probs[0], beta);
516        if (tonal->count==1)
517        {
518           tonal->pspeech[0]=.5;
519           tonal->pmusic [0]=.5;
520        }
521        s0 = tonal->pspeech[0] + tonal->pspeech[1];
522        m0 = tonal->pmusic [0] + tonal->pmusic [1];
523        tonal->pspeech[0] = s0*(1-tau)*speech0;
524        tonal->pmusic [0] = m0*(1-tau)*music0;
525        for (i=1;i<DETECT_SIZE-1;i++)
526        {
527           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
528           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
529        }
530        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
531        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
532
533        for (i=0;i<DETECT_SIZE;i++)
534           psum += tonal->pspeech[i] + tonal->pmusic[i];
535        psum = 1.f/psum;
536        for (i=0;i<DETECT_SIZE;i++)
537        {
538           tonal->pspeech[i] *= psum;
539           tonal->pmusic [i] *= psum;
540        }
541        psum = tonal->pmusic[0];
542        for (i=1;i<DETECT_SIZE;i++)
543           psum += tonal->pspeech[i];
544
545        /* Estimate our confidence in the speech/music decisions */
546        if (frame_probs[1]>.75)
547        {
548           if (tonal->music_prob>.9)
549           {
550              float adapt;
551              adapt = 1.f/(++tonal->music_confidence_count);
552              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
553              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
554           }
555           if (tonal->music_prob<.1)
556           {
557              float adapt;
558              adapt = 1.f/(++tonal->speech_confidence_count);
559              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
560              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
561           }
562        } else {
563           if (tonal->music_confidence_count==0)
564              tonal->music_confidence = .9;
565           if (tonal->speech_confidence_count==0)
566              tonal->speech_confidence = .1;
567        }
568        psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
569     }
570     if (tonal->last_music != (tonal->music_prob>.5f))
571        tonal->last_transition=0;
572     tonal->last_music = tonal->music_prob>.5f;
573 #else
574     info->music_prob = 0;
575 #endif
576     /*for (i=0;i<25;i++)
577        printf("%f ", features[i]);
578     printf("\n");*/
579
580     if (bandwidth<=12)
581        tonal->opus_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
582     else if (bandwidth<=14)
583        tonal->opus_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
584     else if (bandwidth<=16)
585        tonal->opus_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
586     else if (bandwidth<=18)
587        tonal->opus_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND;
588     else
589        tonal->opus_bandwidth = OPUS_BANDWIDTH_FULLBAND;
590
591     info->bandwidth = bandwidth;
592     info->opus_bandwidth = tonal->opus_bandwidth;
593     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
594     info->noisiness = frame_noisiness;
595     info->valid = 1;
596     if (info_out!=NULL)
597        OPUS_COPY(info_out, info, 1);
598     RESTORE_STACK;
599 }
600
601 int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
602                         const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
603                         int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
604 {
605    int offset;
606    int pcm_len;
607
608    /* Avoid overflow/wrap-around of the analysis buffer */
609    frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
610
611    pcm_len = frame_size - analysis->analysis_offset;
612    offset = 0;
613    do {
614       tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
615       offset += 480;
616       pcm_len -= 480;
617    } while (pcm_len>0);
618    analysis->analysis_offset = frame_size;
619
620    if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
621    {
622       int LM = 3;
623       LM = optimize_framesize((const opus_val16*)pcm, frame_size, C, Fs, bitrate_bps,
624             analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
625       while ((Fs/400<<LM)>frame_size)
626          LM--;
627       frame_size = (Fs/400<<LM);
628    } else {
629       frame_size = frame_size_select(frame_size, variable_duration, Fs);
630    }
631    if (frame_size<0)
632       return -1;
633    analysis->analysis_offset -= frame_size;
634
635    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
636       they're in CELT-only mode. */
637    analysis_info->valid = 0;
638    tonality_get_info(analysis, analysis_info, frame_size);
639
640    return frame_size;
641 }