Commenting the speech/music Markov code
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 extern const MLP net;
43
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47
48 static const float dct_table[128] = {
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66
67 static const float analysis_window[240] = {
68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99
100 static const int tbands[NB_TBANDS+1] = {
101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107
108 /*static const float tweight[NB_TBANDS+1] = {
109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111
112 #define NB_TONAL_SKIP_BANDS 9
113
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
118 static inline float fast_atan2f(float y, float x) {
119    float x2, y2;
120    /* Should avoid underflow on the values we'll get */
121    if (ABS16(x)+ABS16(y)<1e-9f)
122    {
123       x*=1e12f;
124       y*=1e12f;
125    }
126    x2 = x*x;
127    y2 = y*y;
128    if(x2<y2){
129       float den = (y2 + cB*x2) * (y2 + cC*x2);
130       if (den!=0)
131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132       else
133          return (y<0 ? -cE : cE);
134    }else{
135       float den = (x2 + cB*y2) * (x2 + cC*y2);
136       if (den!=0)
137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138       else
139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140    }
141 }
142
143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145    int pos;
146    int curr_lookahead;
147    float psum;
148    int i;
149
150    pos = tonal->read_pos;
151    curr_lookahead = tonal->write_pos-tonal->read_pos;
152    if (curr_lookahead<0)
153       curr_lookahead += DETECT_SIZE;
154
155    if (len > 480 && pos != tonal->write_pos)
156    {
157       pos++;
158       if (pos==DETECT_SIZE)
159          pos=0;
160    }
161    if (pos == tonal->write_pos)
162       pos--;
163    if (pos<0)
164       pos = DETECT_SIZE-1;
165    OPUS_COPY(info_out, &tonal->info[pos], 1);
166    tonal->read_subframe += len/120;
167    while (tonal->read_subframe>=4)
168    {
169       tonal->read_subframe -= 4;
170       tonal->read_pos++;
171    }
172    if (tonal->read_pos>=DETECT_SIZE)
173       tonal->read_pos-=DETECT_SIZE;
174
175    /* Compensate for the delay in the features themselves.
176       FIXME: Need a better estimate the 10 I just made up */
177    curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179    psum=0;
180    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
181       psum += tonal->pmusic[i];
182    for (;i<DETECT_SIZE;i++)
183       psum += tonal->pspeech[i];
184    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
185    /*printf("%f %f\n", psum, info_out->music_prob);*/
186
187    info_out->music_prob = psum;
188 }
189
190 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
191 {
192     int i, b;
193     const kiss_fft_state *kfft;
194     VARDECL(kiss_fft_cpx, in);
195     VARDECL(kiss_fft_cpx, out);
196     int N = 480, N2=240;
197     float * OPUS_RESTRICT A = tonal->angle;
198     float * OPUS_RESTRICT dA = tonal->d_angle;
199     float * OPUS_RESTRICT d2A = tonal->d2_angle;
200     VARDECL(float, tonality);
201     VARDECL(float, noisiness);
202     float band_tonality[NB_TBANDS];
203     float logE[NB_TBANDS];
204     float BFCC[8];
205     float features[25];
206     float frame_tonality;
207     float max_frame_tonality;
208     /*float tw_sum=0;*/
209     float frame_noisiness;
210     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
211     float slope=0;
212     float frame_stationarity;
213     float relativeE;
214     float frame_probs[2];
215     float alpha, alphaE, alphaE2;
216     float frame_loudness;
217     float bandwidth_mask;
218     int bandwidth=0;
219     float maxE = 0;
220     float noise_floor;
221     int remaining;
222     AnalysisInfo *info;
223     SAVE_STACK;
224
225     tonal->last_transition++;
226     alpha = 1.f/IMIN(20, 1+tonal->count);
227     alphaE = 1.f/IMIN(50, 1+tonal->count);
228     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
229
230     if (tonal->count<4)
231        tonal->music_prob = .5;
232     kfft = celt_mode->mdct.kfft[0];
233     if (tonal->count==0)
234        tonal->mem_fill = 240;
235     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
236     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
237     {
238        tonal->mem_fill += len;
239        /* Don't have enough to update the analysis */
240        RESTORE_STACK;
241        return;
242     }
243     info = &tonal->info[tonal->write_pos++];
244     if (tonal->write_pos>=DETECT_SIZE)
245        tonal->write_pos-=DETECT_SIZE;
246
247     ALLOC(in, 480, kiss_fft_cpx);
248     ALLOC(out, 480, kiss_fft_cpx);
249     ALLOC(tonality, 240, float);
250     ALLOC(noisiness, 240, float);
251     for (i=0;i<N2;i++)
252     {
253        float w = analysis_window[i];
254        in[i].r = MULT16_16(w, tonal->inmem[i]);
255        in[i].i = MULT16_16(w, tonal->inmem[N2+i]);
256        in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]);
257        in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]);
258     }
259     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
260     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
261     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
262     tonal->mem_fill = 240 + remaining;
263     opus_fft(kfft, in, out);
264
265     for (i=1;i<N2;i++)
266     {
267        float X1r, X2r, X1i, X2i;
268        float angle, d_angle, d2_angle;
269        float angle2, d_angle2, d2_angle2;
270        float mod1, mod2, avg_mod;
271        X1r = out[i].r+out[N-i].r;
272        X1i = out[i].i-out[N-i].i;
273        X2r = out[i].i+out[N-i].i;
274        X2i = out[N-i].r-out[i].r;
275
276        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
277        d_angle = angle - A[i];
278        d2_angle = d_angle - dA[i];
279
280        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
281        d_angle2 = angle2 - angle;
282        d2_angle2 = d_angle2 - d_angle;
283
284        mod1 = d2_angle - (float)floor(.5+d2_angle);
285        noisiness[i] = ABS16(mod1);
286        mod1 *= mod1;
287        mod1 *= mod1;
288
289        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
290        noisiness[i] += ABS16(mod2);
291        mod2 *= mod2;
292        mod2 *= mod2;
293
294        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
295        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
296
297        A[i] = angle2;
298        dA[i] = d_angle2;
299        d2A[i] = mod2;
300     }
301
302     frame_tonality = 0;
303     max_frame_tonality = 0;
304     /*tw_sum = 0;*/
305     info->activity = 0;
306     frame_noisiness = 0;
307     frame_stationarity = 0;
308     if (!tonal->count)
309     {
310        for (b=0;b<NB_TBANDS;b++)
311        {
312           tonal->lowE[b] = 1e10;
313           tonal->highE[b] = -1e10;
314        }
315     }
316     relativeE = 0;
317     frame_loudness = 0;
318     bandwidth_mask = 0;
319     for (b=0;b<NB_TBANDS;b++)
320     {
321        float E=0, tE=0, nE=0;
322        float L1, L2;
323        float stationarity;
324        for (i=tbands[b];i<tbands[b+1];i++)
325        {
326           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
327                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
328           E += binE;
329           tE += binE*tonality[i];
330           nE += binE*2.f*(.5f-noisiness[i]);
331        }
332        tonal->E[tonal->E_count][b] = E;
333        frame_noisiness += nE/(1e-15f+E);
334
335        frame_loudness += celt_sqrt(E+1e-10f);
336        logE[b] = (float)log(E+1e-10f);
337        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
338        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
339        if (tonal->highE[b] < tonal->lowE[b]+1.f)
340        {
341           tonal->highE[b]+=.5f;
342           tonal->lowE[b]-=.5f;
343        }
344        relativeE += (logE[b]-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
345
346        L1=L2=0;
347        for (i=0;i<NB_FRAMES;i++)
348        {
349           L1 += celt_sqrt(tonal->E[i][b]);
350           L2 += tonal->E[i][b];
351        }
352
353        stationarity = MIN16(0.99f,L1/celt_sqrt(EPSILON+NB_FRAMES*L2));
354        stationarity *= stationarity;
355        stationarity *= stationarity;
356        frame_stationarity += stationarity;
357        /*band_tonality[b] = tE/(1e-15+E)*/;
358        band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
359 #if 0
360        if (b>=NB_TONAL_SKIP_BANDS)
361        {
362           frame_tonality += tweight[b]*band_tonality[b];
363           tw_sum += tweight[b];
364        }
365 #else
366        frame_tonality += band_tonality[b];
367        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
368           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
369 #endif
370        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
371        slope += band_tonality[b]*(b-8);
372        /*printf("%f %f ", band_tonality[b], stationarity);*/
373        tonal->prev_band_tonality[b] = band_tonality[b];
374     }
375
376     bandwidth_mask = 0;
377     bandwidth = 0;
378     maxE = 0;
379     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
380     noise_floor *= noise_floor;
381     for (b=0;b<NB_TOT_BANDS;b++)
382     {
383        float E=0;
384        int band_start, band_end;
385        /* Keep a margin of 300 Hz for aliasing */
386        band_start = extra_bands[b];
387        band_end = extra_bands[b+1];
388        for (i=band_start;i<band_end;i++)
389        {
390           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
391                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
392           E += binE;
393        }
394        maxE = MAX32(maxE, E);
395        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
396        E = MAX32(E, tonal->meanE[b]);
397        /* Use a simple follower with 13 dB/Bark slope for spreading function */
398        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
399        /* Consider the band "active" only if all these conditions are met:
400           1) less than 10 dB below the simple follower
401           2) less than 90 dB below the peak band (maximal masking possible considering
402              both the ATH and the loudness-dependent slope of the spreading function)
403           3) above the PCM quantization noise floor
404        */
405        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
406           bandwidth = b;
407     }
408     if (tonal->count<=2)
409        bandwidth = 20;
410     frame_loudness = 20*(float)log10(frame_loudness);
411     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
412     tonal->lowECount *= (1-alphaE);
413     if (frame_loudness < tonal->Etracker-30)
414        tonal->lowECount += alphaE;
415
416     for (i=0;i<8;i++)
417     {
418        float sum=0;
419        for (b=0;b<16;b++)
420           sum += dct_table[i*16+b]*logE[b];
421        BFCC[i] = sum;
422     }
423
424     frame_stationarity /= NB_TBANDS;
425     relativeE /= NB_TBANDS;
426     if (tonal->count<10)
427        relativeE = .5;
428     frame_noisiness /= NB_TBANDS;
429 #if 1
430     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
431 #else
432     info->activity = .5*(1+frame_noisiness-frame_stationarity);
433 #endif
434     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
435     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
436     tonal->prev_tonality = frame_tonality;
437
438     slope /= 8*8;
439     info->tonality_slope = slope;
440
441     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
442     tonal->count++;
443     info->tonality = frame_tonality;
444
445     for (i=0;i<4;i++)
446        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
447
448     for (i=0;i<4;i++)
449        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
450
451     for (i=0;i<4;i++)
452         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
453     for (i=0;i<3;i++)
454         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
455
456     if (tonal->count > 5)
457     {
458        for (i=0;i<9;i++)
459           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
460     }
461
462     for (i=0;i<8;i++)
463     {
464        tonal->mem[i+24] = tonal->mem[i+16];
465        tonal->mem[i+16] = tonal->mem[i+8];
466        tonal->mem[i+8] = tonal->mem[i];
467        tonal->mem[i] = BFCC[i];
468     }
469     for (i=0;i<9;i++)
470        features[11+i] = celt_sqrt(tonal->std[i]);
471     features[20] = info->tonality;
472     features[21] = info->activity;
473     features[22] = frame_stationarity;
474     features[23] = info->tonality_slope;
475     features[24] = tonal->lowECount;
476
477 #ifndef FIXED_POINT
478     mlp_process(&net, features, frame_probs);
479     frame_probs[0] = .5f*(frame_probs[0]+1);
480     /* Curve fitting between the MLP probability and the actual probability */
481     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
482     frame_probs[1] = .5f*frame_probs[1]+.5f;
483     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
484
485     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
486     {
487        /* Probability of state transition */
488        float tau;
489        /* Represents independence of the MLP probabilities, where
490           beta=1 means fully independent. */
491        float beta;
492        /* Denormalized probability of speech (p0) and music (p1) after update */
493        float p0, p1;
494        /* Delayed decision variables */
495        float s0, m0;
496        float psum;
497        float speech0;
498        float music0;
499
500        /* One transition every 3 minutes */
501        tau = .00005f*frame_probs[1];
502        beta = .05f;
503        if (1) {
504           /* Adapt beta based on how "unexpected" the new prob is */
505           float p, q;
506           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
507           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
508           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
509        }
510        /* p0 and p1 are the probabilities of speech and music at this frame
511           using only information from previous frame and applying the
512           state transition model */
513        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
514        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
515        /* We apply the current probability with exponent beta to work around
516           the fact that the probability estimates aren't independent. */
517        p0 *= (float)pow(1-frame_probs[0], beta);
518        p1 *= (float)pow(frame_probs[0], beta);
519        /* Normalise the probabilities to get the Marokv probability of music. */
520        tonal->music_prob = p1/(p0+p1);
521        info->music_prob = tonal->music_prob;
522
523        /* This chunk of code deals with delayed decision. */
524        psum=1e-20f;
525        speech0 = (float)pow(1-frame_probs[0], beta);
526        music0  = (float)pow(frame_probs[0], beta);
527        if (tonal->count==1)
528        {
529           tonal->pspeech[0]=.5;
530           tonal->pmusic [0]=.5;
531        }
532        s0 = tonal->pspeech[0] + tonal->pspeech[1];
533        m0 = tonal->pmusic [0] + tonal->pmusic [1];
534        tonal->pspeech[0] = s0*(1-tau)*speech0;
535        tonal->pmusic [0] = m0*(1-tau)*music0;
536        for (i=1;i<DETECT_SIZE-1;i++)
537        {
538           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
539           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
540        }
541        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
542        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
543
544        for (i=0;i<DETECT_SIZE;i++)
545           psum += tonal->pspeech[i] + tonal->pmusic[i];
546        psum = 1.f/psum;
547        for (i=0;i<DETECT_SIZE;i++)
548        {
549           tonal->pspeech[i] *= psum;
550           tonal->pmusic [i] *= psum;
551        }
552        psum = tonal->pmusic[0];
553        for (i=1;i<DETECT_SIZE;i++)
554           psum += tonal->pspeech[i];
555
556        /* Estimate our confidence in the speech/music decisions */
557        if (frame_probs[1]>.75)
558        {
559           if (tonal->music_prob>.9)
560           {
561              float adapt;
562              adapt = 1.f/(++tonal->music_confidence_count);
563              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
564              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
565           }
566           if (tonal->music_prob<.1)
567           {
568              float adapt;
569              adapt = 1.f/(++tonal->speech_confidence_count);
570              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
571              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
572           }
573        } else {
574           if (tonal->music_confidence_count==0)
575              tonal->music_confidence = .9f;
576           if (tonal->speech_confidence_count==0)
577              tonal->speech_confidence = .1f;
578        }
579        psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
580     }
581     if (tonal->last_music != (tonal->music_prob>.5f))
582        tonal->last_transition=0;
583     tonal->last_music = tonal->music_prob>.5f;
584 #else
585     info->music_prob = 0;
586 #endif
587     /*for (i=0;i<25;i++)
588        printf("%f ", features[i]);
589     printf("\n");*/
590
591     info->bandwidth = bandwidth;
592     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
593     info->noisiness = frame_noisiness;
594     info->valid = 1;
595     if (info_out!=NULL)
596        OPUS_COPY(info_out, info, 1);
597     RESTORE_STACK;
598 }
599
600 int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
601                         const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
602                         int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
603 {
604    int offset;
605    int pcm_len;
606
607    /* Avoid overflow/wrap-around of the analysis buffer */
608    frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
609
610    pcm_len = frame_size - analysis->analysis_offset;
611    offset = 0;
612    do {
613       tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
614       offset += 480;
615       pcm_len -= 480;
616    } while (pcm_len>0);
617    analysis->analysis_offset = frame_size;
618
619    if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
620    {
621       int LM = 3;
622       LM = optimize_framesize((const opus_val16*)pcm, frame_size, C, Fs, bitrate_bps,
623             analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
624       while ((Fs/400<<LM)>frame_size)
625          LM--;
626       frame_size = (Fs/400<<LM);
627    } else {
628       frame_size = frame_size_select(frame_size, variable_duration, Fs);
629    }
630    if (frame_size<0)
631       return -1;
632    analysis->analysis_offset -= frame_size;
633
634    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
635       they're in CELT-only mode. */
636    analysis_info->valid = 0;
637    tonality_get_info(analysis, analysis_info, frame_size);
638
639    return frame_size;
640 }