Simplifying fast_atan2f()
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    x2 = x*x;
119    y2 = y*y;
120    /* For very small values, we don't care about the answer, so
121       we can just return 0. */
122    if (x2 + y2 < 1e-18f)
123    {
124       return 0;
125    }
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
129    }else{
130       float den = (x2 + cB*y2) * (x2 + cC*y2);
131       return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
132    }
133 }
134
135 void tonality_analysis_init(TonalityAnalysisState *tonal)
136 {
137   /* Initialize reusable fields. */
138   tonal->arch = opus_select_arch();
139   /* Clear remaining fields. */
140   tonality_analysis_reset(tonal);
141 }
142
143 void tonality_analysis_reset(TonalityAnalysisState *tonal)
144 {
145   /* Clear non-reusable fields. */
146   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
147   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
148 }
149
150 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
151 {
152    int pos;
153    int curr_lookahead;
154    float psum;
155    int i;
156
157    pos = tonal->read_pos;
158    curr_lookahead = tonal->write_pos-tonal->read_pos;
159    if (curr_lookahead<0)
160       curr_lookahead += DETECT_SIZE;
161
162    if (len > 480 && pos != tonal->write_pos)
163    {
164       pos++;
165       if (pos==DETECT_SIZE)
166          pos=0;
167    }
168    if (pos == tonal->write_pos)
169       pos--;
170    if (pos<0)
171       pos = DETECT_SIZE-1;
172    OPUS_COPY(info_out, &tonal->info[pos], 1);
173    tonal->read_subframe += len/120;
174    while (tonal->read_subframe>=4)
175    {
176       tonal->read_subframe -= 4;
177       tonal->read_pos++;
178    }
179    if (tonal->read_pos>=DETECT_SIZE)
180       tonal->read_pos-=DETECT_SIZE;
181
182    /* Compensate for the delay in the features themselves.
183       FIXME: Need a better estimate the 10 I just made up */
184    curr_lookahead = IMAX(curr_lookahead-10, 0);
185
186    psum=0;
187    /* Summing the probability of transition patterns that involve music at
188       time (DETECT_SIZE-curr_lookahead-1) */
189    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
190       psum += tonal->pmusic[i];
191    for (;i<DETECT_SIZE;i++)
192       psum += tonal->pspeech[i];
193    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
194    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
195
196    info_out->music_prob = psum;
197 }
198
199 static const float std_feature_bias[9] = {
200       5.684947, 3.475288, 1.770634, 1.599784, 3.773215,
201       2.163313, 1.260756, 1.116868, 1.918795
202 };
203
204 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
205 {
206     int i, b;
207     const kiss_fft_state *kfft;
208     VARDECL(kiss_fft_cpx, in);
209     VARDECL(kiss_fft_cpx, out);
210     int N = 480, N2=240;
211     float * OPUS_RESTRICT A = tonal->angle;
212     float * OPUS_RESTRICT dA = tonal->d_angle;
213     float * OPUS_RESTRICT d2A = tonal->d2_angle;
214     VARDECL(float, tonality);
215     VARDECL(float, noisiness);
216     float band_tonality[NB_TBANDS];
217     float logE[NB_TBANDS];
218     float BFCC[8];
219     float features[25];
220     float frame_tonality;
221     float max_frame_tonality;
222     /*float tw_sum=0;*/
223     float frame_noisiness;
224     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
225     float slope=0;
226     float frame_stationarity;
227     float relativeE;
228     float frame_probs[2];
229     float alpha, alphaE, alphaE2;
230     float frame_loudness;
231     float bandwidth_mask;
232     int bandwidth=0;
233     float maxE = 0;
234     float noise_floor;
235     int remaining;
236     AnalysisInfo *info;
237     SAVE_STACK;
238
239     tonal->last_transition++;
240     alpha = 1.f/IMIN(20, 1+tonal->count);
241     alphaE = 1.f/IMIN(50, 1+tonal->count);
242     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
243
244     if (tonal->count<4)
245        tonal->music_prob = .5;
246     kfft = celt_mode->mdct.kfft[0];
247     if (tonal->count==0)
248        tonal->mem_fill = 240;
249     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
250     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
251     {
252        tonal->mem_fill += len;
253        /* Don't have enough to update the analysis */
254        RESTORE_STACK;
255        return;
256     }
257     info = &tonal->info[tonal->write_pos++];
258     if (tonal->write_pos>=DETECT_SIZE)
259        tonal->write_pos-=DETECT_SIZE;
260
261     ALLOC(in, 480, kiss_fft_cpx);
262     ALLOC(out, 480, kiss_fft_cpx);
263     ALLOC(tonality, 240, float);
264     ALLOC(noisiness, 240, float);
265     for (i=0;i<N2;i++)
266     {
267        float w = analysis_window[i];
268        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
269        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
270        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
271        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
272     }
273     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
274     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
275     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
276     tonal->mem_fill = 240 + remaining;
277     opus_fft(kfft, in, out, tonal->arch);
278 #ifndef FIXED_POINT
279     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
280     if (celt_isnan(out[0].r))
281     {
282        info->valid = 0;
283        RESTORE_STACK;
284        return;
285     }
286 #endif
287
288     for (i=1;i<N2;i++)
289     {
290        float X1r, X2r, X1i, X2i;
291        float angle, d_angle, d2_angle;
292        float angle2, d_angle2, d2_angle2;
293        float mod1, mod2, avg_mod;
294        X1r = (float)out[i].r+out[N-i].r;
295        X1i = (float)out[i].i-out[N-i].i;
296        X2r = (float)out[i].i+out[N-i].i;
297        X2i = (float)out[N-i].r-out[i].r;
298
299        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
300        d_angle = angle - A[i];
301        d2_angle = d_angle - dA[i];
302
303        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
304        d_angle2 = angle2 - angle;
305        d2_angle2 = d_angle2 - d_angle;
306
307        mod1 = d2_angle - (float)floor(.5+d2_angle);
308        noisiness[i] = ABS16(mod1);
309        mod1 *= mod1;
310        mod1 *= mod1;
311
312        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
313        noisiness[i] += ABS16(mod2);
314        mod2 *= mod2;
315        mod2 *= mod2;
316
317        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
318        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
319
320        A[i] = angle2;
321        dA[i] = d_angle2;
322        d2A[i] = mod2;
323     }
324
325     frame_tonality = 0;
326     max_frame_tonality = 0;
327     /*tw_sum = 0;*/
328     info->activity = 0;
329     frame_noisiness = 0;
330     frame_stationarity = 0;
331     if (!tonal->count)
332     {
333        for (b=0;b<NB_TBANDS;b++)
334        {
335           tonal->lowE[b] = 1e10;
336           tonal->highE[b] = -1e10;
337        }
338     }
339     relativeE = 0;
340     frame_loudness = 0;
341     for (b=0;b<NB_TBANDS;b++)
342     {
343        float E=0, tE=0, nE=0;
344        float L1, L2;
345        float stationarity;
346        for (i=tbands[b];i<tbands[b+1];i++)
347        {
348           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
349                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
350 #ifdef FIXED_POINT
351           /* FIXME: It's probably best to change the BFCC filter initial state instead */
352           binE *= 5.55e-17f;
353 #endif
354           E += binE;
355           tE += binE*tonality[i];
356           nE += binE*2.f*(.5f-noisiness[i]);
357        }
358 #ifndef FIXED_POINT
359        /* Check for extreme band energies that could cause NaNs later. */
360        if (!(E<1e9f) || celt_isnan(E))
361        {
362           info->valid = 0;
363           RESTORE_STACK;
364           return;
365        }
366 #endif
367
368        tonal->E[tonal->E_count][b] = E;
369        frame_noisiness += nE/(1e-15f+E);
370
371        frame_loudness += (float)sqrt(E+1e-10f);
372        logE[b] = (float)log(E+1e-10f);
373        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
374        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
375        if (tonal->highE[b] < tonal->lowE[b]+1.f)
376        {
377           tonal->highE[b]+=.5f;
378           tonal->lowE[b]-=.5f;
379        }
380        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
381
382        L1=L2=0;
383        for (i=0;i<NB_FRAMES;i++)
384        {
385           L1 += (float)sqrt(tonal->E[i][b]);
386           L2 += tonal->E[i][b];
387        }
388
389        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
390        stationarity *= stationarity;
391        stationarity *= stationarity;
392        frame_stationarity += stationarity;
393        /*band_tonality[b] = tE/(1e-15+E)*/;
394        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
395 #if 0
396        if (b>=NB_TONAL_SKIP_BANDS)
397        {
398           frame_tonality += tweight[b]*band_tonality[b];
399           tw_sum += tweight[b];
400        }
401 #else
402        frame_tonality += band_tonality[b];
403        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
404           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
405 #endif
406        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
407        slope += band_tonality[b]*(b-8);
408        /*printf("%f %f ", band_tonality[b], stationarity);*/
409        tonal->prev_band_tonality[b] = band_tonality[b];
410     }
411
412     bandwidth_mask = 0;
413     bandwidth = 0;
414     maxE = 0;
415     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
416 #ifdef FIXED_POINT
417     noise_floor *= 1<<(15+SIG_SHIFT);
418 #endif
419     noise_floor *= noise_floor;
420     for (b=0;b<NB_TOT_BANDS;b++)
421     {
422        float E=0;
423        int band_start, band_end;
424        /* Keep a margin of 300 Hz for aliasing */
425        band_start = extra_bands[b];
426        band_end = extra_bands[b+1];
427        for (i=band_start;i<band_end;i++)
428        {
429           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
430                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
431           E += binE;
432        }
433        maxE = MAX32(maxE, E);
434        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
435        E = MAX32(E, tonal->meanE[b]);
436        /* Use a simple follower with 13 dB/Bark slope for spreading function */
437        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
438        /* Consider the band "active" only if all these conditions are met:
439           1) less than 10 dB below the simple follower
440           2) less than 90 dB below the peak band (maximal masking possible considering
441              both the ATH and the loudness-dependent slope of the spreading function)
442           3) above the PCM quantization noise floor
443        */
444        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
445           bandwidth = b;
446     }
447     if (tonal->count<=2)
448        bandwidth = 20;
449     frame_loudness = 20*(float)log10(frame_loudness);
450     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
451     tonal->lowECount *= (1-alphaE);
452     if (frame_loudness < tonal->Etracker-30)
453        tonal->lowECount += alphaE;
454
455     for (i=0;i<8;i++)
456     {
457        float sum=0;
458        for (b=0;b<16;b++)
459           sum += dct_table[i*16+b]*logE[b];
460        BFCC[i] = sum;
461     }
462
463     frame_stationarity /= NB_TBANDS;
464     relativeE /= NB_TBANDS;
465     if (tonal->count<10)
466        relativeE = .5;
467     frame_noisiness /= NB_TBANDS;
468 #if 1
469     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
470 #else
471     info->activity = .5*(1+frame_noisiness-frame_stationarity);
472 #endif
473     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
474     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
475     tonal->prev_tonality = frame_tonality;
476
477     slope /= 8*8;
478     info->tonality_slope = slope;
479
480     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
481     tonal->count++;
482     info->tonality = frame_tonality;
483
484     for (i=0;i<4;i++)
485        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
486
487     for (i=0;i<4;i++)
488        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
489
490     for (i=0;i<4;i++)
491         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
492     for (i=0;i<3;i++)
493         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
494
495     if (tonal->count > 5)
496     {
497        for (i=0;i<9;i++)
498           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
499     }
500
501     for (i=0;i<8;i++)
502     {
503        tonal->mem[i+24] = tonal->mem[i+16];
504        tonal->mem[i+16] = tonal->mem[i+8];
505        tonal->mem[i+8] = tonal->mem[i];
506        tonal->mem[i] = BFCC[i];
507     }
508     for (i=0;i<9;i++)
509        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
510     features[20] = info->tonality - 0.154723;
511     features[21] = info->activity - 0.724643;
512     features[22] = frame_stationarity - 0.743717;
513     features[23] = info->tonality_slope + 0.069216;
514     features[24] = tonal->lowECount - 0.067930;
515
516 #ifndef DISABLE_FLOAT_API
517     mlp_process(&net, features, frame_probs);
518     frame_probs[0] = .5f*(frame_probs[0]+1);
519     /* Curve fitting between the MLP probability and the actual probability */
520     /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
521     /* Probability of active audio (as opposed to silence) */
522     frame_probs[1] = .5f*frame_probs[1]+.5f;
523     frame_probs[1] *= frame_probs[1];
524     /* Consider that silence has a 50-50 probability. */
525     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527     /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
528     {
529        /* Probability of state transition */
530        float tau;
531        /* Represents independence of the MLP probabilities, where
532           beta=1 means fully independent. */
533        float beta;
534        /* Denormalized probability of speech (p0) and music (p1) after update */
535        float p0, p1;
536        /* Probabilities for "all speech" and "all music" */
537        float s0, m0;
538        /* Probability sum for renormalisation */
539        float psum;
540        /* Instantaneous probability of speech and music, with beta pre-applied. */
541        float speech0;
542        float music0;
543        float p, q;
544
545        /* One transition every 3 minutes of active audio */
546        tau = .00005f*frame_probs[1];
547        /* Adapt beta based on how "unexpected" the new prob is */
548        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
549        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
550        beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
551        /* p0 and p1 are the probabilities of speech and music at this frame
552           using only information from previous frame and applying the
553           state transition model */
554        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
555        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
556        /* We apply the current probability with exponent beta to work around
557           the fact that the probability estimates aren't independent. */
558        p0 *= (float)pow(1-frame_probs[0], beta);
559        p1 *= (float)pow(frame_probs[0], beta);
560        /* Normalise the probabilities to get the Marokv probability of music. */
561        tonal->music_prob = p1/(p0+p1);
562        info->music_prob = tonal->music_prob;
563
564        /* This chunk of code deals with delayed decision. */
565        psum=1e-20f;
566        /* Instantaneous probability of speech and music, with beta pre-applied. */
567        speech0 = (float)pow(1-frame_probs[0], beta);
568        music0  = (float)pow(frame_probs[0], beta);
569        if (tonal->count==1)
570        {
571           tonal->pspeech[0]=.5;
572           tonal->pmusic [0]=.5;
573        }
574        /* Updated probability of having only speech (s0) or only music (m0),
575           before considering the new observation. */
576        s0 = tonal->pspeech[0] + tonal->pspeech[1];
577        m0 = tonal->pmusic [0] + tonal->pmusic [1];
578        /* Updates s0 and m0 with instantaneous probability. */
579        tonal->pspeech[0] = s0*(1-tau)*speech0;
580        tonal->pmusic [0] = m0*(1-tau)*music0;
581        /* Propagate the transition probabilities */
582        for (i=1;i<DETECT_SIZE-1;i++)
583        {
584           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
585           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
586        }
587        /* Probability that the latest frame is speech, when all the previous ones were music. */
588        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
589        /* Probability that the latest frame is music, when all the previous ones were speech. */
590        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
591
592        /* Renormalise probabilities to 1 */
593        for (i=0;i<DETECT_SIZE;i++)
594           psum += tonal->pspeech[i] + tonal->pmusic[i];
595        psum = 1.f/psum;
596        for (i=0;i<DETECT_SIZE;i++)
597        {
598           tonal->pspeech[i] *= psum;
599           tonal->pmusic [i] *= psum;
600        }
601        psum = tonal->pmusic[0];
602        for (i=1;i<DETECT_SIZE;i++)
603           psum += tonal->pspeech[i];
604
605        /* Estimate our confidence in the speech/music decisions */
606        if (frame_probs[1]>.75)
607        {
608           if (tonal->music_prob>.9)
609           {
610              float adapt;
611              adapt = 1.f/(++tonal->music_confidence_count);
612              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
613              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
614           }
615           if (tonal->music_prob<.1)
616           {
617              float adapt;
618              adapt = 1.f/(++tonal->speech_confidence_count);
619              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
620              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
621           }
622        } else {
623           if (tonal->music_confidence_count==0)
624              tonal->music_confidence = .9f;
625           if (tonal->speech_confidence_count==0)
626              tonal->speech_confidence = .1f;
627        }
628     }
629     if (tonal->last_music != (tonal->music_prob>.5f))
630        tonal->last_transition=0;
631     tonal->last_music = tonal->music_prob>.5f;
632 #else
633     info->music_prob = 0;
634 #endif
635     /*for (i=0;i<25;i++)
636        printf("%f ", features[i]);
637     printf("\n");*/
638
639     info->bandwidth = bandwidth;
640     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
641     info->noisiness = frame_noisiness;
642     info->valid = 1;
643     RESTORE_STACK;
644 }
645
646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
647                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
648                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
649 {
650    int offset;
651    int pcm_len;
652
653    if (analysis_pcm != NULL)
654    {
655       /* Avoid overflow/wrap-around of the analysis buffer */
656       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
657
658       pcm_len = analysis_frame_size - analysis->analysis_offset;
659       offset = analysis->analysis_offset;
660       do {
661          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
662          offset += 480;
663          pcm_len -= 480;
664       } while (pcm_len>0);
665       analysis->analysis_offset = analysis_frame_size;
666
667       analysis->analysis_offset -= frame_size;
668    }
669
670    analysis_info->valid = 0;
671    tonality_get_info(analysis, analysis_info, frame_size);
672 }