Quality: Retrained classifier MLP with better data
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    /* Should avoid underflow on the values we'll get */
119    if (ABS16(x)+ABS16(y)<1e-9f)
120    {
121       x*=1e12f;
122       y*=1e12f;
123    }
124    x2 = x*x;
125    y2 = y*y;
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       if (den!=0)
129          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130       else
131          return (y<0 ? -cE : cE);
132    }else{
133       float den = (x2 + cB*y2) * (x2 + cC*y2);
134       if (den!=0)
135          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136       else
137          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138    }
139 }
140
141 void tonality_analysis_init(TonalityAnalysisState *tonal)
142 {
143   /* Initialize reusable fields. */
144   tonal->arch = opus_select_arch();
145   /* Clear remaining fields. */
146   tonality_analysis_reset(tonal);
147 }
148
149 void tonality_analysis_reset(TonalityAnalysisState *tonal)
150 {
151   /* Clear non-reusable fields. */
152   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154 }
155
156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157 {
158    int pos;
159    int curr_lookahead;
160    float psum;
161    int i;
162
163    pos = tonal->read_pos;
164    curr_lookahead = tonal->write_pos-tonal->read_pos;
165    if (curr_lookahead<0)
166       curr_lookahead += DETECT_SIZE;
167
168    if (len > 480 && pos != tonal->write_pos)
169    {
170       pos++;
171       if (pos==DETECT_SIZE)
172          pos=0;
173    }
174    if (pos == tonal->write_pos)
175       pos--;
176    if (pos<0)
177       pos = DETECT_SIZE-1;
178    OPUS_COPY(info_out, &tonal->info[pos], 1);
179    tonal->read_subframe += len/120;
180    while (tonal->read_subframe>=4)
181    {
182       tonal->read_subframe -= 4;
183       tonal->read_pos++;
184    }
185    if (tonal->read_pos>=DETECT_SIZE)
186       tonal->read_pos-=DETECT_SIZE;
187
188    /* Compensate for the delay in the features themselves.
189       FIXME: Need a better estimate the 10 I just made up */
190    curr_lookahead = IMAX(curr_lookahead-10, 0);
191
192    psum=0;
193    /* Summing the probability of transition patterns that involve music at
194       time (DETECT_SIZE-curr_lookahead-1) */
195    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196       psum += tonal->pmusic[i];
197    for (;i<DETECT_SIZE;i++)
198       psum += tonal->pspeech[i];
199    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201
202    info_out->music_prob = psum;
203 }
204
205 static const float std_feature_bias[9] = {
206       5.684947, 3.475288, 1.770634, 1.599784, 3.773215,
207       2.163313, 1.260756, 1.116868, 1.918795
208 };
209
210 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
211 {
212     int i, b;
213     const kiss_fft_state *kfft;
214     VARDECL(kiss_fft_cpx, in);
215     VARDECL(kiss_fft_cpx, out);
216     int N = 480, N2=240;
217     float * OPUS_RESTRICT A = tonal->angle;
218     float * OPUS_RESTRICT dA = tonal->d_angle;
219     float * OPUS_RESTRICT d2A = tonal->d2_angle;
220     VARDECL(float, tonality);
221     VARDECL(float, noisiness);
222     float band_tonality[NB_TBANDS];
223     float logE[NB_TBANDS];
224     float BFCC[8];
225     float features[25];
226     float frame_tonality;
227     float max_frame_tonality;
228     /*float tw_sum=0;*/
229     float frame_noisiness;
230     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
231     float slope=0;
232     float frame_stationarity;
233     float relativeE;
234     float frame_probs[2];
235     float alpha, alphaE, alphaE2;
236     float frame_loudness;
237     float bandwidth_mask;
238     int bandwidth=0;
239     float maxE = 0;
240     float noise_floor;
241     int remaining;
242     AnalysisInfo *info;
243     SAVE_STACK;
244
245     tonal->last_transition++;
246     alpha = 1.f/IMIN(20, 1+tonal->count);
247     alphaE = 1.f/IMIN(50, 1+tonal->count);
248     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
249
250     if (tonal->count<4)
251        tonal->music_prob = .5;
252     kfft = celt_mode->mdct.kfft[0];
253     if (tonal->count==0)
254        tonal->mem_fill = 240;
255     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
256     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
257     {
258        tonal->mem_fill += len;
259        /* Don't have enough to update the analysis */
260        RESTORE_STACK;
261        return;
262     }
263     info = &tonal->info[tonal->write_pos++];
264     if (tonal->write_pos>=DETECT_SIZE)
265        tonal->write_pos-=DETECT_SIZE;
266
267     ALLOC(in, 480, kiss_fft_cpx);
268     ALLOC(out, 480, kiss_fft_cpx);
269     ALLOC(tonality, 240, float);
270     ALLOC(noisiness, 240, float);
271     for (i=0;i<N2;i++)
272     {
273        float w = analysis_window[i];
274        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
275        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
276        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
277        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
278     }
279     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
280     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
281     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
282     tonal->mem_fill = 240 + remaining;
283     opus_fft(kfft, in, out, tonal->arch);
284 #ifndef FIXED_POINT
285     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
286     if (celt_isnan(out[0].r))
287     {
288        info->valid = 0;
289        RESTORE_STACK;
290        return;
291     }
292 #endif
293
294     for (i=1;i<N2;i++)
295     {
296        float X1r, X2r, X1i, X2i;
297        float angle, d_angle, d2_angle;
298        float angle2, d_angle2, d2_angle2;
299        float mod1, mod2, avg_mod;
300        X1r = (float)out[i].r+out[N-i].r;
301        X1i = (float)out[i].i-out[N-i].i;
302        X2r = (float)out[i].i+out[N-i].i;
303        X2i = (float)out[N-i].r-out[i].r;
304
305        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
306        d_angle = angle - A[i];
307        d2_angle = d_angle - dA[i];
308
309        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
310        d_angle2 = angle2 - angle;
311        d2_angle2 = d_angle2 - d_angle;
312
313        mod1 = d2_angle - (float)floor(.5+d2_angle);
314        noisiness[i] = ABS16(mod1);
315        mod1 *= mod1;
316        mod1 *= mod1;
317
318        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
319        noisiness[i] += ABS16(mod2);
320        mod2 *= mod2;
321        mod2 *= mod2;
322
323        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
324        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
325
326        A[i] = angle2;
327        dA[i] = d_angle2;
328        d2A[i] = mod2;
329     }
330
331     frame_tonality = 0;
332     max_frame_tonality = 0;
333     /*tw_sum = 0;*/
334     info->activity = 0;
335     frame_noisiness = 0;
336     frame_stationarity = 0;
337     if (!tonal->count)
338     {
339        for (b=0;b<NB_TBANDS;b++)
340        {
341           tonal->lowE[b] = 1e10;
342           tonal->highE[b] = -1e10;
343        }
344     }
345     relativeE = 0;
346     frame_loudness = 0;
347     for (b=0;b<NB_TBANDS;b++)
348     {
349        float E=0, tE=0, nE=0;
350        float L1, L2;
351        float stationarity;
352        for (i=tbands[b];i<tbands[b+1];i++)
353        {
354           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
355                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
356 #ifdef FIXED_POINT
357           /* FIXME: It's probably best to change the BFCC filter initial state instead */
358           binE *= 5.55e-17f;
359 #endif
360           E += binE;
361           tE += binE*tonality[i];
362           nE += binE*2.f*(.5f-noisiness[i]);
363        }
364 #ifndef FIXED_POINT
365        /* Check for extreme band energies that could cause NaNs later. */
366        if (!(E<1e9f) || celt_isnan(E))
367        {
368           info->valid = 0;
369           RESTORE_STACK;
370           return;
371        }
372 #endif
373
374        tonal->E[tonal->E_count][b] = E;
375        frame_noisiness += nE/(1e-15f+E);
376
377        frame_loudness += (float)sqrt(E+1e-10f);
378        logE[b] = (float)log(E+1e-10f);
379        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
380        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
381        if (tonal->highE[b] < tonal->lowE[b]+1.f)
382        {
383           tonal->highE[b]+=.5f;
384           tonal->lowE[b]-=.5f;
385        }
386        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
387
388        L1=L2=0;
389        for (i=0;i<NB_FRAMES;i++)
390        {
391           L1 += (float)sqrt(tonal->E[i][b]);
392           L2 += tonal->E[i][b];
393        }
394
395        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
396        stationarity *= stationarity;
397        stationarity *= stationarity;
398        frame_stationarity += stationarity;
399        /*band_tonality[b] = tE/(1e-15+E)*/;
400        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
401 #if 0
402        if (b>=NB_TONAL_SKIP_BANDS)
403        {
404           frame_tonality += tweight[b]*band_tonality[b];
405           tw_sum += tweight[b];
406        }
407 #else
408        frame_tonality += band_tonality[b];
409        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
410           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
411 #endif
412        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
413        slope += band_tonality[b]*(b-8);
414        /*printf("%f %f ", band_tonality[b], stationarity);*/
415        tonal->prev_band_tonality[b] = band_tonality[b];
416     }
417
418     bandwidth_mask = 0;
419     bandwidth = 0;
420     maxE = 0;
421     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
422 #ifdef FIXED_POINT
423     noise_floor *= 1<<(15+SIG_SHIFT);
424 #endif
425     noise_floor *= noise_floor;
426     for (b=0;b<NB_TOT_BANDS;b++)
427     {
428        float E=0;
429        int band_start, band_end;
430        /* Keep a margin of 300 Hz for aliasing */
431        band_start = extra_bands[b];
432        band_end = extra_bands[b+1];
433        for (i=band_start;i<band_end;i++)
434        {
435           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
436                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
437           E += binE;
438        }
439        maxE = MAX32(maxE, E);
440        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
441        E = MAX32(E, tonal->meanE[b]);
442        /* Use a simple follower with 13 dB/Bark slope for spreading function */
443        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
444        /* Consider the band "active" only if all these conditions are met:
445           1) less than 10 dB below the simple follower
446           2) less than 90 dB below the peak band (maximal masking possible considering
447              both the ATH and the loudness-dependent slope of the spreading function)
448           3) above the PCM quantization noise floor
449        */
450        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
451           bandwidth = b;
452     }
453     if (tonal->count<=2)
454        bandwidth = 20;
455     frame_loudness = 20*(float)log10(frame_loudness);
456     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
457     tonal->lowECount *= (1-alphaE);
458     if (frame_loudness < tonal->Etracker-30)
459        tonal->lowECount += alphaE;
460
461     for (i=0;i<8;i++)
462     {
463        float sum=0;
464        for (b=0;b<16;b++)
465           sum += dct_table[i*16+b]*logE[b];
466        BFCC[i] = sum;
467     }
468
469     frame_stationarity /= NB_TBANDS;
470     relativeE /= NB_TBANDS;
471     if (tonal->count<10)
472        relativeE = .5;
473     frame_noisiness /= NB_TBANDS;
474 #if 1
475     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
476 #else
477     info->activity = .5*(1+frame_noisiness-frame_stationarity);
478 #endif
479     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
480     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
481     tonal->prev_tonality = frame_tonality;
482
483     slope /= 8*8;
484     info->tonality_slope = slope;
485
486     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
487     tonal->count++;
488     info->tonality = frame_tonality;
489
490     for (i=0;i<4;i++)
491        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
492
493     for (i=0;i<4;i++)
494        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
495
496     for (i=0;i<4;i++)
497         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
498     for (i=0;i<3;i++)
499         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
500
501     if (tonal->count > 5)
502     {
503        for (i=0;i<9;i++)
504           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
505     }
506
507     for (i=0;i<8;i++)
508     {
509        tonal->mem[i+24] = tonal->mem[i+16];
510        tonal->mem[i+16] = tonal->mem[i+8];
511        tonal->mem[i+8] = tonal->mem[i];
512        tonal->mem[i] = BFCC[i];
513     }
514     for (i=0;i<9;i++)
515        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
516     features[20] = info->tonality - 0.154723;
517     features[21] = info->activity - 0.724643;
518     features[22] = frame_stationarity - 0.743717;
519     features[23] = info->tonality_slope + 0.069216;
520     features[24] = tonal->lowECount - 0.067930;
521
522 #ifndef DISABLE_FLOAT_API
523     mlp_process(&net, features, frame_probs);
524     frame_probs[0] = .5f*(frame_probs[0]+1);
525     /* Curve fitting between the MLP probability and the actual probability */
526     /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
527     /* Probability of active audio (as opposed to silence) */
528     frame_probs[1] = .5f*frame_probs[1]+.5f;
529     frame_probs[1] *= frame_probs[1];
530     /* Consider that silence has a 50-50 probability. */
531     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
532
533     /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
534     {
535        /* Probability of state transition */
536        float tau;
537        /* Represents independence of the MLP probabilities, where
538           beta=1 means fully independent. */
539        float beta;
540        /* Denormalized probability of speech (p0) and music (p1) after update */
541        float p0, p1;
542        /* Probabilities for "all speech" and "all music" */
543        float s0, m0;
544        /* Probability sum for renormalisation */
545        float psum;
546        /* Instantaneous probability of speech and music, with beta pre-applied. */
547        float speech0;
548        float music0;
549        float p, q;
550
551        /* One transition every 3 minutes of active audio */
552        tau = .00005f*frame_probs[1];
553        /* Adapt beta based on how "unexpected" the new prob is */
554        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
555        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
556        beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
557        /* p0 and p1 are the probabilities of speech and music at this frame
558           using only information from previous frame and applying the
559           state transition model */
560        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
561        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
562        /* We apply the current probability with exponent beta to work around
563           the fact that the probability estimates aren't independent. */
564        p0 *= (float)pow(1-frame_probs[0], beta);
565        p1 *= (float)pow(frame_probs[0], beta);
566        /* Normalise the probabilities to get the Marokv probability of music. */
567        tonal->music_prob = p1/(p0+p1);
568        info->music_prob = tonal->music_prob;
569
570        /* This chunk of code deals with delayed decision. */
571        psum=1e-20f;
572        /* Instantaneous probability of speech and music, with beta pre-applied. */
573        speech0 = (float)pow(1-frame_probs[0], beta);
574        music0  = (float)pow(frame_probs[0], beta);
575        if (tonal->count==1)
576        {
577           tonal->pspeech[0]=.5;
578           tonal->pmusic [0]=.5;
579        }
580        /* Updated probability of having only speech (s0) or only music (m0),
581           before considering the new observation. */
582        s0 = tonal->pspeech[0] + tonal->pspeech[1];
583        m0 = tonal->pmusic [0] + tonal->pmusic [1];
584        /* Updates s0 and m0 with instantaneous probability. */
585        tonal->pspeech[0] = s0*(1-tau)*speech0;
586        tonal->pmusic [0] = m0*(1-tau)*music0;
587        /* Propagate the transition probabilities */
588        for (i=1;i<DETECT_SIZE-1;i++)
589        {
590           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
591           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
592        }
593        /* Probability that the latest frame is speech, when all the previous ones were music. */
594        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
595        /* Probability that the latest frame is music, when all the previous ones were speech. */
596        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
597
598        /* Renormalise probabilities to 1 */
599        for (i=0;i<DETECT_SIZE;i++)
600           psum += tonal->pspeech[i] + tonal->pmusic[i];
601        psum = 1.f/psum;
602        for (i=0;i<DETECT_SIZE;i++)
603        {
604           tonal->pspeech[i] *= psum;
605           tonal->pmusic [i] *= psum;
606        }
607        psum = tonal->pmusic[0];
608        for (i=1;i<DETECT_SIZE;i++)
609           psum += tonal->pspeech[i];
610
611        /* Estimate our confidence in the speech/music decisions */
612        if (frame_probs[1]>.75)
613        {
614           if (tonal->music_prob>.9)
615           {
616              float adapt;
617              adapt = 1.f/(++tonal->music_confidence_count);
618              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
619              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
620           }
621           if (tonal->music_prob<.1)
622           {
623              float adapt;
624              adapt = 1.f/(++tonal->speech_confidence_count);
625              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
626              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
627           }
628        } else {
629           if (tonal->music_confidence_count==0)
630              tonal->music_confidence = .9f;
631           if (tonal->speech_confidence_count==0)
632              tonal->speech_confidence = .1f;
633        }
634     }
635     if (tonal->last_music != (tonal->music_prob>.5f))
636        tonal->last_transition=0;
637     tonal->last_music = tonal->music_prob>.5f;
638 #else
639     info->music_prob = 0;
640 #endif
641     /*for (i=0;i<25;i++)
642        printf("%f ", features[i]);
643     printf("\n");*/
644
645     info->bandwidth = bandwidth;
646     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
647     info->noisiness = frame_noisiness;
648     info->valid = 1;
649     RESTORE_STACK;
650 }
651
652 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
653                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
654                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
655 {
656    int offset;
657    int pcm_len;
658
659    if (analysis_pcm != NULL)
660    {
661       /* Avoid overflow/wrap-around of the analysis buffer */
662       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
663
664       pcm_len = analysis_frame_size - analysis->analysis_offset;
665       offset = analysis->analysis_offset;
666       do {
667          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
668          offset += 480;
669          pcm_len -= 480;
670       } while (pcm_len>0);
671       analysis->analysis_offset = analysis_frame_size;
672
673       analysis->analysis_offset -= frame_size;
674    }
675
676    analysis_info->valid = 0;
677    tonality_get_info(analysis, analysis_info, frame_size);
678 }