New DTX that works in all modes (SILK/CELT/HYBRID)
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    x2 = x*x;
119    y2 = y*y;
120    /* For very small values, we don't care about the answer, so
121       we can just return 0. */
122    if (x2 + y2 < 1e-18f)
123    {
124       return 0;
125    }
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
129    }else{
130       float den = (x2 + cB*y2) * (x2 + cC*y2);
131       return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
132    }
133 }
134
135 void tonality_analysis_init(TonalityAnalysisState *tonal)
136 {
137   /* Initialize reusable fields. */
138   tonal->arch = opus_select_arch();
139   /* Clear remaining fields. */
140   tonality_analysis_reset(tonal);
141 }
142
143 void tonality_analysis_reset(TonalityAnalysisState *tonal)
144 {
145   /* Clear non-reusable fields. */
146   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
147   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
148 }
149
150 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
151 {
152    int pos;
153    int curr_lookahead;
154    float psum;
155    int i;
156
157    pos = tonal->read_pos;
158    curr_lookahead = tonal->write_pos-tonal->read_pos;
159    if (curr_lookahead<0)
160       curr_lookahead += DETECT_SIZE;
161
162    if (len > 480 && pos != tonal->write_pos)
163    {
164       pos++;
165       if (pos==DETECT_SIZE)
166          pos=0;
167    }
168    if (pos == tonal->write_pos)
169       pos--;
170    if (pos<0)
171       pos = DETECT_SIZE-1;
172    OPUS_COPY(info_out, &tonal->info[pos], 1);
173    tonal->read_subframe += len/120;
174    while (tonal->read_subframe>=4)
175    {
176       tonal->read_subframe -= 4;
177       tonal->read_pos++;
178    }
179    if (tonal->read_pos>=DETECT_SIZE)
180       tonal->read_pos-=DETECT_SIZE;
181
182    /* Compensate for the delay in the features themselves.
183       FIXME: Need a better estimate the 10 I just made up */
184    curr_lookahead = IMAX(curr_lookahead-10, 0);
185
186    psum=0;
187    /* Summing the probability of transition patterns that involve music at
188       time (DETECT_SIZE-curr_lookahead-1) */
189    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
190       psum += tonal->pmusic[i];
191    for (;i<DETECT_SIZE;i++)
192       psum += tonal->pspeech[i];
193    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
194    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
195
196    info_out->music_prob = psum;
197 }
198
199 static const float std_feature_bias[9] = {
200       5.684947, 3.475288, 1.770634, 1.599784, 3.773215,
201       2.163313, 1.260756, 1.116868, 1.918795
202 };
203
204 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
205 {
206     int i, b;
207     const kiss_fft_state *kfft;
208     VARDECL(kiss_fft_cpx, in);
209     VARDECL(kiss_fft_cpx, out);
210     int N = 480, N2=240;
211     float * OPUS_RESTRICT A = tonal->angle;
212     float * OPUS_RESTRICT dA = tonal->d_angle;
213     float * OPUS_RESTRICT d2A = tonal->d2_angle;
214     VARDECL(float, tonality);
215     VARDECL(float, noisiness);
216     float band_tonality[NB_TBANDS];
217     float logE[NB_TBANDS];
218     float BFCC[8];
219     float features[25];
220     float frame_tonality;
221     float max_frame_tonality;
222     /*float tw_sum=0;*/
223     float frame_noisiness;
224     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
225     float slope=0;
226     float frame_stationarity;
227     float relativeE;
228     float frame_probs[2];
229     float alpha, alphaE, alphaE2;
230     float frame_loudness;
231     float bandwidth_mask;
232     int bandwidth=0;
233     float maxE = 0;
234     float noise_floor;
235     int remaining;
236     AnalysisInfo *info;
237     SAVE_STACK;
238
239     tonal->last_transition++;
240     alpha = 1.f/IMIN(20, 1+tonal->count);
241     alphaE = 1.f/IMIN(50, 1+tonal->count);
242     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
243
244     if (tonal->count<4)
245        tonal->music_prob = .5;
246     kfft = celt_mode->mdct.kfft[0];
247     if (tonal->count==0)
248        tonal->mem_fill = 240;
249     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
250     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
251     {
252        tonal->mem_fill += len;
253        /* Don't have enough to update the analysis */
254        RESTORE_STACK;
255        return;
256     }
257     info = &tonal->info[tonal->write_pos++];
258     if (tonal->write_pos>=DETECT_SIZE)
259        tonal->write_pos-=DETECT_SIZE;
260
261     ALLOC(in, 480, kiss_fft_cpx);
262     ALLOC(out, 480, kiss_fft_cpx);
263     ALLOC(tonality, 240, float);
264     ALLOC(noisiness, 240, float);
265     for (i=0;i<N2;i++)
266     {
267        float w = analysis_window[i];
268        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
269        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
270        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
271        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
272     }
273     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
274     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
275     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
276     tonal->mem_fill = 240 + remaining;
277     opus_fft(kfft, in, out, tonal->arch);
278 #ifndef FIXED_POINT
279     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
280     if (celt_isnan(out[0].r))
281     {
282        info->valid = 0;
283        RESTORE_STACK;
284        return;
285     }
286 #endif
287
288     for (i=1;i<N2;i++)
289     {
290        float X1r, X2r, X1i, X2i;
291        float angle, d_angle, d2_angle;
292        float angle2, d_angle2, d2_angle2;
293        float mod1, mod2, avg_mod;
294        X1r = (float)out[i].r+out[N-i].r;
295        X1i = (float)out[i].i-out[N-i].i;
296        X2r = (float)out[i].i+out[N-i].i;
297        X2i = (float)out[N-i].r-out[i].r;
298
299        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
300        d_angle = angle - A[i];
301        d2_angle = d_angle - dA[i];
302
303        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
304        d_angle2 = angle2 - angle;
305        d2_angle2 = d_angle2 - d_angle;
306
307        mod1 = d2_angle - (float)floor(.5+d2_angle);
308        noisiness[i] = ABS16(mod1);
309        mod1 *= mod1;
310        mod1 *= mod1;
311
312        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
313        noisiness[i] += ABS16(mod2);
314        mod2 *= mod2;
315        mod2 *= mod2;
316
317        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
318        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
319
320        A[i] = angle2;
321        dA[i] = d_angle2;
322        d2A[i] = mod2;
323     }
324
325     frame_tonality = 0;
326     max_frame_tonality = 0;
327     /*tw_sum = 0;*/
328     info->activity = 0;
329     frame_noisiness = 0;
330     frame_stationarity = 0;
331     if (!tonal->count)
332     {
333        for (b=0;b<NB_TBANDS;b++)
334        {
335           tonal->lowE[b] = 1e10;
336           tonal->highE[b] = -1e10;
337        }
338     }
339     relativeE = 0;
340     frame_loudness = 0;
341     for (b=0;b<NB_TBANDS;b++)
342     {
343        float E=0, tE=0, nE=0;
344        float L1, L2;
345        float stationarity;
346        for (i=tbands[b];i<tbands[b+1];i++)
347        {
348           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
349                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
350 #ifdef FIXED_POINT
351           /* FIXME: It's probably best to change the BFCC filter initial state instead */
352           binE *= 5.55e-17f;
353 #endif
354           E += binE;
355           tE += binE*tonality[i];
356           nE += binE*2.f*(.5f-noisiness[i]);
357        }
358 #ifndef FIXED_POINT
359        /* Check for extreme band energies that could cause NaNs later. */
360        if (!(E<1e9f) || celt_isnan(E))
361        {
362           info->valid = 0;
363           RESTORE_STACK;
364           return;
365        }
366 #endif
367
368        tonal->E[tonal->E_count][b] = E;
369        frame_noisiness += nE/(1e-15f+E);
370
371        frame_loudness += (float)sqrt(E+1e-10f);
372        logE[b] = (float)log(E+1e-10f);
373        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
374        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
375        if (tonal->highE[b] < tonal->lowE[b]+1.f)
376        {
377           tonal->highE[b]+=.5f;
378           tonal->lowE[b]-=.5f;
379        }
380        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
381
382        L1=L2=0;
383        for (i=0;i<NB_FRAMES;i++)
384        {
385           L1 += (float)sqrt(tonal->E[i][b]);
386           L2 += tonal->E[i][b];
387        }
388
389        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
390        stationarity *= stationarity;
391        stationarity *= stationarity;
392        frame_stationarity += stationarity;
393        /*band_tonality[b] = tE/(1e-15+E)*/;
394        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
395 #if 0
396        if (b>=NB_TONAL_SKIP_BANDS)
397        {
398           frame_tonality += tweight[b]*band_tonality[b];
399           tw_sum += tweight[b];
400        }
401 #else
402        frame_tonality += band_tonality[b];
403        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
404           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
405 #endif
406        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
407        slope += band_tonality[b]*(b-8);
408        /*printf("%f %f ", band_tonality[b], stationarity);*/
409        tonal->prev_band_tonality[b] = band_tonality[b];
410     }
411
412     bandwidth_mask = 0;
413     bandwidth = 0;
414     maxE = 0;
415     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
416 #ifdef FIXED_POINT
417     noise_floor *= 1<<(15+SIG_SHIFT);
418 #endif
419     noise_floor *= noise_floor;
420     for (b=0;b<NB_TOT_BANDS;b++)
421     {
422        float E=0;
423        int band_start, band_end;
424        /* Keep a margin of 300 Hz for aliasing */
425        band_start = extra_bands[b];
426        band_end = extra_bands[b+1];
427        for (i=band_start;i<band_end;i++)
428        {
429           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
430                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
431           E += binE;
432        }
433        maxE = MAX32(maxE, E);
434        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
435        E = MAX32(E, tonal->meanE[b]);
436        /* Use a simple follower with 13 dB/Bark slope for spreading function */
437        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
438        /* Consider the band "active" only if all these conditions are met:
439           1) less than 10 dB below the simple follower
440           2) less than 90 dB below the peak band (maximal masking possible considering
441              both the ATH and the loudness-dependent slope of the spreading function)
442           3) above the PCM quantization noise floor
443        */
444        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
445           bandwidth = b;
446     }
447     if (tonal->count<=2)
448        bandwidth = 20;
449     frame_loudness = 20*(float)log10(frame_loudness);
450     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
451     tonal->lowECount *= (1-alphaE);
452     if (frame_loudness < tonal->Etracker-30)
453        tonal->lowECount += alphaE;
454
455     for (i=0;i<8;i++)
456     {
457        float sum=0;
458        for (b=0;b<16;b++)
459           sum += dct_table[i*16+b]*logE[b];
460        BFCC[i] = sum;
461     }
462
463     frame_stationarity /= NB_TBANDS;
464     relativeE /= NB_TBANDS;
465     if (tonal->count<10)
466        relativeE = .5;
467     frame_noisiness /= NB_TBANDS;
468 #if 1
469     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
470 #else
471     info->activity = .5*(1+frame_noisiness-frame_stationarity);
472 #endif
473     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
474     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
475     tonal->prev_tonality = frame_tonality;
476
477     slope /= 8*8;
478     info->tonality_slope = slope;
479
480     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
481     tonal->count++;
482     info->tonality = frame_tonality;
483
484     for (i=0;i<4;i++)
485        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
486
487     for (i=0;i<4;i++)
488        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
489
490     for (i=0;i<4;i++)
491         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
492     for (i=0;i<3;i++)
493         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
494
495     if (tonal->count > 5)
496     {
497        for (i=0;i<9;i++)
498           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
499     }
500
501     for (i=0;i<8;i++)
502     {
503        tonal->mem[i+24] = tonal->mem[i+16];
504        tonal->mem[i+16] = tonal->mem[i+8];
505        tonal->mem[i+8] = tonal->mem[i];
506        tonal->mem[i] = BFCC[i];
507     }
508     for (i=0;i<9;i++)
509        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
510     features[20] = info->tonality - 0.154723;
511     features[21] = info->activity - 0.724643;
512     features[22] = frame_stationarity - 0.743717;
513     features[23] = info->tonality_slope + 0.069216;
514     features[24] = tonal->lowECount - 0.067930;
515
516 #ifndef DISABLE_FLOAT_API
517     mlp_process(&net, features, frame_probs);
518     frame_probs[0] = .5f*(frame_probs[0]+1);
519     /* Curve fitting between the MLP probability and the actual probability */
520     /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
521     /* Probability of active audio (as opposed to silence) */
522     frame_probs[1] = .5f*frame_probs[1]+.5f;
523     frame_probs[1] *= frame_probs[1];
524     /* Consider that silence has a 50-50 probability. */
525     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527     /* Probability of speech or music vs noise */
528     info->activity_probability = frame_probs[1];
529
530     /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
531     {
532        /* Probability of state transition */
533        float tau;
534        /* Represents independence of the MLP probabilities, where
535           beta=1 means fully independent. */
536        float beta;
537        /* Denormalized probability of speech (p0) and music (p1) after update */
538        float p0, p1;
539        /* Probabilities for "all speech" and "all music" */
540        float s0, m0;
541        /* Probability sum for renormalisation */
542        float psum;
543        /* Instantaneous probability of speech and music, with beta pre-applied. */
544        float speech0;
545        float music0;
546        float p, q;
547
548        /* One transition every 3 minutes of active audio */
549        tau = .00005f*frame_probs[1];
550        /* Adapt beta based on how "unexpected" the new prob is */
551        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
552        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
553        beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
554        /* p0 and p1 are the probabilities of speech and music at this frame
555           using only information from previous frame and applying the
556           state transition model */
557        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
558        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
559        /* We apply the current probability with exponent beta to work around
560           the fact that the probability estimates aren't independent. */
561        p0 *= (float)pow(1-frame_probs[0], beta);
562        p1 *= (float)pow(frame_probs[0], beta);
563        /* Normalise the probabilities to get the Marokv probability of music. */
564        tonal->music_prob = p1/(p0+p1);
565        info->music_prob = tonal->music_prob;
566
567        /* This chunk of code deals with delayed decision. */
568        psum=1e-20f;
569        /* Instantaneous probability of speech and music, with beta pre-applied. */
570        speech0 = (float)pow(1-frame_probs[0], beta);
571        music0  = (float)pow(frame_probs[0], beta);
572        if (tonal->count==1)
573        {
574           tonal->pspeech[0]=.5;
575           tonal->pmusic [0]=.5;
576        }
577        /* Updated probability of having only speech (s0) or only music (m0),
578           before considering the new observation. */
579        s0 = tonal->pspeech[0] + tonal->pspeech[1];
580        m0 = tonal->pmusic [0] + tonal->pmusic [1];
581        /* Updates s0 and m0 with instantaneous probability. */
582        tonal->pspeech[0] = s0*(1-tau)*speech0;
583        tonal->pmusic [0] = m0*(1-tau)*music0;
584        /* Propagate the transition probabilities */
585        for (i=1;i<DETECT_SIZE-1;i++)
586        {
587           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
588           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
589        }
590        /* Probability that the latest frame is speech, when all the previous ones were music. */
591        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
592        /* Probability that the latest frame is music, when all the previous ones were speech. */
593        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
594
595        /* Renormalise probabilities to 1 */
596        for (i=0;i<DETECT_SIZE;i++)
597           psum += tonal->pspeech[i] + tonal->pmusic[i];
598        psum = 1.f/psum;
599        for (i=0;i<DETECT_SIZE;i++)
600        {
601           tonal->pspeech[i] *= psum;
602           tonal->pmusic [i] *= psum;
603        }
604        psum = tonal->pmusic[0];
605        for (i=1;i<DETECT_SIZE;i++)
606           psum += tonal->pspeech[i];
607
608        /* Estimate our confidence in the speech/music decisions */
609        if (frame_probs[1]>.75)
610        {
611           if (tonal->music_prob>.9)
612           {
613              float adapt;
614              adapt = 1.f/(++tonal->music_confidence_count);
615              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
616              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
617           }
618           if (tonal->music_prob<.1)
619           {
620              float adapt;
621              adapt = 1.f/(++tonal->speech_confidence_count);
622              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
623              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
624           }
625        } else {
626           if (tonal->music_confidence_count==0)
627              tonal->music_confidence = .9f;
628           if (tonal->speech_confidence_count==0)
629              tonal->speech_confidence = .1f;
630        }
631     }
632     if (tonal->last_music != (tonal->music_prob>.5f))
633        tonal->last_transition=0;
634     tonal->last_music = tonal->music_prob>.5f;
635 #else
636     info->music_prob = 0;
637 #endif
638     /*for (i=0;i<25;i++)
639        printf("%f ", features[i]);
640     printf("\n");*/
641
642     info->bandwidth = bandwidth;
643     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
644     info->noisiness = frame_noisiness;
645     info->valid = 1;
646     RESTORE_STACK;
647 }
648
649 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
650                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
651                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
652 {
653    int offset;
654    int pcm_len;
655
656    if (analysis_pcm != NULL)
657    {
658       /* Avoid overflow/wrap-around of the analysis buffer */
659       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
660
661       pcm_len = analysis_frame_size - analysis->analysis_offset;
662       offset = analysis->analysis_offset;
663       do {
664          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
665          offset += 480;
666          pcm_len -= 480;
667       } while (pcm_len>0);
668       analysis->analysis_offset = analysis_frame_size;
669
670       analysis->analysis_offset -= frame_size;
671    }
672
673    analysis_info->valid = 0;
674    tonality_get_info(analysis, analysis_info, frame_size);
675 }