More NaN hardening in the analysis code
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 extern const MLP net;
43
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47
48 static const float dct_table[128] = {
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66
67 static const float analysis_window[240] = {
68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99
100 static const int tbands[NB_TBANDS+1] = {
101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107
108 /*static const float tweight[NB_TBANDS+1] = {
109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111
112 #define NB_TONAL_SKIP_BANDS 9
113
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
118 static OPUS_INLINE float fast_atan2f(float y, float x) {
119    float x2, y2;
120    /* Should avoid underflow on the values we'll get */
121    if (ABS16(x)+ABS16(y)<1e-9f)
122    {
123       x*=1e12f;
124       y*=1e12f;
125    }
126    x2 = x*x;
127    y2 = y*y;
128    if(x2<y2){
129       float den = (y2 + cB*x2) * (y2 + cC*x2);
130       if (den!=0)
131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132       else
133          return (y<0 ? -cE : cE);
134    }else{
135       float den = (x2 + cB*y2) * (x2 + cC*y2);
136       if (den!=0)
137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138       else
139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140    }
141 }
142
143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145    int pos;
146    int curr_lookahead;
147    float psum;
148    int i;
149
150    pos = tonal->read_pos;
151    curr_lookahead = tonal->write_pos-tonal->read_pos;
152    if (curr_lookahead<0)
153       curr_lookahead += DETECT_SIZE;
154
155    if (len > 480 && pos != tonal->write_pos)
156    {
157       pos++;
158       if (pos==DETECT_SIZE)
159          pos=0;
160    }
161    if (pos == tonal->write_pos)
162       pos--;
163    if (pos<0)
164       pos = DETECT_SIZE-1;
165    OPUS_COPY(info_out, &tonal->info[pos], 1);
166    tonal->read_subframe += len/120;
167    while (tonal->read_subframe>=4)
168    {
169       tonal->read_subframe -= 4;
170       tonal->read_pos++;
171    }
172    if (tonal->read_pos>=DETECT_SIZE)
173       tonal->read_pos-=DETECT_SIZE;
174
175    /* Compensate for the delay in the features themselves.
176       FIXME: Need a better estimate the 10 I just made up */
177    curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179    psum=0;
180    /* Summing the probability of transition patterns that involve music at
181       time (DETECT_SIZE-curr_lookahead-1) */
182    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
183       psum += tonal->pmusic[i];
184    for (;i<DETECT_SIZE;i++)
185       psum += tonal->pspeech[i];
186    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
187    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
188
189    info_out->music_prob = psum;
190 }
191
192 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
193 {
194     int i, b;
195     const kiss_fft_state *kfft;
196     VARDECL(kiss_fft_cpx, in);
197     VARDECL(kiss_fft_cpx, out);
198     int N = 480, N2=240;
199     float * OPUS_RESTRICT A = tonal->angle;
200     float * OPUS_RESTRICT dA = tonal->d_angle;
201     float * OPUS_RESTRICT d2A = tonal->d2_angle;
202     VARDECL(float, tonality);
203     VARDECL(float, noisiness);
204     float band_tonality[NB_TBANDS];
205     float logE[NB_TBANDS];
206     float BFCC[8];
207     float features[25];
208     float frame_tonality;
209     float max_frame_tonality;
210     /*float tw_sum=0;*/
211     float frame_noisiness;
212     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
213     float slope=0;
214     float frame_stationarity;
215     float relativeE;
216     float frame_probs[2];
217     float alpha, alphaE, alphaE2;
218     float frame_loudness;
219     float bandwidth_mask;
220     int bandwidth=0;
221     float maxE = 0;
222     float noise_floor;
223     int remaining;
224     AnalysisInfo *info;
225     SAVE_STACK;
226
227     tonal->last_transition++;
228     alpha = 1.f/IMIN(20, 1+tonal->count);
229     alphaE = 1.f/IMIN(50, 1+tonal->count);
230     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
231
232     if (tonal->count<4)
233        tonal->music_prob = .5;
234     kfft = celt_mode->mdct.kfft[0];
235     if (tonal->count==0)
236        tonal->mem_fill = 240;
237     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
238     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
239     {
240        tonal->mem_fill += len;
241        /* Don't have enough to update the analysis */
242        RESTORE_STACK;
243        return;
244     }
245     info = &tonal->info[tonal->write_pos++];
246     if (tonal->write_pos>=DETECT_SIZE)
247        tonal->write_pos-=DETECT_SIZE;
248
249     ALLOC(in, 480, kiss_fft_cpx);
250     ALLOC(out, 480, kiss_fft_cpx);
251     ALLOC(tonality, 240, float);
252     ALLOC(noisiness, 240, float);
253     for (i=0;i<N2;i++)
254     {
255        float w = analysis_window[i];
256        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
257        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
258        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
259        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
260     }
261     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
262     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
263     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
264     tonal->mem_fill = 240 + remaining;
265     opus_fft(kfft, in, out);
266 #ifndef FIXED_POINT
267     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
268     if (celt_isnan(out[0].r))
269     {
270        info->valid = 0;
271        RESTORE_STACK;
272        return;
273     }
274 #endif
275
276     for (i=1;i<N2;i++)
277     {
278        float X1r, X2r, X1i, X2i;
279        float angle, d_angle, d2_angle;
280        float angle2, d_angle2, d2_angle2;
281        float mod1, mod2, avg_mod;
282        X1r = (float)out[i].r+out[N-i].r;
283        X1i = (float)out[i].i-out[N-i].i;
284        X2r = (float)out[i].i+out[N-i].i;
285        X2i = (float)out[N-i].r-out[i].r;
286
287        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
288        d_angle = angle - A[i];
289        d2_angle = d_angle - dA[i];
290
291        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
292        d_angle2 = angle2 - angle;
293        d2_angle2 = d_angle2 - d_angle;
294
295        mod1 = d2_angle - (float)floor(.5+d2_angle);
296        noisiness[i] = ABS16(mod1);
297        mod1 *= mod1;
298        mod1 *= mod1;
299
300        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
301        noisiness[i] += ABS16(mod2);
302        mod2 *= mod2;
303        mod2 *= mod2;
304
305        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
306        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
307
308        A[i] = angle2;
309        dA[i] = d_angle2;
310        d2A[i] = mod2;
311     }
312
313     frame_tonality = 0;
314     max_frame_tonality = 0;
315     /*tw_sum = 0;*/
316     info->activity = 0;
317     frame_noisiness = 0;
318     frame_stationarity = 0;
319     if (!tonal->count)
320     {
321        for (b=0;b<NB_TBANDS;b++)
322        {
323           tonal->lowE[b] = 1e10;
324           tonal->highE[b] = -1e10;
325        }
326     }
327     relativeE = 0;
328     frame_loudness = 0;
329     for (b=0;b<NB_TBANDS;b++)
330     {
331        float E=0, tE=0, nE=0;
332        float L1, L2;
333        float stationarity;
334        for (i=tbands[b];i<tbands[b+1];i++)
335        {
336           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
337                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
338 #ifdef FIXED_POINT
339           /* FIXME: It's probably best to change the BFCC filter initial state instead */
340           binE *= 5.55e-17f;
341 #endif
342           E += binE;
343           tE += binE*tonality[i];
344           nE += binE*2.f*(.5f-noisiness[i]);
345        }
346 #ifndef FIXED_POINT
347        /* Check for extreme band energies that could cause NaNs later. */
348        if (!(E<1e9f) || celt_isnan(E))
349        {
350           info->valid = 0;
351           RESTORE_STACK;
352           return;
353        }
354 #endif
355
356        tonal->E[tonal->E_count][b] = E;
357        frame_noisiness += nE/(1e-15f+E);
358
359        frame_loudness += (float)sqrt(E+1e-10f);
360        logE[b] = (float)log(E+1e-10f);
361        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
362        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
363        if (tonal->highE[b] < tonal->lowE[b]+1.f)
364        {
365           tonal->highE[b]+=.5f;
366           tonal->lowE[b]-=.5f;
367        }
368        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
369
370        L1=L2=0;
371        for (i=0;i<NB_FRAMES;i++)
372        {
373           L1 += (float)sqrt(tonal->E[i][b]);
374           L2 += tonal->E[i][b];
375        }
376
377        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
378        stationarity *= stationarity;
379        stationarity *= stationarity;
380        frame_stationarity += stationarity;
381        /*band_tonality[b] = tE/(1e-15+E)*/;
382        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
383 #if 0
384        if (b>=NB_TONAL_SKIP_BANDS)
385        {
386           frame_tonality += tweight[b]*band_tonality[b];
387           tw_sum += tweight[b];
388        }
389 #else
390        frame_tonality += band_tonality[b];
391        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
392           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
393 #endif
394        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
395        slope += band_tonality[b]*(b-8);
396        /*printf("%f %f ", band_tonality[b], stationarity);*/
397        tonal->prev_band_tonality[b] = band_tonality[b];
398     }
399
400     bandwidth_mask = 0;
401     bandwidth = 0;
402     maxE = 0;
403     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
404 #ifdef FIXED_POINT
405     noise_floor *= 1<<(15+SIG_SHIFT);
406 #endif
407     noise_floor *= noise_floor;
408     for (b=0;b<NB_TOT_BANDS;b++)
409     {
410        float E=0;
411        int band_start, band_end;
412        /* Keep a margin of 300 Hz for aliasing */
413        band_start = extra_bands[b];
414        band_end = extra_bands[b+1];
415        for (i=band_start;i<band_end;i++)
416        {
417           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
418                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
419           E += binE;
420        }
421        maxE = MAX32(maxE, E);
422        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
423        E = MAX32(E, tonal->meanE[b]);
424        /* Use a simple follower with 13 dB/Bark slope for spreading function */
425        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
426        /* Consider the band "active" only if all these conditions are met:
427           1) less than 10 dB below the simple follower
428           2) less than 90 dB below the peak band (maximal masking possible considering
429              both the ATH and the loudness-dependent slope of the spreading function)
430           3) above the PCM quantization noise floor
431        */
432        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
433           bandwidth = b;
434     }
435     if (tonal->count<=2)
436        bandwidth = 20;
437     frame_loudness = 20*(float)log10(frame_loudness);
438     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
439     tonal->lowECount *= (1-alphaE);
440     if (frame_loudness < tonal->Etracker-30)
441        tonal->lowECount += alphaE;
442
443     for (i=0;i<8;i++)
444     {
445        float sum=0;
446        for (b=0;b<16;b++)
447           sum += dct_table[i*16+b]*logE[b];
448        BFCC[i] = sum;
449     }
450
451     frame_stationarity /= NB_TBANDS;
452     relativeE /= NB_TBANDS;
453     if (tonal->count<10)
454        relativeE = .5;
455     frame_noisiness /= NB_TBANDS;
456 #if 1
457     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
458 #else
459     info->activity = .5*(1+frame_noisiness-frame_stationarity);
460 #endif
461     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
462     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
463     tonal->prev_tonality = frame_tonality;
464
465     slope /= 8*8;
466     info->tonality_slope = slope;
467
468     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
469     tonal->count++;
470     info->tonality = frame_tonality;
471
472     for (i=0;i<4;i++)
473        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
474
475     for (i=0;i<4;i++)
476        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
477
478     for (i=0;i<4;i++)
479         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
480     for (i=0;i<3;i++)
481         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
482
483     if (tonal->count > 5)
484     {
485        for (i=0;i<9;i++)
486           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
487     }
488
489     for (i=0;i<8;i++)
490     {
491        tonal->mem[i+24] = tonal->mem[i+16];
492        tonal->mem[i+16] = tonal->mem[i+8];
493        tonal->mem[i+8] = tonal->mem[i];
494        tonal->mem[i] = BFCC[i];
495     }
496     for (i=0;i<9;i++)
497        features[11+i] = (float)sqrt(tonal->std[i]);
498     features[20] = info->tonality;
499     features[21] = info->activity;
500     features[22] = frame_stationarity;
501     features[23] = info->tonality_slope;
502     features[24] = tonal->lowECount;
503
504 #ifndef DISABLE_FLOAT_API
505     mlp_process(&net, features, frame_probs);
506     frame_probs[0] = .5f*(frame_probs[0]+1);
507     /* Curve fitting between the MLP probability and the actual probability */
508     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
509     /* Probability of active audio (as opposed to silence) */
510     frame_probs[1] = .5f*frame_probs[1]+.5f;
511     /* Consider that silence has a 50-50 probability. */
512     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
513
514     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
515     {
516        /* Probability of state transition */
517        float tau;
518        /* Represents independence of the MLP probabilities, where
519           beta=1 means fully independent. */
520        float beta;
521        /* Denormalized probability of speech (p0) and music (p1) after update */
522        float p0, p1;
523        /* Probabilities for "all speech" and "all music" */
524        float s0, m0;
525        /* Probability sum for renormalisation */
526        float psum;
527        /* Instantaneous probability of speech and music, with beta pre-applied. */
528        float speech0;
529        float music0;
530
531        /* One transition every 3 minutes of active audio */
532        tau = .00005f*frame_probs[1];
533        beta = .05f;
534        if (1) {
535           /* Adapt beta based on how "unexpected" the new prob is */
536           float p, q;
537           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
538           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
539           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
540        }
541        /* p0 and p1 are the probabilities of speech and music at this frame
542           using only information from previous frame and applying the
543           state transition model */
544        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
545        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
546        /* We apply the current probability with exponent beta to work around
547           the fact that the probability estimates aren't independent. */
548        p0 *= (float)pow(1-frame_probs[0], beta);
549        p1 *= (float)pow(frame_probs[0], beta);
550        /* Normalise the probabilities to get the Marokv probability of music. */
551        tonal->music_prob = p1/(p0+p1);
552        info->music_prob = tonal->music_prob;
553
554        /* This chunk of code deals with delayed decision. */
555        psum=1e-20f;
556        /* Instantaneous probability of speech and music, with beta pre-applied. */
557        speech0 = (float)pow(1-frame_probs[0], beta);
558        music0  = (float)pow(frame_probs[0], beta);
559        if (tonal->count==1)
560        {
561           tonal->pspeech[0]=.5;
562           tonal->pmusic [0]=.5;
563        }
564        /* Updated probability of having only speech (s0) or only music (m0),
565           before considering the new observation. */
566        s0 = tonal->pspeech[0] + tonal->pspeech[1];
567        m0 = tonal->pmusic [0] + tonal->pmusic [1];
568        /* Updates s0 and m0 with instantaneous probability. */
569        tonal->pspeech[0] = s0*(1-tau)*speech0;
570        tonal->pmusic [0] = m0*(1-tau)*music0;
571        /* Propagate the transition probabilities */
572        for (i=1;i<DETECT_SIZE-1;i++)
573        {
574           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
575           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
576        }
577        /* Probability that the latest frame is speech, when all the previous ones were music. */
578        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
579        /* Probability that the latest frame is music, when all the previous ones were speech. */
580        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
581
582        /* Renormalise probabilities to 1 */
583        for (i=0;i<DETECT_SIZE;i++)
584           psum += tonal->pspeech[i] + tonal->pmusic[i];
585        psum = 1.f/psum;
586        for (i=0;i<DETECT_SIZE;i++)
587        {
588           tonal->pspeech[i] *= psum;
589           tonal->pmusic [i] *= psum;
590        }
591        psum = tonal->pmusic[0];
592        for (i=1;i<DETECT_SIZE;i++)
593           psum += tonal->pspeech[i];
594
595        /* Estimate our confidence in the speech/music decisions */
596        if (frame_probs[1]>.75)
597        {
598           if (tonal->music_prob>.9)
599           {
600              float adapt;
601              adapt = 1.f/(++tonal->music_confidence_count);
602              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
603              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
604           }
605           if (tonal->music_prob<.1)
606           {
607              float adapt;
608              adapt = 1.f/(++tonal->speech_confidence_count);
609              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
610              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
611           }
612        } else {
613           if (tonal->music_confidence_count==0)
614              tonal->music_confidence = .9f;
615           if (tonal->speech_confidence_count==0)
616              tonal->speech_confidence = .1f;
617        }
618     }
619     if (tonal->last_music != (tonal->music_prob>.5f))
620        tonal->last_transition=0;
621     tonal->last_music = tonal->music_prob>.5f;
622 #else
623     info->music_prob = 0;
624 #endif
625     /*for (i=0;i<25;i++)
626        printf("%f ", features[i]);
627     printf("\n");*/
628
629     info->bandwidth = bandwidth;
630     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
631     info->noisiness = frame_noisiness;
632     info->valid = 1;
633     RESTORE_STACK;
634 }
635
636 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
637                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
638                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
639 {
640    int offset;
641    int pcm_len;
642
643    if (analysis_pcm != NULL)
644    {
645       /* Avoid overflow/wrap-around of the analysis buffer */
646       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
647
648       pcm_len = analysis_frame_size - analysis->analysis_offset;
649       offset = analysis->analysis_offset;
650       do {
651          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
652          offset += 480;
653          pcm_len -= 480;
654       } while (pcm_len>0);
655       analysis->analysis_offset = analysis_frame_size;
656
657       analysis->analysis_offset -= frame_size;
658    }
659
660    analysis_info->valid = 0;
661    tonality_get_info(analysis, analysis_info, frame_size);
662 }