62b3937357f0500bacd1528777ed0ea763d2de70
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #include <stdio.h>
35
36 #include "mathops.h"
37 #include "kiss_fft.h"
38 #include "celt.h"
39 #include "modes.h"
40 #include "arch.h"
41 #include "quant_bands.h"
42 #include "analysis.h"
43 #include "mlp.h"
44 #include "stack_alloc.h"
45
46 #ifndef M_PI
47 #define M_PI 3.141592653
48 #endif
49
50 static const float dct_table[128] = {
51         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
52         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
53         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
54        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
55         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
56        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
57         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
58         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
59         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
60         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
61         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
62        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
63         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
64        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
65         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
66         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
67 };
68
69 static const float analysis_window[240] = {
70       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
71       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
72       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
73       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
74       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
75       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
76       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
77       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
78       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
79       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
80       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
81       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
82       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
83       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
84       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
85       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
86       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
87       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
88       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
89       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
90       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
91       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
92       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
93       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
94       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
95       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
96       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
97       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
98       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
99       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
100 };
101
102 static const int tbands[NB_TBANDS+1] = {
103        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
104 };
105
106 static const int extra_bands[NB_TOT_BANDS+1] = {
107       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
108 };
109
110 /*static const float tweight[NB_TBANDS+1] = {
111       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
112 };*/
113
114 #define NB_TONAL_SKIP_BANDS 9
115
116
117 void tonality_analysis_init(TonalityAnalysisState *tonal)
118 {
119   /* Initialize reusable fields. */
120   tonal->arch = opus_select_arch();
121   /* Clear remaining fields. */
122   tonality_analysis_reset(tonal);
123 }
124
125 void tonality_analysis_reset(TonalityAnalysisState *tonal)
126 {
127   /* Clear non-reusable fields. */
128   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
129   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
130 }
131
132 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
133 {
134    int pos;
135    int curr_lookahead;
136    float psum;
137    int i;
138
139    pos = tonal->read_pos;
140    curr_lookahead = tonal->write_pos-tonal->read_pos;
141    if (curr_lookahead<0)
142       curr_lookahead += DETECT_SIZE;
143
144    if (len > 480 && pos != tonal->write_pos)
145    {
146       pos++;
147       if (pos==DETECT_SIZE)
148          pos=0;
149    }
150    if (pos == tonal->write_pos)
151       pos--;
152    if (pos<0)
153       pos = DETECT_SIZE-1;
154    OPUS_COPY(info_out, &tonal->info[pos], 1);
155    tonal->read_subframe += len/120;
156    while (tonal->read_subframe>=4)
157    {
158       tonal->read_subframe -= 4;
159       tonal->read_pos++;
160    }
161    if (tonal->read_pos>=DETECT_SIZE)
162       tonal->read_pos-=DETECT_SIZE;
163
164    /* Compensate for the delay in the features themselves.
165       FIXME: Need a better estimate the 10 I just made up */
166    curr_lookahead = IMAX(curr_lookahead-10, 0);
167
168    psum=0;
169    /* Summing the probability of transition patterns that involve music at
170       time (DETECT_SIZE-curr_lookahead-1) */
171    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
172       psum += tonal->pmusic[i];
173    for (;i<DETECT_SIZE;i++)
174       psum += tonal->pspeech[i];
175    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
176    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
177
178    info_out->music_prob = psum;
179 }
180
181 static const float std_feature_bias[9] = {
182       5.684947, 3.475288, 1.770634, 1.599784, 3.773215,
183       2.163313, 1.260756, 1.116868, 1.918795
184 };
185
186 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
187 {
188     int i, b;
189     const kiss_fft_state *kfft;
190     VARDECL(kiss_fft_cpx, in);
191     VARDECL(kiss_fft_cpx, out);
192     int N = 480, N2=240;
193     float * OPUS_RESTRICT A = tonal->angle;
194     float * OPUS_RESTRICT dA = tonal->d_angle;
195     float * OPUS_RESTRICT d2A = tonal->d2_angle;
196     VARDECL(float, tonality);
197     VARDECL(float, noisiness);
198     float band_tonality[NB_TBANDS];
199     float logE[NB_TBANDS];
200     float BFCC[8];
201     float features[25];
202     float frame_tonality;
203     float max_frame_tonality;
204     /*float tw_sum=0;*/
205     float frame_noisiness;
206     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
207     float slope=0;
208     float frame_stationarity;
209     float relativeE;
210     float frame_probs[2];
211     float alpha, alphaE, alphaE2;
212     float frame_loudness;
213     float bandwidth_mask;
214     int bandwidth=0;
215     float maxE = 0;
216     float noise_floor;
217     int remaining;
218     AnalysisInfo *info;
219     SAVE_STACK;
220
221     tonal->last_transition++;
222     alpha = 1.f/IMIN(20, 1+tonal->count);
223     alphaE = 1.f/IMIN(50, 1+tonal->count);
224     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
225
226     if (tonal->count<4)
227        tonal->music_prob = .5;
228     kfft = celt_mode->mdct.kfft[0];
229     if (tonal->count==0)
230        tonal->mem_fill = 240;
231     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
232     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
233     {
234        tonal->mem_fill += len;
235        /* Don't have enough to update the analysis */
236        RESTORE_STACK;
237        return;
238     }
239     info = &tonal->info[tonal->write_pos++];
240     if (tonal->write_pos>=DETECT_SIZE)
241        tonal->write_pos-=DETECT_SIZE;
242
243     ALLOC(in, 480, kiss_fft_cpx);
244     ALLOC(out, 480, kiss_fft_cpx);
245     ALLOC(tonality, 240, float);
246     ALLOC(noisiness, 240, float);
247     for (i=0;i<N2;i++)
248     {
249        float w = analysis_window[i];
250        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
251        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
252        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
253        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
254     }
255     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
256     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
257     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
258     tonal->mem_fill = 240 + remaining;
259     opus_fft(kfft, in, out, tonal->arch);
260 #ifndef FIXED_POINT
261     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
262     if (celt_isnan(out[0].r))
263     {
264        info->valid = 0;
265        RESTORE_STACK;
266        return;
267     }
268 #endif
269
270     for (i=1;i<N2;i++)
271     {
272        float X1r, X2r, X1i, X2i;
273        float angle, d_angle, d2_angle;
274        float angle2, d_angle2, d2_angle2;
275        float mod1, mod2, avg_mod;
276        X1r = (float)out[i].r+out[N-i].r;
277        X1i = (float)out[i].i-out[N-i].i;
278        X2r = (float)out[i].i+out[N-i].i;
279        X2i = (float)out[N-i].r-out[i].r;
280
281        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
282        d_angle = angle - A[i];
283        d2_angle = d_angle - dA[i];
284
285        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
286        d_angle2 = angle2 - angle;
287        d2_angle2 = d_angle2 - d_angle;
288
289        mod1 = d2_angle - (float)floor(.5+d2_angle);
290        noisiness[i] = ABS16(mod1);
291        mod1 *= mod1;
292        mod1 *= mod1;
293
294        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
295        noisiness[i] += ABS16(mod2);
296        mod2 *= mod2;
297        mod2 *= mod2;
298
299        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
300        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
301
302        A[i] = angle2;
303        dA[i] = d_angle2;
304        d2A[i] = mod2;
305     }
306
307     frame_tonality = 0;
308     max_frame_tonality = 0;
309     /*tw_sum = 0;*/
310     info->activity = 0;
311     frame_noisiness = 0;
312     frame_stationarity = 0;
313     if (!tonal->count)
314     {
315        for (b=0;b<NB_TBANDS;b++)
316        {
317           tonal->lowE[b] = 1e10;
318           tonal->highE[b] = -1e10;
319        }
320     }
321     relativeE = 0;
322     frame_loudness = 0;
323     for (b=0;b<NB_TBANDS;b++)
324     {
325        float E=0, tE=0, nE=0;
326        float L1, L2;
327        float stationarity;
328        for (i=tbands[b];i<tbands[b+1];i++)
329        {
330           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
331                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
332 #ifdef FIXED_POINT
333           /* FIXME: It's probably best to change the BFCC filter initial state instead */
334           binE *= 5.55e-17f;
335 #endif
336           E += binE;
337           tE += binE*tonality[i];
338           nE += binE*2.f*(.5f-noisiness[i]);
339        }
340 #ifndef FIXED_POINT
341        /* Check for extreme band energies that could cause NaNs later. */
342        if (!(E<1e9f) || celt_isnan(E))
343        {
344           info->valid = 0;
345           RESTORE_STACK;
346           return;
347        }
348 #endif
349
350        tonal->E[tonal->E_count][b] = E;
351        frame_noisiness += nE/(1e-15f+E);
352
353        frame_loudness += (float)sqrt(E+1e-10f);
354        logE[b] = (float)log(E+1e-10f);
355        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
356        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
357        if (tonal->highE[b] < tonal->lowE[b]+1.f)
358        {
359           tonal->highE[b]+=.5f;
360           tonal->lowE[b]-=.5f;
361        }
362        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
363
364        L1=L2=0;
365        for (i=0;i<NB_FRAMES;i++)
366        {
367           L1 += (float)sqrt(tonal->E[i][b]);
368           L2 += tonal->E[i][b];
369        }
370
371        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
372        stationarity *= stationarity;
373        stationarity *= stationarity;
374        frame_stationarity += stationarity;
375        /*band_tonality[b] = tE/(1e-15+E)*/;
376        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
377 #if 0
378        if (b>=NB_TONAL_SKIP_BANDS)
379        {
380           frame_tonality += tweight[b]*band_tonality[b];
381           tw_sum += tweight[b];
382        }
383 #else
384        frame_tonality += band_tonality[b];
385        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
386           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
387 #endif
388        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
389        slope += band_tonality[b]*(b-8);
390        /*printf("%f %f ", band_tonality[b], stationarity);*/
391        tonal->prev_band_tonality[b] = band_tonality[b];
392     }
393
394     bandwidth_mask = 0;
395     bandwidth = 0;
396     maxE = 0;
397     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
398 #ifdef FIXED_POINT
399     noise_floor *= 1<<(15+SIG_SHIFT);
400 #endif
401     noise_floor *= noise_floor;
402     for (b=0;b<NB_TOT_BANDS;b++)
403     {
404        float E=0;
405        int band_start, band_end;
406        /* Keep a margin of 300 Hz for aliasing */
407        band_start = extra_bands[b];
408        band_end = extra_bands[b+1];
409        for (i=band_start;i<band_end;i++)
410        {
411           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
412                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
413           E += binE;
414        }
415        maxE = MAX32(maxE, E);
416        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
417        E = MAX32(E, tonal->meanE[b]);
418        /* Use a simple follower with 13 dB/Bark slope for spreading function */
419        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
420        /* Consider the band "active" only if all these conditions are met:
421           1) less than 10 dB below the simple follower
422           2) less than 90 dB below the peak band (maximal masking possible considering
423              both the ATH and the loudness-dependent slope of the spreading function)
424           3) above the PCM quantization noise floor
425        */
426        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
427           bandwidth = b;
428     }
429     if (tonal->count<=2)
430        bandwidth = 20;
431     frame_loudness = 20*(float)log10(frame_loudness);
432     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
433     tonal->lowECount *= (1-alphaE);
434     if (frame_loudness < tonal->Etracker-30)
435        tonal->lowECount += alphaE;
436
437     for (i=0;i<8;i++)
438     {
439        float sum=0;
440        for (b=0;b<16;b++)
441           sum += dct_table[i*16+b]*logE[b];
442        BFCC[i] = sum;
443     }
444
445     frame_stationarity /= NB_TBANDS;
446     relativeE /= NB_TBANDS;
447     if (tonal->count<10)
448        relativeE = .5;
449     frame_noisiness /= NB_TBANDS;
450 #if 1
451     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
452 #else
453     info->activity = .5*(1+frame_noisiness-frame_stationarity);
454 #endif
455     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
456     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
457     tonal->prev_tonality = frame_tonality;
458
459     slope /= 8*8;
460     info->tonality_slope = slope;
461
462     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
463     tonal->count++;
464     info->tonality = frame_tonality;
465
466     for (i=0;i<4;i++)
467        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
468
469     for (i=0;i<4;i++)
470        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
471
472     for (i=0;i<4;i++)
473         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
474     for (i=0;i<3;i++)
475         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
476
477     if (tonal->count > 5)
478     {
479        for (i=0;i<9;i++)
480           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
481     }
482
483     for (i=0;i<8;i++)
484     {
485        tonal->mem[i+24] = tonal->mem[i+16];
486        tonal->mem[i+16] = tonal->mem[i+8];
487        tonal->mem[i+8] = tonal->mem[i];
488        tonal->mem[i] = BFCC[i];
489     }
490     for (i=0;i<9;i++)
491        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
492     features[20] = info->tonality - 0.154723;
493     features[21] = info->activity - 0.724643;
494     features[22] = frame_stationarity - 0.743717;
495     features[23] = info->tonality_slope + 0.069216;
496     features[24] = tonal->lowECount - 0.067930;
497
498 #ifndef DISABLE_FLOAT_API
499     mlp_process(&net, features, frame_probs);
500     frame_probs[0] = .5f*(frame_probs[0]+1);
501     /* Curve fitting between the MLP probability and the actual probability */
502     /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
503     /* Probability of active audio (as opposed to silence) */
504     frame_probs[1] = .5f*frame_probs[1]+.5f;
505     frame_probs[1] *= frame_probs[1];
506     /* Consider that silence has a 50-50 probability. */
507     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
508
509     /* Probability of speech or music vs noise */
510     info->activity_probability = frame_probs[1];
511
512     /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
513     {
514        /* Probability of state transition */
515        float tau;
516        /* Represents independence of the MLP probabilities, where
517           beta=1 means fully independent. */
518        float beta;
519        /* Denormalized probability of speech (p0) and music (p1) after update */
520        float p0, p1;
521        /* Probabilities for "all speech" and "all music" */
522        float s0, m0;
523        /* Probability sum for renormalisation */
524        float psum;
525        /* Instantaneous probability of speech and music, with beta pre-applied. */
526        float speech0;
527        float music0;
528        float p, q;
529
530        /* One transition every 3 minutes of active audio */
531        tau = .00005f*frame_probs[1];
532        /* Adapt beta based on how "unexpected" the new prob is */
533        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
534        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
535        beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
536        /* p0 and p1 are the probabilities of speech and music at this frame
537           using only information from previous frame and applying the
538           state transition model */
539        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
540        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
541        /* We apply the current probability with exponent beta to work around
542           the fact that the probability estimates aren't independent. */
543        p0 *= (float)pow(1-frame_probs[0], beta);
544        p1 *= (float)pow(frame_probs[0], beta);
545        /* Normalise the probabilities to get the Marokv probability of music. */
546        tonal->music_prob = p1/(p0+p1);
547        info->music_prob = tonal->music_prob;
548
549        /* This chunk of code deals with delayed decision. */
550        psum=1e-20f;
551        /* Instantaneous probability of speech and music, with beta pre-applied. */
552        speech0 = (float)pow(1-frame_probs[0], beta);
553        music0  = (float)pow(frame_probs[0], beta);
554        if (tonal->count==1)
555        {
556           tonal->pspeech[0]=.5;
557           tonal->pmusic [0]=.5;
558        }
559        /* Updated probability of having only speech (s0) or only music (m0),
560           before considering the new observation. */
561        s0 = tonal->pspeech[0] + tonal->pspeech[1];
562        m0 = tonal->pmusic [0] + tonal->pmusic [1];
563        /* Updates s0 and m0 with instantaneous probability. */
564        tonal->pspeech[0] = s0*(1-tau)*speech0;
565        tonal->pmusic [0] = m0*(1-tau)*music0;
566        /* Propagate the transition probabilities */
567        for (i=1;i<DETECT_SIZE-1;i++)
568        {
569           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
570           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
571        }
572        /* Probability that the latest frame is speech, when all the previous ones were music. */
573        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
574        /* Probability that the latest frame is music, when all the previous ones were speech. */
575        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
576
577        /* Renormalise probabilities to 1 */
578        for (i=0;i<DETECT_SIZE;i++)
579           psum += tonal->pspeech[i] + tonal->pmusic[i];
580        psum = 1.f/psum;
581        for (i=0;i<DETECT_SIZE;i++)
582        {
583           tonal->pspeech[i] *= psum;
584           tonal->pmusic [i] *= psum;
585        }
586        psum = tonal->pmusic[0];
587        for (i=1;i<DETECT_SIZE;i++)
588           psum += tonal->pspeech[i];
589
590        /* Estimate our confidence in the speech/music decisions */
591        if (frame_probs[1]>.75)
592        {
593           if (tonal->music_prob>.9)
594           {
595              float adapt;
596              adapt = 1.f/(++tonal->music_confidence_count);
597              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
598              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
599           }
600           if (tonal->music_prob<.1)
601           {
602              float adapt;
603              adapt = 1.f/(++tonal->speech_confidence_count);
604              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
605              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
606           }
607        } else {
608           if (tonal->music_confidence_count==0)
609              tonal->music_confidence = .9f;
610           if (tonal->speech_confidence_count==0)
611              tonal->speech_confidence = .1f;
612        }
613     }
614     if (tonal->last_music != (tonal->music_prob>.5f))
615        tonal->last_transition=0;
616     tonal->last_music = tonal->music_prob>.5f;
617 #else
618     info->music_prob = 0;
619 #endif
620     /*for (i=0;i<25;i++)
621        printf("%f ", features[i]);
622     printf("\n");*/
623
624     info->bandwidth = bandwidth;
625     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
626     info->noisiness = frame_noisiness;
627     info->valid = 1;
628     RESTORE_STACK;
629 }
630
631 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
632                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
633                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
634 {
635    int offset;
636    int pcm_len;
637
638    if (analysis_pcm != NULL)
639    {
640       /* Avoid overflow/wrap-around of the analysis buffer */
641       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
642
643       pcm_len = analysis_frame_size - analysis->analysis_offset;
644       offset = analysis->analysis_offset;
645       do {
646          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
647          offset += 480;
648          pcm_len -= 480;
649       } while (pcm_len>0);
650       analysis->analysis_offset = analysis_frame_size;
651
652       analysis->analysis_offset -= frame_size;
653    }
654
655    analysis_info->valid = 0;
656    tonality_get_info(analysis, analysis_info, frame_size);
657 }