f8187d03564b654f350aed7bad30e5fdbeafd84e
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    /* Should avoid underflow on the values we'll get */
119    if (ABS16(x)+ABS16(y)<1e-9f)
120    {
121       x*=1e12f;
122       y*=1e12f;
123    }
124    x2 = x*x;
125    y2 = y*y;
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       if (den!=0)
129          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130       else
131          return (y<0 ? -cE : cE);
132    }else{
133       float den = (x2 + cB*y2) * (x2 + cC*y2);
134       if (den!=0)
135          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136       else
137          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138    }
139 }
140
141 void tonality_analysis_init(TonalityAnalysisState *tonal)
142 {
143   /* Initialize reusable fields. */
144   tonal->arch = opus_select_arch();
145   /* Other fields will be overwritten in use. */
146 }
147
148 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
149 {
150    int pos;
151    int curr_lookahead;
152    float psum;
153    int i;
154
155    pos = tonal->read_pos;
156    curr_lookahead = tonal->write_pos-tonal->read_pos;
157    if (curr_lookahead<0)
158       curr_lookahead += DETECT_SIZE;
159
160    if (len > 480 && pos != tonal->write_pos)
161    {
162       pos++;
163       if (pos==DETECT_SIZE)
164          pos=0;
165    }
166    if (pos == tonal->write_pos)
167       pos--;
168    if (pos<0)
169       pos = DETECT_SIZE-1;
170    OPUS_COPY(info_out, &tonal->info[pos], 1);
171    tonal->read_subframe += len/120;
172    while (tonal->read_subframe>=4)
173    {
174       tonal->read_subframe -= 4;
175       tonal->read_pos++;
176    }
177    if (tonal->read_pos>=DETECT_SIZE)
178       tonal->read_pos-=DETECT_SIZE;
179
180    /* Compensate for the delay in the features themselves.
181       FIXME: Need a better estimate the 10 I just made up */
182    curr_lookahead = IMAX(curr_lookahead-10, 0);
183
184    psum=0;
185    /* Summing the probability of transition patterns that involve music at
186       time (DETECT_SIZE-curr_lookahead-1) */
187    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
188       psum += tonal->pmusic[i];
189    for (;i<DETECT_SIZE;i++)
190       psum += tonal->pspeech[i];
191    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
192    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
193
194    info_out->music_prob = psum;
195 }
196
197 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
198 {
199     int i, b;
200     const kiss_fft_state *kfft;
201     VARDECL(kiss_fft_cpx, in);
202     VARDECL(kiss_fft_cpx, out);
203     int N = 480, N2=240;
204     float * OPUS_RESTRICT A = tonal->angle;
205     float * OPUS_RESTRICT dA = tonal->d_angle;
206     float * OPUS_RESTRICT d2A = tonal->d2_angle;
207     VARDECL(float, tonality);
208     VARDECL(float, noisiness);
209     float band_tonality[NB_TBANDS];
210     float logE[NB_TBANDS];
211     float BFCC[8];
212     float features[25];
213     float frame_tonality;
214     float max_frame_tonality;
215     /*float tw_sum=0;*/
216     float frame_noisiness;
217     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
218     float slope=0;
219     float frame_stationarity;
220     float relativeE;
221     float frame_probs[2];
222     float alpha, alphaE, alphaE2;
223     float frame_loudness;
224     float bandwidth_mask;
225     int bandwidth=0;
226     float maxE = 0;
227     float noise_floor;
228     int remaining;
229     AnalysisInfo *info;
230     SAVE_STACK;
231
232     tonal->last_transition++;
233     alpha = 1.f/IMIN(20, 1+tonal->count);
234     alphaE = 1.f/IMIN(50, 1+tonal->count);
235     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
236
237     if (tonal->count<4)
238        tonal->music_prob = .5;
239     kfft = celt_mode->mdct.kfft[0];
240     if (tonal->count==0)
241        tonal->mem_fill = 240;
242     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
243     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
244     {
245        tonal->mem_fill += len;
246        /* Don't have enough to update the analysis */
247        RESTORE_STACK;
248        return;
249     }
250     info = &tonal->info[tonal->write_pos++];
251     if (tonal->write_pos>=DETECT_SIZE)
252        tonal->write_pos-=DETECT_SIZE;
253
254     ALLOC(in, 480, kiss_fft_cpx);
255     ALLOC(out, 480, kiss_fft_cpx);
256     ALLOC(tonality, 240, float);
257     ALLOC(noisiness, 240, float);
258     for (i=0;i<N2;i++)
259     {
260        float w = analysis_window[i];
261        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
262        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
263        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
264        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
265     }
266     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
267     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
268     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
269     tonal->mem_fill = 240 + remaining;
270     opus_fft(kfft, in, out, tonal->arch);
271 #ifndef FIXED_POINT
272     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
273     if (celt_isnan(out[0].r))
274     {
275        info->valid = 0;
276        RESTORE_STACK;
277        return;
278     }
279 #endif
280
281     for (i=1;i<N2;i++)
282     {
283        float X1r, X2r, X1i, X2i;
284        float angle, d_angle, d2_angle;
285        float angle2, d_angle2, d2_angle2;
286        float mod1, mod2, avg_mod;
287        X1r = (float)out[i].r+out[N-i].r;
288        X1i = (float)out[i].i-out[N-i].i;
289        X2r = (float)out[i].i+out[N-i].i;
290        X2i = (float)out[N-i].r-out[i].r;
291
292        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
293        d_angle = angle - A[i];
294        d2_angle = d_angle - dA[i];
295
296        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
297        d_angle2 = angle2 - angle;
298        d2_angle2 = d_angle2 - d_angle;
299
300        mod1 = d2_angle - (float)floor(.5+d2_angle);
301        noisiness[i] = ABS16(mod1);
302        mod1 *= mod1;
303        mod1 *= mod1;
304
305        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
306        noisiness[i] += ABS16(mod2);
307        mod2 *= mod2;
308        mod2 *= mod2;
309
310        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
311        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
312
313        A[i] = angle2;
314        dA[i] = d_angle2;
315        d2A[i] = mod2;
316     }
317
318     frame_tonality = 0;
319     max_frame_tonality = 0;
320     /*tw_sum = 0;*/
321     info->activity = 0;
322     frame_noisiness = 0;
323     frame_stationarity = 0;
324     if (!tonal->count)
325     {
326        for (b=0;b<NB_TBANDS;b++)
327        {
328           tonal->lowE[b] = 1e10;
329           tonal->highE[b] = -1e10;
330        }
331     }
332     relativeE = 0;
333     frame_loudness = 0;
334     for (b=0;b<NB_TBANDS;b++)
335     {
336        float E=0, tE=0, nE=0;
337        float L1, L2;
338        float stationarity;
339        for (i=tbands[b];i<tbands[b+1];i++)
340        {
341           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
342                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
343 #ifdef FIXED_POINT
344           /* FIXME: It's probably best to change the BFCC filter initial state instead */
345           binE *= 5.55e-17f;
346 #endif
347           E += binE;
348           tE += binE*tonality[i];
349           nE += binE*2.f*(.5f-noisiness[i]);
350        }
351 #ifndef FIXED_POINT
352        /* Check for extreme band energies that could cause NaNs later. */
353        if (!(E<1e9f) || celt_isnan(E))
354        {
355           info->valid = 0;
356           RESTORE_STACK;
357           return;
358        }
359 #endif
360
361        tonal->E[tonal->E_count][b] = E;
362        frame_noisiness += nE/(1e-15f+E);
363
364        frame_loudness += (float)sqrt(E+1e-10f);
365        logE[b] = (float)log(E+1e-10f);
366        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
367        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
368        if (tonal->highE[b] < tonal->lowE[b]+1.f)
369        {
370           tonal->highE[b]+=.5f;
371           tonal->lowE[b]-=.5f;
372        }
373        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
374
375        L1=L2=0;
376        for (i=0;i<NB_FRAMES;i++)
377        {
378           L1 += (float)sqrt(tonal->E[i][b]);
379           L2 += tonal->E[i][b];
380        }
381
382        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
383        stationarity *= stationarity;
384        stationarity *= stationarity;
385        frame_stationarity += stationarity;
386        /*band_tonality[b] = tE/(1e-15+E)*/;
387        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
388 #if 0
389        if (b>=NB_TONAL_SKIP_BANDS)
390        {
391           frame_tonality += tweight[b]*band_tonality[b];
392           tw_sum += tweight[b];
393        }
394 #else
395        frame_tonality += band_tonality[b];
396        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
397           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
398 #endif
399        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
400        slope += band_tonality[b]*(b-8);
401        /*printf("%f %f ", band_tonality[b], stationarity);*/
402        tonal->prev_band_tonality[b] = band_tonality[b];
403     }
404
405     bandwidth_mask = 0;
406     bandwidth = 0;
407     maxE = 0;
408     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
409 #ifdef FIXED_POINT
410     noise_floor *= 1<<(15+SIG_SHIFT);
411 #endif
412     noise_floor *= noise_floor;
413     for (b=0;b<NB_TOT_BANDS;b++)
414     {
415        float E=0;
416        int band_start, band_end;
417        /* Keep a margin of 300 Hz for aliasing */
418        band_start = extra_bands[b];
419        band_end = extra_bands[b+1];
420        for (i=band_start;i<band_end;i++)
421        {
422           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
423                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
424           E += binE;
425        }
426        maxE = MAX32(maxE, E);
427        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
428        E = MAX32(E, tonal->meanE[b]);
429        /* Use a simple follower with 13 dB/Bark slope for spreading function */
430        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
431        /* Consider the band "active" only if all these conditions are met:
432           1) less than 10 dB below the simple follower
433           2) less than 90 dB below the peak band (maximal masking possible considering
434              both the ATH and the loudness-dependent slope of the spreading function)
435           3) above the PCM quantization noise floor
436        */
437        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
438           bandwidth = b;
439     }
440     if (tonal->count<=2)
441        bandwidth = 20;
442     frame_loudness = 20*(float)log10(frame_loudness);
443     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
444     tonal->lowECount *= (1-alphaE);
445     if (frame_loudness < tonal->Etracker-30)
446        tonal->lowECount += alphaE;
447
448     for (i=0;i<8;i++)
449     {
450        float sum=0;
451        for (b=0;b<16;b++)
452           sum += dct_table[i*16+b]*logE[b];
453        BFCC[i] = sum;
454     }
455
456     frame_stationarity /= NB_TBANDS;
457     relativeE /= NB_TBANDS;
458     if (tonal->count<10)
459        relativeE = .5;
460     frame_noisiness /= NB_TBANDS;
461 #if 1
462     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
463 #else
464     info->activity = .5*(1+frame_noisiness-frame_stationarity);
465 #endif
466     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
467     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
468     tonal->prev_tonality = frame_tonality;
469
470     slope /= 8*8;
471     info->tonality_slope = slope;
472
473     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
474     tonal->count++;
475     info->tonality = frame_tonality;
476
477     for (i=0;i<4;i++)
478        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
479
480     for (i=0;i<4;i++)
481        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
482
483     for (i=0;i<4;i++)
484         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
485     for (i=0;i<3;i++)
486         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
487
488     if (tonal->count > 5)
489     {
490        for (i=0;i<9;i++)
491           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
492     }
493
494     for (i=0;i<8;i++)
495     {
496        tonal->mem[i+24] = tonal->mem[i+16];
497        tonal->mem[i+16] = tonal->mem[i+8];
498        tonal->mem[i+8] = tonal->mem[i];
499        tonal->mem[i] = BFCC[i];
500     }
501     for (i=0;i<9;i++)
502        features[11+i] = (float)sqrt(tonal->std[i]);
503     features[20] = info->tonality;
504     features[21] = info->activity;
505     features[22] = frame_stationarity;
506     features[23] = info->tonality_slope;
507     features[24] = tonal->lowECount;
508
509 #ifndef DISABLE_FLOAT_API
510     mlp_process(&net, features, frame_probs);
511     frame_probs[0] = .5f*(frame_probs[0]+1);
512     /* Curve fitting between the MLP probability and the actual probability */
513     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
514     /* Probability of active audio (as opposed to silence) */
515     frame_probs[1] = .5f*frame_probs[1]+.5f;
516     /* Consider that silence has a 50-50 probability. */
517     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
518
519     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
520     {
521        /* Probability of state transition */
522        float tau;
523        /* Represents independence of the MLP probabilities, where
524           beta=1 means fully independent. */
525        float beta;
526        /* Denormalized probability of speech (p0) and music (p1) after update */
527        float p0, p1;
528        /* Probabilities for "all speech" and "all music" */
529        float s0, m0;
530        /* Probability sum for renormalisation */
531        float psum;
532        /* Instantaneous probability of speech and music, with beta pre-applied. */
533        float speech0;
534        float music0;
535
536        /* One transition every 3 minutes of active audio */
537        tau = .00005f*frame_probs[1];
538        beta = .05f;
539        if (1) {
540           /* Adapt beta based on how "unexpected" the new prob is */
541           float p, q;
542           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
543           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
544           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
545        }
546        /* p0 and p1 are the probabilities of speech and music at this frame
547           using only information from previous frame and applying the
548           state transition model */
549        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
550        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
551        /* We apply the current probability with exponent beta to work around
552           the fact that the probability estimates aren't independent. */
553        p0 *= (float)pow(1-frame_probs[0], beta);
554        p1 *= (float)pow(frame_probs[0], beta);
555        /* Normalise the probabilities to get the Marokv probability of music. */
556        tonal->music_prob = p1/(p0+p1);
557        info->music_prob = tonal->music_prob;
558
559        /* This chunk of code deals with delayed decision. */
560        psum=1e-20f;
561        /* Instantaneous probability of speech and music, with beta pre-applied. */
562        speech0 = (float)pow(1-frame_probs[0], beta);
563        music0  = (float)pow(frame_probs[0], beta);
564        if (tonal->count==1)
565        {
566           tonal->pspeech[0]=.5;
567           tonal->pmusic [0]=.5;
568        }
569        /* Updated probability of having only speech (s0) or only music (m0),
570           before considering the new observation. */
571        s0 = tonal->pspeech[0] + tonal->pspeech[1];
572        m0 = tonal->pmusic [0] + tonal->pmusic [1];
573        /* Updates s0 and m0 with instantaneous probability. */
574        tonal->pspeech[0] = s0*(1-tau)*speech0;
575        tonal->pmusic [0] = m0*(1-tau)*music0;
576        /* Propagate the transition probabilities */
577        for (i=1;i<DETECT_SIZE-1;i++)
578        {
579           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
580           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
581        }
582        /* Probability that the latest frame is speech, when all the previous ones were music. */
583        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
584        /* Probability that the latest frame is music, when all the previous ones were speech. */
585        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
586
587        /* Renormalise probabilities to 1 */
588        for (i=0;i<DETECT_SIZE;i++)
589           psum += tonal->pspeech[i] + tonal->pmusic[i];
590        psum = 1.f/psum;
591        for (i=0;i<DETECT_SIZE;i++)
592        {
593           tonal->pspeech[i] *= psum;
594           tonal->pmusic [i] *= psum;
595        }
596        psum = tonal->pmusic[0];
597        for (i=1;i<DETECT_SIZE;i++)
598           psum += tonal->pspeech[i];
599
600        /* Estimate our confidence in the speech/music decisions */
601        if (frame_probs[1]>.75)
602        {
603           if (tonal->music_prob>.9)
604           {
605              float adapt;
606              adapt = 1.f/(++tonal->music_confidence_count);
607              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
608              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
609           }
610           if (tonal->music_prob<.1)
611           {
612              float adapt;
613              adapt = 1.f/(++tonal->speech_confidence_count);
614              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
615              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
616           }
617        } else {
618           if (tonal->music_confidence_count==0)
619              tonal->music_confidence = .9f;
620           if (tonal->speech_confidence_count==0)
621              tonal->speech_confidence = .1f;
622        }
623     }
624     if (tonal->last_music != (tonal->music_prob>.5f))
625        tonal->last_transition=0;
626     tonal->last_music = tonal->music_prob>.5f;
627 #else
628     info->music_prob = 0;
629 #endif
630     /*for (i=0;i<25;i++)
631        printf("%f ", features[i]);
632     printf("\n");*/
633
634     info->bandwidth = bandwidth;
635     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
636     info->noisiness = frame_noisiness;
637     info->valid = 1;
638     RESTORE_STACK;
639 }
640
641 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
642                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
643                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
644 {
645    int offset;
646    int pcm_len;
647
648    if (analysis_pcm != NULL)
649    {
650       /* Avoid overflow/wrap-around of the analysis buffer */
651       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
652
653       pcm_len = analysis_frame_size - analysis->analysis_offset;
654       offset = analysis->analysis_offset;
655       do {
656          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
657          offset += 480;
658          pcm_len -= 480;
659       } while (pcm_len>0);
660       analysis->analysis_offset = analysis_frame_size;
661
662       analysis->analysis_offset -= frame_size;
663    }
664
665    analysis_info->valid = 0;
666    tonality_get_info(analysis, analysis_info, frame_size);
667 }