First pass at making the analysis code run with FIXED_POINT
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 extern const MLP net;
43
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47
48 static const float dct_table[128] = {
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66
67 static const float analysis_window[240] = {
68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99
100 static const int tbands[NB_TBANDS+1] = {
101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107
108 /*static const float tweight[NB_TBANDS+1] = {
109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111
112 #define NB_TONAL_SKIP_BANDS 9
113
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
118 static inline float fast_atan2f(float y, float x) {
119    float x2, y2;
120    /* Should avoid underflow on the values we'll get */
121    if (ABS16(x)+ABS16(y)<1e-9f)
122    {
123       x*=1e12f;
124       y*=1e12f;
125    }
126    x2 = x*x;
127    y2 = y*y;
128    if(x2<y2){
129       float den = (y2 + cB*x2) * (y2 + cC*x2);
130       if (den!=0)
131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132       else
133          return (y<0 ? -cE : cE);
134    }else{
135       float den = (x2 + cB*y2) * (x2 + cC*y2);
136       if (den!=0)
137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138       else
139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140    }
141 }
142
143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145    int pos;
146    int curr_lookahead;
147    float psum;
148    int i;
149
150    pos = tonal->read_pos;
151    curr_lookahead = tonal->write_pos-tonal->read_pos;
152    if (curr_lookahead<0)
153       curr_lookahead += DETECT_SIZE;
154
155    if (len > 480 && pos != tonal->write_pos)
156    {
157       pos++;
158       if (pos==DETECT_SIZE)
159          pos=0;
160    }
161    if (pos == tonal->write_pos)
162       pos--;
163    if (pos<0)
164       pos = DETECT_SIZE-1;
165    OPUS_COPY(info_out, &tonal->info[pos], 1);
166    tonal->read_subframe += len/120;
167    while (tonal->read_subframe>=4)
168    {
169       tonal->read_subframe -= 4;
170       tonal->read_pos++;
171    }
172    if (tonal->read_pos>=DETECT_SIZE)
173       tonal->read_pos-=DETECT_SIZE;
174
175    /* Compensate for the delay in the features themselves.
176       FIXME: Need a better estimate the 10 I just made up */
177    curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179    psum=0;
180    /* Summing the probability of transition patterns that involve music at
181       time (DETECT_SIZE-curr_lookahead-1) */
182    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
183       psum += tonal->pmusic[i];
184    for (;i<DETECT_SIZE;i++)
185       psum += tonal->pspeech[i];
186    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
187    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
188
189    info_out->music_prob = psum;
190 }
191
192 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
193 {
194     int i, b;
195     const kiss_fft_state *kfft;
196     VARDECL(kiss_fft_cpx, in);
197     VARDECL(kiss_fft_cpx, out);
198     int N = 480, N2=240;
199     float * OPUS_RESTRICT A = tonal->angle;
200     float * OPUS_RESTRICT dA = tonal->d_angle;
201     float * OPUS_RESTRICT d2A = tonal->d2_angle;
202     VARDECL(float, tonality);
203     VARDECL(float, noisiness);
204     float band_tonality[NB_TBANDS];
205     float logE[NB_TBANDS];
206     float BFCC[8];
207     float features[25];
208     float frame_tonality;
209     float max_frame_tonality;
210     /*float tw_sum=0;*/
211     float frame_noisiness;
212     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
213     float slope=0;
214     float frame_stationarity;
215     float relativeE;
216     float frame_probs[2];
217     float alpha, alphaE, alphaE2;
218     float frame_loudness;
219     float bandwidth_mask;
220     int bandwidth=0;
221     float maxE = 0;
222     float noise_floor;
223     int remaining;
224     AnalysisInfo *info;
225     SAVE_STACK;
226
227     tonal->last_transition++;
228     alpha = 1.f/IMIN(20, 1+tonal->count);
229     alphaE = 1.f/IMIN(50, 1+tonal->count);
230     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
231
232     if (tonal->count<4)
233        tonal->music_prob = .5;
234     kfft = celt_mode->mdct.kfft[0];
235     if (tonal->count==0)
236        tonal->mem_fill = 240;
237     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
238     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
239     {
240        tonal->mem_fill += len;
241        /* Don't have enough to update the analysis */
242        RESTORE_STACK;
243        return;
244     }
245     info = &tonal->info[tonal->write_pos++];
246     if (tonal->write_pos>=DETECT_SIZE)
247        tonal->write_pos-=DETECT_SIZE;
248
249     ALLOC(in, 480, kiss_fft_cpx);
250     ALLOC(out, 480, kiss_fft_cpx);
251     ALLOC(tonality, 240, float);
252     ALLOC(noisiness, 240, float);
253     for (i=0;i<N2;i++)
254     {
255        float w = analysis_window[i];
256        in[i].r = w*tonal->inmem[i];
257        in[i].i = w*tonal->inmem[N2+i];
258        in[N-i-1].r = w*tonal->inmem[N-i-1];
259        in[N-i-1].i = w*tonal->inmem[N+N2-i-1];
260     }
261     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
262     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
263     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
264     tonal->mem_fill = 240 + remaining;
265     opus_fft(kfft, in, out);
266
267     for (i=1;i<N2;i++)
268     {
269        float X1r, X2r, X1i, X2i;
270        float angle, d_angle, d2_angle;
271        float angle2, d_angle2, d2_angle2;
272        float mod1, mod2, avg_mod;
273        X1r = out[i].r+out[N-i].r;
274        X1i = out[i].i-out[N-i].i;
275        X2r = out[i].i+out[N-i].i;
276        X2i = out[N-i].r-out[i].r;
277
278        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
279        d_angle = angle - A[i];
280        d2_angle = d_angle - dA[i];
281
282        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
283        d_angle2 = angle2 - angle;
284        d2_angle2 = d_angle2 - d_angle;
285
286        mod1 = d2_angle - (float)floor(.5+d2_angle);
287        noisiness[i] = ABS16(mod1);
288        mod1 *= mod1;
289        mod1 *= mod1;
290
291        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
292        noisiness[i] += ABS16(mod2);
293        mod2 *= mod2;
294        mod2 *= mod2;
295
296        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
297        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
298
299        A[i] = angle2;
300        dA[i] = d_angle2;
301        d2A[i] = mod2;
302     }
303
304     frame_tonality = 0;
305     max_frame_tonality = 0;
306     /*tw_sum = 0;*/
307     info->activity = 0;
308     frame_noisiness = 0;
309     frame_stationarity = 0;
310     if (!tonal->count)
311     {
312        for (b=0;b<NB_TBANDS;b++)
313        {
314           tonal->lowE[b] = 1e10;
315           tonal->highE[b] = -1e10;
316        }
317     }
318     relativeE = 0;
319     frame_loudness = 0;
320     bandwidth_mask = 0;
321     for (b=0;b<NB_TBANDS;b++)
322     {
323        float E=0, tE=0, nE=0;
324        float L1, L2;
325        float stationarity;
326        for (i=tbands[b];i<tbands[b+1];i++)
327        {
328           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
329                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
330           E += binE;
331           tE += binE*tonality[i];
332           nE += binE*2.f*(.5f-noisiness[i]);
333        }
334        tonal->E[tonal->E_count][b] = E;
335        frame_noisiness += nE/(1e-15f+E);
336
337        frame_loudness += sqrt(E+1e-10f);
338        logE[b] = (float)log(E+1e-10f);
339        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
340        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
341        if (tonal->highE[b] < tonal->lowE[b]+1.f)
342        {
343           tonal->highE[b]+=.5f;
344           tonal->lowE[b]-=.5f;
345        }
346        relativeE += (logE[b]-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
347
348        L1=L2=0;
349        for (i=0;i<NB_FRAMES;i++)
350        {
351           L1 += sqrt(tonal->E[i][b]);
352           L2 += tonal->E[i][b];
353        }
354
355        stationarity = MIN16(0.99f,L1/sqrt(EPSILON+NB_FRAMES*L2));
356        stationarity *= stationarity;
357        stationarity *= stationarity;
358        frame_stationarity += stationarity;
359        /*band_tonality[b] = tE/(1e-15+E)*/;
360        band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
361 #if 0
362        if (b>=NB_TONAL_SKIP_BANDS)
363        {
364           frame_tonality += tweight[b]*band_tonality[b];
365           tw_sum += tweight[b];
366        }
367 #else
368        frame_tonality += band_tonality[b];
369        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
370           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
371 #endif
372        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
373        slope += band_tonality[b]*(b-8);
374        /*printf("%f %f ", band_tonality[b], stationarity);*/
375        tonal->prev_band_tonality[b] = band_tonality[b];
376     }
377
378     bandwidth_mask = 0;
379     bandwidth = 0;
380     maxE = 0;
381     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
382 #ifdef FIXED_POINT
383     noise_floor *= 1<<(15+SIG_SHIFT);
384 #endif
385     noise_floor *= noise_floor;
386     for (b=0;b<NB_TOT_BANDS;b++)
387     {
388        float E=0;
389        int band_start, band_end;
390        /* Keep a margin of 300 Hz for aliasing */
391        band_start = extra_bands[b];
392        band_end = extra_bands[b+1];
393        for (i=band_start;i<band_end;i++)
394        {
395           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
396                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
397           E += binE;
398        }
399        maxE = MAX32(maxE, E);
400        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
401        E = MAX32(E, tonal->meanE[b]);
402        /* Use a simple follower with 13 dB/Bark slope for spreading function */
403        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
404        /* Consider the band "active" only if all these conditions are met:
405           1) less than 10 dB below the simple follower
406           2) less than 90 dB below the peak band (maximal masking possible considering
407              both the ATH and the loudness-dependent slope of the spreading function)
408           3) above the PCM quantization noise floor
409        */
410        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
411           bandwidth = b;
412     }
413     if (tonal->count<=2)
414        bandwidth = 20;
415     frame_loudness = 20*(float)log10(frame_loudness);
416     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
417     tonal->lowECount *= (1-alphaE);
418     if (frame_loudness < tonal->Etracker-30)
419        tonal->lowECount += alphaE;
420
421     for (i=0;i<8;i++)
422     {
423        float sum=0;
424        for (b=0;b<16;b++)
425           sum += dct_table[i*16+b]*logE[b];
426        BFCC[i] = sum;
427     }
428
429     frame_stationarity /= NB_TBANDS;
430     relativeE /= NB_TBANDS;
431     if (tonal->count<10)
432        relativeE = .5;
433     frame_noisiness /= NB_TBANDS;
434 #if 1
435     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
436 #else
437     info->activity = .5*(1+frame_noisiness-frame_stationarity);
438 #endif
439     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
440     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
441     tonal->prev_tonality = frame_tonality;
442
443     slope /= 8*8;
444     info->tonality_slope = slope;
445
446     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
447     tonal->count++;
448     info->tonality = frame_tonality;
449
450     for (i=0;i<4;i++)
451        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
452
453     for (i=0;i<4;i++)
454        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
455
456     for (i=0;i<4;i++)
457         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
458     for (i=0;i<3;i++)
459         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
460
461     if (tonal->count > 5)
462     {
463        for (i=0;i<9;i++)
464           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
465     }
466
467     for (i=0;i<8;i++)
468     {
469        tonal->mem[i+24] = tonal->mem[i+16];
470        tonal->mem[i+16] = tonal->mem[i+8];
471        tonal->mem[i+8] = tonal->mem[i];
472        tonal->mem[i] = BFCC[i];
473     }
474     for (i=0;i<9;i++)
475        features[11+i] = sqrt(tonal->std[i]);
476     features[20] = info->tonality;
477     features[21] = info->activity;
478     features[22] = frame_stationarity;
479     features[23] = info->tonality_slope;
480     features[24] = tonal->lowECount;
481
482 #ifndef FIXED_POINT
483     mlp_process(&net, features, frame_probs);
484     frame_probs[0] = .5f*(frame_probs[0]+1);
485     /* Curve fitting between the MLP probability and the actual probability */
486     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
487     /* Probability of active audio (as opposed to silence) */
488     frame_probs[1] = .5f*frame_probs[1]+.5f;
489     /* Consider that silence has a 50-50 probability. */
490     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
491
492     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
493     {
494        /* Probability of state transition */
495        float tau;
496        /* Represents independence of the MLP probabilities, where
497           beta=1 means fully independent. */
498        float beta;
499        /* Denormalized probability of speech (p0) and music (p1) after update */
500        float p0, p1;
501        /* Probabilities for "all speech" and "all music" */
502        float s0, m0;
503        /* Probability sum for renormalisation */
504        float psum;
505        /* Instantaneous probability of speech and music, with beta pre-applied. */
506        float speech0;
507        float music0;
508
509        /* One transition every 3 minutes of active audio */
510        tau = .00005f*frame_probs[1];
511        beta = .05f;
512        if (1) {
513           /* Adapt beta based on how "unexpected" the new prob is */
514           float p, q;
515           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
516           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
517           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
518        }
519        /* p0 and p1 are the probabilities of speech and music at this frame
520           using only information from previous frame and applying the
521           state transition model */
522        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
523        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
524        /* We apply the current probability with exponent beta to work around
525           the fact that the probability estimates aren't independent. */
526        p0 *= (float)pow(1-frame_probs[0], beta);
527        p1 *= (float)pow(frame_probs[0], beta);
528        /* Normalise the probabilities to get the Marokv probability of music. */
529        tonal->music_prob = p1/(p0+p1);
530        info->music_prob = tonal->music_prob;
531
532        /* This chunk of code deals with delayed decision. */
533        psum=1e-20f;
534        /* Instantaneous probability of speech and music, with beta pre-applied. */
535        speech0 = (float)pow(1-frame_probs[0], beta);
536        music0  = (float)pow(frame_probs[0], beta);
537        if (tonal->count==1)
538        {
539           tonal->pspeech[0]=.5;
540           tonal->pmusic [0]=.5;
541        }
542        /* Updated probability of having only speech (s0) or only music (m0),
543           before considering the new observation. */
544        s0 = tonal->pspeech[0] + tonal->pspeech[1];
545        m0 = tonal->pmusic [0] + tonal->pmusic [1];
546        /* Updates s0 and m0 with instantaneous probability. */
547        tonal->pspeech[0] = s0*(1-tau)*speech0;
548        tonal->pmusic [0] = m0*(1-tau)*music0;
549        /* Propagate the transition probabilities */
550        for (i=1;i<DETECT_SIZE-1;i++)
551        {
552           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
553           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
554        }
555        /* Probability that the latest frame is speech, when all the previous ones were music. */
556        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
557        /* Probability that the latest frame is music, when all the previous ones were speech. */
558        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
559
560        /* Renormalise probabilities to 1 */
561        for (i=0;i<DETECT_SIZE;i++)
562           psum += tonal->pspeech[i] + tonal->pmusic[i];
563        psum = 1.f/psum;
564        for (i=0;i<DETECT_SIZE;i++)
565        {
566           tonal->pspeech[i] *= psum;
567           tonal->pmusic [i] *= psum;
568        }
569        psum = tonal->pmusic[0];
570        for (i=1;i<DETECT_SIZE;i++)
571           psum += tonal->pspeech[i];
572
573        /* Estimate our confidence in the speech/music decisions */
574        if (frame_probs[1]>.75)
575        {
576           if (tonal->music_prob>.9)
577           {
578              float adapt;
579              adapt = 1.f/(++tonal->music_confidence_count);
580              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
581              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
582           }
583           if (tonal->music_prob<.1)
584           {
585              float adapt;
586              adapt = 1.f/(++tonal->speech_confidence_count);
587              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
588              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
589           }
590        } else {
591           if (tonal->music_confidence_count==0)
592              tonal->music_confidence = .9f;
593           if (tonal->speech_confidence_count==0)
594              tonal->speech_confidence = .1f;
595        }
596        psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
597     }
598     if (tonal->last_music != (tonal->music_prob>.5f))
599        tonal->last_transition=0;
600     tonal->last_music = tonal->music_prob>.5f;
601 #else
602     info->music_prob = 0;
603 #endif
604     /*for (i=0;i<25;i++)
605        printf("%f ", features[i]);
606     printf("\n");*/
607
608     info->bandwidth = bandwidth;
609     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
610     info->noisiness = frame_noisiness;
611     info->valid = 1;
612     if (info_out!=NULL)
613        OPUS_COPY(info_out, info, 1);
614     RESTORE_STACK;
615 }
616
617 int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const opus_val16 *pcm,
618                         const void *analysis_pcm, int frame_size, int variable_duration, int c1, int c2, int C, opus_int32 Fs, int bitrate_bps,
619                         int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
620 {
621    int offset;
622    int pcm_len;
623
624    /* Avoid overflow/wrap-around of the analysis buffer */
625    frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
626
627    pcm_len = frame_size - analysis->analysis_offset;
628    offset = analysis->analysis_offset;
629    do {
630       tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
631       offset += 480;
632       pcm_len -= 480;
633    } while (pcm_len>0);
634    analysis->analysis_offset = frame_size;
635
636    if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
637    {
638       int LM = 3;
639       LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps,
640             analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
641       while ((Fs/400<<LM)>frame_size)
642          LM--;
643       frame_size = (Fs/400<<LM);
644    } else {
645       frame_size = frame_size_select(frame_size, variable_duration, Fs);
646    }
647    if (frame_size<0)
648       return -1;
649    analysis->analysis_offset -= frame_size;
650
651    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
652       they're in CELT-only mode. */
653    analysis_info->valid = 0;
654    tonality_get_info(analysis, analysis_info, frame_size);
655
656    return frame_size;
657 }