Merge branch 'exp_analysis'
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40
41 extern const MLP net;
42
43 #ifndef M_PI
44 #define M_PI 3.141592653
45 #endif
46
47 static const float dct_table[128] = {
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
51        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
52         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
53        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
54         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
55         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
59        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
60         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
61        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
62         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
63         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
64 };
65
66 static const float analysis_window[240] = {
67       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
68       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
69       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
70       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
71       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
72       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
73       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
74       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
75       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
76       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
77       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
78       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
79       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
80       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
81       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
82       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
83       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
84       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
85       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
86       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
87       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
88       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
89       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
90       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
91       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
92       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
93       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
94       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
95       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
96       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
97 };
98
99 static const int tbands[NB_TBANDS+1] = {
100        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
101 };
102
103 static const int extra_bands[NB_TOT_BANDS+1] = {
104       0, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
105 };
106
107 /*static const float tweight[NB_TBANDS+1] = {
108       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
109 };*/
110
111 #define NB_TONAL_SKIP_BANDS 9
112
113 #define cA 0.43157974f
114 #define cB 0.67848403f
115 #define cC 0.08595542f
116 #define cE ((float)M_PI/2)
117 static inline float fast_atan2f(float y, float x) {
118    float x2, y2;
119    /* Should avoid underflow on the values we'll get */
120    if (ABS16(x)+ABS16(y)<1e-9f)
121    {
122       x*=1e12f;
123       y*=1e12f;
124    }
125    x2 = x*x;
126    y2 = y*y;
127    if(x2<y2){
128       float den = (y2 + cB*x2) * (y2 + cC*x2);
129       if (den!=0)
130          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
131       else
132          return (y<0 ? -cE : cE);
133    }else{
134       float den = (x2 + cB*y2) * (x2 + cC*y2);
135       if (den!=0)
136          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
137       else
138          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
139    }
140 }
141
142 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
143 {
144    int pos;
145    int curr_lookahead;
146    float psum;
147    int i;
148
149    pos = tonal->read_pos;
150    curr_lookahead = tonal->write_pos-tonal->read_pos;
151    if (curr_lookahead<0)
152       curr_lookahead += DETECT_SIZE;
153
154    if (len > 480 && pos != tonal->write_pos)
155    {
156       pos++;
157       if (pos==DETECT_SIZE)
158          pos=0;
159    }
160    if (pos == tonal->write_pos)
161       pos--;
162    if (pos<0)
163       pos = DETECT_SIZE-1;
164    OPUS_COPY(info_out, &tonal->info[pos], 1);
165    tonal->read_subframe += len/120;
166    while (tonal->read_subframe>=4)
167    {
168       tonal->read_subframe -= 4;
169       tonal->read_pos++;
170    }
171    if (tonal->read_pos>=DETECT_SIZE)
172       tonal->read_pos-=DETECT_SIZE;
173
174    /* Compensate for the delay in the features themselves.
175       FIXME: Need a better estimate the 10 I just made up */
176    curr_lookahead = IMAX(curr_lookahead-10, 0);
177
178    psum=0;
179    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
180       psum += tonal->pmusic[i];
181    for (;i<DETECT_SIZE;i++)
182       psum += tonal->pspeech[i];
183    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
184    /*printf("%f %f\n", psum, info_out->music_prob);*/
185
186    info_out->music_prob = psum;
187 }
188
189 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
190 {
191     int i, b;
192     const kiss_fft_state *kfft;
193     kiss_fft_cpx in[480], out[480];
194     int N = 480, N2=240;
195     float * OPUS_RESTRICT A = tonal->angle;
196     float * OPUS_RESTRICT dA = tonal->d_angle;
197     float * OPUS_RESTRICT d2A = tonal->d2_angle;
198     float tonality[240];
199     float noisiness[240];
200     float band_tonality[NB_TBANDS];
201     float logE[NB_TBANDS];
202     float BFCC[8];
203     float features[100];
204     float frame_tonality;
205     float max_frame_tonality;
206     /*float tw_sum=0;*/
207     float frame_noisiness;
208     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
209     float slope=0;
210     float frame_stationarity;
211     float relativeE;
212     float frame_probs[2];
213     float alpha, alphaE, alphaE2;
214     float frame_loudness;
215     float bandwidth_mask;
216     int bandwidth=0;
217     float maxE = 0;
218     float noise_floor;
219     int remaining;
220     AnalysisInfo *info;
221
222     tonal->last_transition++;
223     alpha = 1.f/IMIN(20, 1+tonal->count);
224     alphaE = 1.f/IMIN(50, 1+tonal->count);
225     alphaE2 = 1.f/IMIN(6000, 1+tonal->count);
226
227     if (tonal->count<4)
228        tonal->music_prob = .5;
229     kfft = celt_mode->mdct.kfft[0];
230     if (tonal->count==0)
231        tonal->mem_fill = 240;
232     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
233     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
234     {
235        tonal->mem_fill += len;
236        /* Don't have enough to update the analysis */
237        return;
238     }
239     info = &tonal->info[tonal->write_pos++];
240     if (tonal->write_pos>=DETECT_SIZE)
241        tonal->write_pos-=DETECT_SIZE;
242
243     for (i=0;i<N2;i++)
244     {
245        float w = analysis_window[i];
246        in[i].r = MULT16_16(w, tonal->inmem[i]);
247        in[i].i = MULT16_16(w, tonal->inmem[N2+i]);
248        in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]);
249        in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]);
250     }
251     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
252     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
253     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
254     tonal->mem_fill = 240 + remaining;
255     opus_fft(kfft, in, out);
256
257     for (i=1;i<N2;i++)
258     {
259        float X1r, X2r, X1i, X2i;
260        float angle, d_angle, d2_angle;
261        float angle2, d_angle2, d2_angle2;
262        float mod1, mod2, avg_mod;
263        X1r = out[i].r+out[N-i].r;
264        X1i = out[i].i-out[N-i].i;
265        X2r = out[i].i+out[N-i].i;
266        X2i = out[N-i].r-out[i].r;
267
268        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
269        d_angle = angle - A[i];
270        d2_angle = d_angle - dA[i];
271
272        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
273        d_angle2 = angle2 - angle;
274        d2_angle2 = d_angle2 - d_angle;
275
276        mod1 = d2_angle - (float)floor(.5+d2_angle);
277        noisiness[i] = ABS16(mod1);
278        mod1 *= mod1;
279        mod1 *= mod1;
280
281        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
282        noisiness[i] += ABS16(mod2);
283        mod2 *= mod2;
284        mod2 *= mod2;
285
286        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
287        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
288
289        A[i] = angle2;
290        dA[i] = d_angle2;
291        d2A[i] = mod2;
292     }
293
294     frame_tonality = 0;
295     max_frame_tonality = 0;
296     /*tw_sum = 0;*/
297     info->activity = 0;
298     frame_noisiness = 0;
299     frame_stationarity = 0;
300     if (!tonal->count)
301     {
302        for (b=0;b<NB_TBANDS;b++)
303        {
304           tonal->lowE[b] = 1e10;
305           tonal->highE[b] = -1e10;
306        }
307     }
308     relativeE = 0;
309     frame_loudness = 0;
310     bandwidth_mask = 0;
311     for (b=0;b<NB_TBANDS;b++)
312     {
313        float E=0, tE=0, nE=0;
314        float L1, L2;
315        float stationarity;
316        for (i=tbands[b];i<tbands[b+1];i++)
317        {
318           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
319                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
320           E += binE;
321           tE += binE*tonality[i];
322           nE += binE*2.f*(.5f-noisiness[i]);
323        }
324        tonal->E[tonal->E_count][b] = E;
325        frame_noisiness += nE/(1e-15f+E);
326
327        frame_loudness += celt_sqrt(E+1e-10f);
328        logE[b] = (float)log(E+1e-10f);
329        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
330        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
331        if (tonal->highE[b] < tonal->lowE[b]+1.f)
332        {
333           tonal->highE[b]+=.5f;
334           tonal->lowE[b]-=.5f;
335        }
336        relativeE += (logE[b]-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
337
338        L1=L2=0;
339        for (i=0;i<NB_FRAMES;i++)
340        {
341           L1 += celt_sqrt(tonal->E[i][b]);
342           L2 += tonal->E[i][b];
343        }
344
345        stationarity = MIN16(0.99f,L1/celt_sqrt(EPSILON+NB_FRAMES*L2));
346        stationarity *= stationarity;
347        stationarity *= stationarity;
348        frame_stationarity += stationarity;
349        /*band_tonality[b] = tE/(1e-15+E)*/;
350        band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
351 #if 0
352        if (b>=NB_TONAL_SKIP_BANDS)
353        {
354           frame_tonality += tweight[b]*band_tonality[b];
355           tw_sum += tweight[b];
356        }
357 #else
358        frame_tonality += band_tonality[b];
359        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
360           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
361 #endif
362        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
363        slope += band_tonality[b]*(b-8);
364        /*printf("%f %f ", band_tonality[b], stationarity);*/
365        tonal->prev_band_tonality[b] = band_tonality[b];
366     }
367
368     bandwidth_mask = 0;
369     bandwidth = 0;
370     for (b=0;b<NB_TOT_BANDS;b++)
371        maxE = MAX32(maxE, tonal->meanE[b]);
372     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
373     noise_floor *= noise_floor;
374     for (b=0;b<NB_TOT_BANDS;b++)
375     {
376        float E=0;
377        int band_start, band_end;
378        /* Keep a margin of 300 Hz for aliasing */
379        band_start = extra_bands[b]+3;
380        band_end = extra_bands[b+1]+3;
381        for (i=band_start;i<band_end;i++)
382        {
383           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
384                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
385           E += binE;
386        }
387        E /= (band_end-band_start);
388        maxE = MAX32(maxE, E);
389        if (tonal->count>2)
390        {
391           tonal->meanE[b] = (1-alphaE2)*tonal->meanE[b] + alphaE2*E;
392        } else {
393           tonal->meanE[b] = E;
394        }
395        E = MAX32(E, tonal->meanE[b]);
396        /* 13 dB slope for spreading function */
397        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
398        /* Checks if band looks like stationary noise or if it's below a (trivial) masking curve */
399        if (E>.1*bandwidth_mask && E*1e10f > maxE && E > noise_floor)
400           bandwidth = b;
401     }
402     if (tonal->count<=2)
403        bandwidth = 20;
404     frame_loudness = 20*(float)log10(frame_loudness);
405     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
406     tonal->lowECount *= (1-alphaE);
407     if (frame_loudness < tonal->Etracker-30)
408        tonal->lowECount += alphaE;
409
410     for (i=0;i<8;i++)
411     {
412        float sum=0;
413        for (b=0;b<16;b++)
414           sum += dct_table[i*16+b]*logE[b];
415        BFCC[i] = sum;
416     }
417
418     frame_stationarity /= NB_TBANDS;
419     relativeE /= NB_TBANDS;
420     if (tonal->count<10)
421        relativeE = .5;
422     frame_noisiness /= NB_TBANDS;
423 #if 1
424     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
425 #else
426     info->activity = .5*(1+frame_noisiness-frame_stationarity);
427 #endif
428     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
429     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
430     tonal->prev_tonality = frame_tonality;
431
432     slope /= 8*8;
433     info->tonality_slope = slope;
434
435     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
436     tonal->count++;
437     info->tonality = frame_tonality;
438
439     for (i=0;i<4;i++)
440        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
441
442     for (i=0;i<4;i++)
443        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
444
445     for (i=0;i<4;i++)
446         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
447     for (i=0;i<3;i++)
448         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
449
450     if (tonal->count > 5)
451     {
452        for (i=0;i<9;i++)
453           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
454     }
455
456     for (i=0;i<8;i++)
457     {
458        tonal->mem[i+24] = tonal->mem[i+16];
459        tonal->mem[i+16] = tonal->mem[i+8];
460        tonal->mem[i+8] = tonal->mem[i];
461        tonal->mem[i] = BFCC[i];
462     }
463     for (i=0;i<9;i++)
464        features[11+i] = celt_sqrt(tonal->std[i]);
465     features[20] = info->tonality;
466     features[21] = info->activity;
467     features[22] = frame_stationarity;
468     features[23] = info->tonality_slope;
469     features[24] = tonal->lowECount;
470
471 #ifndef FIXED_POINT
472     mlp_process(&net, features, frame_probs);
473     frame_probs[0] = .5f*(frame_probs[0]+1);
474     /* Curve fitting between the MLP probability and the actual probability */
475     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
476     frame_probs[1] = .5*frame_probs[1]+.5;
477     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5;
478
479     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
480     {
481        float tau, beta;
482        float p0, p1;
483        /* One transition every 3 minutes */
484        tau = .00005f*frame_probs[1];
485        beta = .05f;
486        if (1) {
487           /* Adapt beta based on how "unexpected" the new prob is */
488           float p, q;
489           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
490           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
491           beta = .01+.05*ABS16(p-q)/(p*(1-q)+q*(1-p));
492        }
493        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
494        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
495        p0 *= (float)pow(1-frame_probs[0], beta);
496        p1 *= (float)pow(frame_probs[0], beta);
497        tonal->music_prob = p1/(p0+p1);
498        info->music_prob = tonal->music_prob;
499
500        float psum=1e-20;
501        float speech0 = (float)pow(1-frame_probs[0], beta);
502        float music0  = (float)pow(frame_probs[0], beta);
503        if (tonal->count==1)
504        {
505           tonal->pspeech[0]=.5;
506           tonal->pmusic [0]=.5;
507        }
508        float s0, m0;
509        s0 = tonal->pspeech[0] + tonal->pspeech[1];
510        m0 = tonal->pmusic [0] + tonal->pmusic [1];
511        tonal->pspeech[0] = s0*(1-tau)*speech0;
512        tonal->pmusic [0] = m0*(1-tau)*music0;
513        for (i=1;i<DETECT_SIZE-1;i++)
514        {
515           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
516           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
517        }
518        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
519        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
520
521        for (i=0;i<DETECT_SIZE;i++)
522           psum += tonal->pspeech[i] + tonal->pmusic[i];
523        psum = 1.f/psum;
524        for (i=0;i<DETECT_SIZE;i++)
525        {
526           tonal->pspeech[i] *= psum;
527           tonal->pmusic [i] *= psum;
528        }
529        psum = tonal->pmusic[0];
530        for (i=1;i<DETECT_SIZE;i++)
531           psum += tonal->pspeech[i];
532
533        /* Estimate our confidence in the speech/music decisions */
534        if (frame_probs[1]>.75)
535        {
536           if (tonal->music_prob>.9)
537           {
538              float adapt;
539              adapt = 1.f/(++tonal->music_confidence_count);
540              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
541              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
542           }
543           if (tonal->music_prob<.1)
544           {
545              float adapt;
546              adapt = 1.f/(++tonal->speech_confidence_count);
547              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
548              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
549           }
550        } else {
551           if (tonal->music_confidence_count==0)
552              tonal->music_confidence = .9;
553           if (tonal->speech_confidence_count==0)
554              tonal->speech_confidence = .1;
555        }
556        psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
557     }
558     if (tonal->last_music != (tonal->music_prob>.5f))
559        tonal->last_transition=0;
560     tonal->last_music = tonal->music_prob>.5f;
561 #else
562     info->music_prob = 0;
563 #endif
564     /*for (i=0;i<25;i++)
565        printf("%f ", features[i]);
566     printf("\n");*/
567
568     if (bandwidth<=12 || (bandwidth==13 && tonal->opus_bandwidth == OPUS_BANDWIDTH_NARROWBAND))
569        tonal->opus_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
570     else if (bandwidth<=14 || (bandwidth==15 && tonal->opus_bandwidth == OPUS_BANDWIDTH_MEDIUMBAND))
571        tonal->opus_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
572     else if (bandwidth<=16 || (bandwidth==17 && tonal->opus_bandwidth == OPUS_BANDWIDTH_WIDEBAND))
573        tonal->opus_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
574     else if (bandwidth<=18)
575        tonal->opus_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND;
576     else
577        tonal->opus_bandwidth = OPUS_BANDWIDTH_FULLBAND;
578
579     info->bandwidth = bandwidth;
580     info->opus_bandwidth = tonal->opus_bandwidth;
581     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
582     info->noisiness = frame_noisiness;
583     info->valid = 1;
584     if (info_out!=NULL)
585        OPUS_COPY(info_out, info, 1);
586 }
587
588 int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
589                         const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
590                         int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
591 {
592    int offset;
593    int pcm_len;
594
595    /* Avoid overflow/wrap-around of the analysis buffer */
596    frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
597
598    pcm_len = frame_size - analysis->analysis_offset;
599    offset = 0;
600    do {
601       tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
602       offset += 480;
603       pcm_len -= 480;
604    } while (pcm_len>0);
605    analysis->analysis_offset = frame_size;
606
607    if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
608    {
609       int LM = 3;
610       LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps,
611             analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
612       while ((Fs/400<<LM)>frame_size)
613          LM--;
614       frame_size = (Fs/400<<LM);
615    } else {
616       frame_size = frame_size_select(frame_size, variable_duration, Fs);
617    }
618    if (frame_size<0)
619       return -1;
620    analysis->analysis_offset -= frame_size;
621
622    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
623       they're in CELT-only mode. */
624    analysis_info->valid = 0;
625    tonality_get_info(analysis, analysis_info, frame_size);
626
627    return frame_size;
628 }