armv7(float): Optimize encode usecase using NE10 library
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    /* Should avoid underflow on the values we'll get */
119    if (ABS16(x)+ABS16(y)<1e-9f)
120    {
121       x*=1e12f;
122       y*=1e12f;
123    }
124    x2 = x*x;
125    y2 = y*y;
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       if (den!=0)
129          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130       else
131          return (y<0 ? -cE : cE);
132    }else{
133       float den = (x2 + cB*y2) * (x2 + cC*y2);
134       if (den!=0)
135          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136       else
137          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138    }
139 }
140
141 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
142 {
143    int pos;
144    int curr_lookahead;
145    float psum;
146    int i;
147
148    pos = tonal->read_pos;
149    curr_lookahead = tonal->write_pos-tonal->read_pos;
150    if (curr_lookahead<0)
151       curr_lookahead += DETECT_SIZE;
152
153    if (len > 480 && pos != tonal->write_pos)
154    {
155       pos++;
156       if (pos==DETECT_SIZE)
157          pos=0;
158    }
159    if (pos == tonal->write_pos)
160       pos--;
161    if (pos<0)
162       pos = DETECT_SIZE-1;
163    OPUS_COPY(info_out, &tonal->info[pos], 1);
164    tonal->read_subframe += len/120;
165    while (tonal->read_subframe>=4)
166    {
167       tonal->read_subframe -= 4;
168       tonal->read_pos++;
169    }
170    if (tonal->read_pos>=DETECT_SIZE)
171       tonal->read_pos-=DETECT_SIZE;
172
173    /* Compensate for the delay in the features themselves.
174       FIXME: Need a better estimate the 10 I just made up */
175    curr_lookahead = IMAX(curr_lookahead-10, 0);
176
177    psum=0;
178    /* Summing the probability of transition patterns that involve music at
179       time (DETECT_SIZE-curr_lookahead-1) */
180    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
181       psum += tonal->pmusic[i];
182    for (;i<DETECT_SIZE;i++)
183       psum += tonal->pspeech[i];
184    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
185    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
186
187    info_out->music_prob = psum;
188 }
189
190 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix, int arch)
191 {
192     int i, b;
193     const kiss_fft_state *kfft;
194     VARDECL(kiss_fft_cpx, in);
195     VARDECL(kiss_fft_cpx, out);
196     int N = 480, N2=240;
197     float * OPUS_RESTRICT A = tonal->angle;
198     float * OPUS_RESTRICT dA = tonal->d_angle;
199     float * OPUS_RESTRICT d2A = tonal->d2_angle;
200     VARDECL(float, tonality);
201     VARDECL(float, noisiness);
202     float band_tonality[NB_TBANDS];
203     float logE[NB_TBANDS];
204     float BFCC[8];
205     float features[25];
206     float frame_tonality;
207     float max_frame_tonality;
208     /*float tw_sum=0;*/
209     float frame_noisiness;
210     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
211     float slope=0;
212     float frame_stationarity;
213     float relativeE;
214     float frame_probs[2];
215     float alpha, alphaE, alphaE2;
216     float frame_loudness;
217     float bandwidth_mask;
218     int bandwidth=0;
219     float maxE = 0;
220     float noise_floor;
221     int remaining;
222     AnalysisInfo *info;
223     SAVE_STACK;
224
225     tonal->last_transition++;
226     alpha = 1.f/IMIN(20, 1+tonal->count);
227     alphaE = 1.f/IMIN(50, 1+tonal->count);
228     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
229
230     if (tonal->count<4)
231        tonal->music_prob = .5;
232     kfft = celt_mode->mdct.kfft[0];
233     if (tonal->count==0)
234        tonal->mem_fill = 240;
235     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
236     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
237     {
238        tonal->mem_fill += len;
239        /* Don't have enough to update the analysis */
240        RESTORE_STACK;
241        return;
242     }
243     info = &tonal->info[tonal->write_pos++];
244     if (tonal->write_pos>=DETECT_SIZE)
245        tonal->write_pos-=DETECT_SIZE;
246
247     ALLOC(in, 480, kiss_fft_cpx);
248     ALLOC(out, 480, kiss_fft_cpx);
249     ALLOC(tonality, 240, float);
250     ALLOC(noisiness, 240, float);
251     for (i=0;i<N2;i++)
252     {
253        float w = analysis_window[i];
254        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
255        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
256        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
257        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
258     }
259     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
260     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
261     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
262     tonal->mem_fill = 240 + remaining;
263     opus_fft(kfft, in, out, arch);
264 #ifndef FIXED_POINT
265     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
266     if (celt_isnan(out[0].r))
267     {
268        info->valid = 0;
269        RESTORE_STACK;
270        return;
271     }
272 #endif
273
274     for (i=1;i<N2;i++)
275     {
276        float X1r, X2r, X1i, X2i;
277        float angle, d_angle, d2_angle;
278        float angle2, d_angle2, d2_angle2;
279        float mod1, mod2, avg_mod;
280        X1r = (float)out[i].r+out[N-i].r;
281        X1i = (float)out[i].i-out[N-i].i;
282        X2r = (float)out[i].i+out[N-i].i;
283        X2i = (float)out[N-i].r-out[i].r;
284
285        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
286        d_angle = angle - A[i];
287        d2_angle = d_angle - dA[i];
288
289        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
290        d_angle2 = angle2 - angle;
291        d2_angle2 = d_angle2 - d_angle;
292
293        mod1 = d2_angle - (float)floor(.5+d2_angle);
294        noisiness[i] = ABS16(mod1);
295        mod1 *= mod1;
296        mod1 *= mod1;
297
298        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
299        noisiness[i] += ABS16(mod2);
300        mod2 *= mod2;
301        mod2 *= mod2;
302
303        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
304        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
305
306        A[i] = angle2;
307        dA[i] = d_angle2;
308        d2A[i] = mod2;
309     }
310
311     frame_tonality = 0;
312     max_frame_tonality = 0;
313     /*tw_sum = 0;*/
314     info->activity = 0;
315     frame_noisiness = 0;
316     frame_stationarity = 0;
317     if (!tonal->count)
318     {
319        for (b=0;b<NB_TBANDS;b++)
320        {
321           tonal->lowE[b] = 1e10;
322           tonal->highE[b] = -1e10;
323        }
324     }
325     relativeE = 0;
326     frame_loudness = 0;
327     for (b=0;b<NB_TBANDS;b++)
328     {
329        float E=0, tE=0, nE=0;
330        float L1, L2;
331        float stationarity;
332        for (i=tbands[b];i<tbands[b+1];i++)
333        {
334           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
335                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
336 #ifdef FIXED_POINT
337           /* FIXME: It's probably best to change the BFCC filter initial state instead */
338           binE *= 5.55e-17f;
339 #endif
340           E += binE;
341           tE += binE*tonality[i];
342           nE += binE*2.f*(.5f-noisiness[i]);
343        }
344 #ifndef FIXED_POINT
345        /* Check for extreme band energies that could cause NaNs later. */
346        if (!(E<1e9f) || celt_isnan(E))
347        {
348           info->valid = 0;
349           RESTORE_STACK;
350           return;
351        }
352 #endif
353
354        tonal->E[tonal->E_count][b] = E;
355        frame_noisiness += nE/(1e-15f+E);
356
357        frame_loudness += (float)sqrt(E+1e-10f);
358        logE[b] = (float)log(E+1e-10f);
359        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
360        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
361        if (tonal->highE[b] < tonal->lowE[b]+1.f)
362        {
363           tonal->highE[b]+=.5f;
364           tonal->lowE[b]-=.5f;
365        }
366        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
367
368        L1=L2=0;
369        for (i=0;i<NB_FRAMES;i++)
370        {
371           L1 += (float)sqrt(tonal->E[i][b]);
372           L2 += tonal->E[i][b];
373        }
374
375        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
376        stationarity *= stationarity;
377        stationarity *= stationarity;
378        frame_stationarity += stationarity;
379        /*band_tonality[b] = tE/(1e-15+E)*/;
380        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
381 #if 0
382        if (b>=NB_TONAL_SKIP_BANDS)
383        {
384           frame_tonality += tweight[b]*band_tonality[b];
385           tw_sum += tweight[b];
386        }
387 #else
388        frame_tonality += band_tonality[b];
389        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
390           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
391 #endif
392        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
393        slope += band_tonality[b]*(b-8);
394        /*printf("%f %f ", band_tonality[b], stationarity);*/
395        tonal->prev_band_tonality[b] = band_tonality[b];
396     }
397
398     bandwidth_mask = 0;
399     bandwidth = 0;
400     maxE = 0;
401     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
402 #ifdef FIXED_POINT
403     noise_floor *= 1<<(15+SIG_SHIFT);
404 #endif
405     noise_floor *= noise_floor;
406     for (b=0;b<NB_TOT_BANDS;b++)
407     {
408        float E=0;
409        int band_start, band_end;
410        /* Keep a margin of 300 Hz for aliasing */
411        band_start = extra_bands[b];
412        band_end = extra_bands[b+1];
413        for (i=band_start;i<band_end;i++)
414        {
415           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
416                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
417           E += binE;
418        }
419        maxE = MAX32(maxE, E);
420        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
421        E = MAX32(E, tonal->meanE[b]);
422        /* Use a simple follower with 13 dB/Bark slope for spreading function */
423        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
424        /* Consider the band "active" only if all these conditions are met:
425           1) less than 10 dB below the simple follower
426           2) less than 90 dB below the peak band (maximal masking possible considering
427              both the ATH and the loudness-dependent slope of the spreading function)
428           3) above the PCM quantization noise floor
429        */
430        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
431           bandwidth = b;
432     }
433     if (tonal->count<=2)
434        bandwidth = 20;
435     frame_loudness = 20*(float)log10(frame_loudness);
436     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
437     tonal->lowECount *= (1-alphaE);
438     if (frame_loudness < tonal->Etracker-30)
439        tonal->lowECount += alphaE;
440
441     for (i=0;i<8;i++)
442     {
443        float sum=0;
444        for (b=0;b<16;b++)
445           sum += dct_table[i*16+b]*logE[b];
446        BFCC[i] = sum;
447     }
448
449     frame_stationarity /= NB_TBANDS;
450     relativeE /= NB_TBANDS;
451     if (tonal->count<10)
452        relativeE = .5;
453     frame_noisiness /= NB_TBANDS;
454 #if 1
455     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
456 #else
457     info->activity = .5*(1+frame_noisiness-frame_stationarity);
458 #endif
459     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
460     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
461     tonal->prev_tonality = frame_tonality;
462
463     slope /= 8*8;
464     info->tonality_slope = slope;
465
466     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
467     tonal->count++;
468     info->tonality = frame_tonality;
469
470     for (i=0;i<4;i++)
471        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
472
473     for (i=0;i<4;i++)
474        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
475
476     for (i=0;i<4;i++)
477         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
478     for (i=0;i<3;i++)
479         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
480
481     if (tonal->count > 5)
482     {
483        for (i=0;i<9;i++)
484           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
485     }
486
487     for (i=0;i<8;i++)
488     {
489        tonal->mem[i+24] = tonal->mem[i+16];
490        tonal->mem[i+16] = tonal->mem[i+8];
491        tonal->mem[i+8] = tonal->mem[i];
492        tonal->mem[i] = BFCC[i];
493     }
494     for (i=0;i<9;i++)
495        features[11+i] = (float)sqrt(tonal->std[i]);
496     features[20] = info->tonality;
497     features[21] = info->activity;
498     features[22] = frame_stationarity;
499     features[23] = info->tonality_slope;
500     features[24] = tonal->lowECount;
501
502 #ifndef DISABLE_FLOAT_API
503     mlp_process(&net, features, frame_probs);
504     frame_probs[0] = .5f*(frame_probs[0]+1);
505     /* Curve fitting between the MLP probability and the actual probability */
506     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
507     /* Probability of active audio (as opposed to silence) */
508     frame_probs[1] = .5f*frame_probs[1]+.5f;
509     /* Consider that silence has a 50-50 probability. */
510     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
511
512     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
513     {
514        /* Probability of state transition */
515        float tau;
516        /* Represents independence of the MLP probabilities, where
517           beta=1 means fully independent. */
518        float beta;
519        /* Denormalized probability of speech (p0) and music (p1) after update */
520        float p0, p1;
521        /* Probabilities for "all speech" and "all music" */
522        float s0, m0;
523        /* Probability sum for renormalisation */
524        float psum;
525        /* Instantaneous probability of speech and music, with beta pre-applied. */
526        float speech0;
527        float music0;
528
529        /* One transition every 3 minutes of active audio */
530        tau = .00005f*frame_probs[1];
531        beta = .05f;
532        if (1) {
533           /* Adapt beta based on how "unexpected" the new prob is */
534           float p, q;
535           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
536           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
537           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
538        }
539        /* p0 and p1 are the probabilities of speech and music at this frame
540           using only information from previous frame and applying the
541           state transition model */
542        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
543        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
544        /* We apply the current probability with exponent beta to work around
545           the fact that the probability estimates aren't independent. */
546        p0 *= (float)pow(1-frame_probs[0], beta);
547        p1 *= (float)pow(frame_probs[0], beta);
548        /* Normalise the probabilities to get the Marokv probability of music. */
549        tonal->music_prob = p1/(p0+p1);
550        info->music_prob = tonal->music_prob;
551
552        /* This chunk of code deals with delayed decision. */
553        psum=1e-20f;
554        /* Instantaneous probability of speech and music, with beta pre-applied. */
555        speech0 = (float)pow(1-frame_probs[0], beta);
556        music0  = (float)pow(frame_probs[0], beta);
557        if (tonal->count==1)
558        {
559           tonal->pspeech[0]=.5;
560           tonal->pmusic [0]=.5;
561        }
562        /* Updated probability of having only speech (s0) or only music (m0),
563           before considering the new observation. */
564        s0 = tonal->pspeech[0] + tonal->pspeech[1];
565        m0 = tonal->pmusic [0] + tonal->pmusic [1];
566        /* Updates s0 and m0 with instantaneous probability. */
567        tonal->pspeech[0] = s0*(1-tau)*speech0;
568        tonal->pmusic [0] = m0*(1-tau)*music0;
569        /* Propagate the transition probabilities */
570        for (i=1;i<DETECT_SIZE-1;i++)
571        {
572           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
573           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
574        }
575        /* Probability that the latest frame is speech, when all the previous ones were music. */
576        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
577        /* Probability that the latest frame is music, when all the previous ones were speech. */
578        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
579
580        /* Renormalise probabilities to 1 */
581        for (i=0;i<DETECT_SIZE;i++)
582           psum += tonal->pspeech[i] + tonal->pmusic[i];
583        psum = 1.f/psum;
584        for (i=0;i<DETECT_SIZE;i++)
585        {
586           tonal->pspeech[i] *= psum;
587           tonal->pmusic [i] *= psum;
588        }
589        psum = tonal->pmusic[0];
590        for (i=1;i<DETECT_SIZE;i++)
591           psum += tonal->pspeech[i];
592
593        /* Estimate our confidence in the speech/music decisions */
594        if (frame_probs[1]>.75)
595        {
596           if (tonal->music_prob>.9)
597           {
598              float adapt;
599              adapt = 1.f/(++tonal->music_confidence_count);
600              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
601              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
602           }
603           if (tonal->music_prob<.1)
604           {
605              float adapt;
606              adapt = 1.f/(++tonal->speech_confidence_count);
607              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
608              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
609           }
610        } else {
611           if (tonal->music_confidence_count==0)
612              tonal->music_confidence = .9f;
613           if (tonal->speech_confidence_count==0)
614              tonal->speech_confidence = .1f;
615        }
616     }
617     if (tonal->last_music != (tonal->music_prob>.5f))
618        tonal->last_transition=0;
619     tonal->last_music = tonal->music_prob>.5f;
620 #else
621     info->music_prob = 0;
622 #endif
623     /*for (i=0;i<25;i++)
624        printf("%f ", features[i]);
625     printf("\n");*/
626
627     info->bandwidth = bandwidth;
628     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
629     info->noisiness = frame_noisiness;
630     info->valid = 1;
631     RESTORE_STACK;
632 }
633
634 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
635                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
636                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info, int arch)
637 {
638    int offset;
639    int pcm_len;
640
641    if (analysis_pcm != NULL)
642    {
643       /* Avoid overflow/wrap-around of the analysis buffer */
644       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
645
646       pcm_len = analysis_frame_size - analysis->analysis_offset;
647       offset = analysis->analysis_offset;
648       do {
649          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix, arch);
650          offset += 480;
651          pcm_len -= 480;
652       } while (pcm_len>0);
653       analysis->analysis_offset = analysis_frame_size;
654
655       analysis->analysis_offset -= frame_size;
656    }
657
658    analysis_info->valid = 0;
659    tonality_get_info(analysis, analysis_info, frame_size);
660 }