Add experimental support for ambisonic encoding
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    /* Should avoid underflow on the values we'll get */
119    if (ABS16(x)+ABS16(y)<1e-9f)
120    {
121       x*=1e12f;
122       y*=1e12f;
123    }
124    x2 = x*x;
125    y2 = y*y;
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       if (den!=0)
129          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130       else
131          return (y<0 ? -cE : cE);
132    }else{
133       float den = (x2 + cB*y2) * (x2 + cC*y2);
134       if (den!=0)
135          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136       else
137          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138    }
139 }
140
141 void tonality_analysis_init(TonalityAnalysisState *tonal)
142 {
143   /* Initialize reusable fields. */
144   tonal->arch = opus_select_arch();
145   /* Clear remaining fields. */
146   tonality_analysis_reset(tonal);
147 }
148
149 void tonality_analysis_reset(TonalityAnalysisState *tonal)
150 {
151   /* Clear non-reusable fields. */
152   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154 }
155
156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157 {
158    int pos;
159    int curr_lookahead;
160    float psum;
161    int i;
162
163    pos = tonal->read_pos;
164    curr_lookahead = tonal->write_pos-tonal->read_pos;
165    if (curr_lookahead<0)
166       curr_lookahead += DETECT_SIZE;
167
168    if (len > 480 && pos != tonal->write_pos)
169    {
170       pos++;
171       if (pos==DETECT_SIZE)
172          pos=0;
173    }
174    if (pos == tonal->write_pos)
175       pos--;
176    if (pos<0)
177       pos = DETECT_SIZE-1;
178    OPUS_COPY(info_out, &tonal->info[pos], 1);
179    tonal->read_subframe += len/120;
180    while (tonal->read_subframe>=4)
181    {
182       tonal->read_subframe -= 4;
183       tonal->read_pos++;
184    }
185    if (tonal->read_pos>=DETECT_SIZE)
186       tonal->read_pos-=DETECT_SIZE;
187
188    /* Compensate for the delay in the features themselves.
189       FIXME: Need a better estimate the 10 I just made up */
190    curr_lookahead = IMAX(curr_lookahead-10, 0);
191
192    psum=0;
193    /* Summing the probability of transition patterns that involve music at
194       time (DETECT_SIZE-curr_lookahead-1) */
195    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196       psum += tonal->pmusic[i];
197    for (;i<DETECT_SIZE;i++)
198       psum += tonal->pspeech[i];
199    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201
202    info_out->music_prob = psum;
203 }
204
205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
206 {
207     int i, b;
208     const kiss_fft_state *kfft;
209     VARDECL(kiss_fft_cpx, in);
210     VARDECL(kiss_fft_cpx, out);
211     int N = 480, N2=240;
212     float * OPUS_RESTRICT A = tonal->angle;
213     float * OPUS_RESTRICT dA = tonal->d_angle;
214     float * OPUS_RESTRICT d2A = tonal->d2_angle;
215     VARDECL(float, tonality);
216     VARDECL(float, noisiness);
217     float band_tonality[NB_TBANDS];
218     float logE[NB_TBANDS];
219     float BFCC[8];
220     float features[25];
221     float frame_tonality;
222     float max_frame_tonality;
223     /*float tw_sum=0;*/
224     float frame_noisiness;
225     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
226     float slope=0;
227     float frame_stationarity;
228     float relativeE;
229     float frame_probs[2];
230     float alpha, alphaE, alphaE2;
231     float frame_loudness;
232     float bandwidth_mask;
233     int bandwidth=0;
234     float maxE = 0;
235     float noise_floor;
236     int remaining;
237     AnalysisInfo *info;
238     SAVE_STACK;
239
240     tonal->last_transition++;
241     alpha = 1.f/IMIN(20, 1+tonal->count);
242     alphaE = 1.f/IMIN(50, 1+tonal->count);
243     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244
245     if (tonal->count<4)
246        tonal->music_prob = .5;
247     kfft = celt_mode->mdct.kfft[0];
248     if (tonal->count==0)
249        tonal->mem_fill = 240;
250     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252     {
253        tonal->mem_fill += len;
254        /* Don't have enough to update the analysis */
255        RESTORE_STACK;
256        return;
257     }
258     info = &tonal->info[tonal->write_pos++];
259     if (tonal->write_pos>=DETECT_SIZE)
260        tonal->write_pos-=DETECT_SIZE;
261
262     ALLOC(in, 480, kiss_fft_cpx);
263     ALLOC(out, 480, kiss_fft_cpx);
264     ALLOC(tonality, 240, float);
265     ALLOC(noisiness, 240, float);
266     for (i=0;i<N2;i++)
267     {
268        float w = analysis_window[i];
269        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273     }
274     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277     tonal->mem_fill = 240 + remaining;
278     opus_fft(kfft, in, out, tonal->arch);
279 #ifndef FIXED_POINT
280     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281     if (celt_isnan(out[0].r))
282     {
283        info->valid = 0;
284        RESTORE_STACK;
285        return;
286     }
287 #endif
288
289     for (i=1;i<N2;i++)
290     {
291        float X1r, X2r, X1i, X2i;
292        float angle, d_angle, d2_angle;
293        float angle2, d_angle2, d2_angle2;
294        float mod1, mod2, avg_mod;
295        X1r = (float)out[i].r+out[N-i].r;
296        X1i = (float)out[i].i-out[N-i].i;
297        X2r = (float)out[i].i+out[N-i].i;
298        X2i = (float)out[N-i].r-out[i].r;
299
300        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301        d_angle = angle - A[i];
302        d2_angle = d_angle - dA[i];
303
304        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305        d_angle2 = angle2 - angle;
306        d2_angle2 = d_angle2 - d_angle;
307
308        mod1 = d2_angle - (float)floor(.5+d2_angle);
309        noisiness[i] = ABS16(mod1);
310        mod1 *= mod1;
311        mod1 *= mod1;
312
313        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314        noisiness[i] += ABS16(mod2);
315        mod2 *= mod2;
316        mod2 *= mod2;
317
318        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
320
321        A[i] = angle2;
322        dA[i] = d_angle2;
323        d2A[i] = mod2;
324     }
325
326     frame_tonality = 0;
327     max_frame_tonality = 0;
328     /*tw_sum = 0;*/
329     info->activity = 0;
330     frame_noisiness = 0;
331     frame_stationarity = 0;
332     if (!tonal->count)
333     {
334        for (b=0;b<NB_TBANDS;b++)
335        {
336           tonal->lowE[b] = 1e10;
337           tonal->highE[b] = -1e10;
338        }
339     }
340     relativeE = 0;
341     frame_loudness = 0;
342     for (b=0;b<NB_TBANDS;b++)
343     {
344        float E=0, tE=0, nE=0;
345        float L1, L2;
346        float stationarity;
347        for (i=tbands[b];i<tbands[b+1];i++)
348        {
349           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351 #ifdef FIXED_POINT
352           /* FIXME: It's probably best to change the BFCC filter initial state instead */
353           binE *= 5.55e-17f;
354 #endif
355           E += binE;
356           tE += binE*tonality[i];
357           nE += binE*2.f*(.5f-noisiness[i]);
358        }
359 #ifndef FIXED_POINT
360        /* Check for extreme band energies that could cause NaNs later. */
361        if (!(E<1e9f) || celt_isnan(E))
362        {
363           info->valid = 0;
364           RESTORE_STACK;
365           return;
366        }
367 #endif
368
369        tonal->E[tonal->E_count][b] = E;
370        frame_noisiness += nE/(1e-15f+E);
371
372        frame_loudness += (float)sqrt(E+1e-10f);
373        logE[b] = (float)log(E+1e-10f);
374        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376        if (tonal->highE[b] < tonal->lowE[b]+1.f)
377        {
378           tonal->highE[b]+=.5f;
379           tonal->lowE[b]-=.5f;
380        }
381        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
382
383        L1=L2=0;
384        for (i=0;i<NB_FRAMES;i++)
385        {
386           L1 += (float)sqrt(tonal->E[i][b]);
387           L2 += tonal->E[i][b];
388        }
389
390        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391        stationarity *= stationarity;
392        stationarity *= stationarity;
393        frame_stationarity += stationarity;
394        /*band_tonality[b] = tE/(1e-15+E)*/;
395        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
396 #if 0
397        if (b>=NB_TONAL_SKIP_BANDS)
398        {
399           frame_tonality += tweight[b]*band_tonality[b];
400           tw_sum += tweight[b];
401        }
402 #else
403        frame_tonality += band_tonality[b];
404        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406 #endif
407        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408        slope += band_tonality[b]*(b-8);
409        /*printf("%f %f ", band_tonality[b], stationarity);*/
410        tonal->prev_band_tonality[b] = band_tonality[b];
411     }
412
413     bandwidth_mask = 0;
414     bandwidth = 0;
415     maxE = 0;
416     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417 #ifdef FIXED_POINT
418     noise_floor *= 1<<(15+SIG_SHIFT);
419 #endif
420     noise_floor *= noise_floor;
421     for (b=0;b<NB_TOT_BANDS;b++)
422     {
423        float E=0;
424        int band_start, band_end;
425        /* Keep a margin of 300 Hz for aliasing */
426        band_start = extra_bands[b];
427        band_end = extra_bands[b+1];
428        for (i=band_start;i<band_end;i++)
429        {
430           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432           E += binE;
433        }
434        maxE = MAX32(maxE, E);
435        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436        E = MAX32(E, tonal->meanE[b]);
437        /* Use a simple follower with 13 dB/Bark slope for spreading function */
438        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439        /* Consider the band "active" only if all these conditions are met:
440           1) less than 10 dB below the simple follower
441           2) less than 90 dB below the peak band (maximal masking possible considering
442              both the ATH and the loudness-dependent slope of the spreading function)
443           3) above the PCM quantization noise floor
444        */
445        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
446           bandwidth = b;
447     }
448     if (tonal->count<=2)
449        bandwidth = 20;
450     frame_loudness = 20*(float)log10(frame_loudness);
451     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452     tonal->lowECount *= (1-alphaE);
453     if (frame_loudness < tonal->Etracker-30)
454        tonal->lowECount += alphaE;
455
456     for (i=0;i<8;i++)
457     {
458        float sum=0;
459        for (b=0;b<16;b++)
460           sum += dct_table[i*16+b]*logE[b];
461        BFCC[i] = sum;
462     }
463
464     frame_stationarity /= NB_TBANDS;
465     relativeE /= NB_TBANDS;
466     if (tonal->count<10)
467        relativeE = .5;
468     frame_noisiness /= NB_TBANDS;
469 #if 1
470     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471 #else
472     info->activity = .5*(1+frame_noisiness-frame_stationarity);
473 #endif
474     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476     tonal->prev_tonality = frame_tonality;
477
478     slope /= 8*8;
479     info->tonality_slope = slope;
480
481     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482     tonal->count++;
483     info->tonality = frame_tonality;
484
485     for (i=0;i<4;i++)
486        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487
488     for (i=0;i<4;i++)
489        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490
491     for (i=0;i<4;i++)
492         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
493     for (i=0;i<3;i++)
494         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495
496     if (tonal->count > 5)
497     {
498        for (i=0;i<9;i++)
499           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
500     }
501
502     for (i=0;i<8;i++)
503     {
504        tonal->mem[i+24] = tonal->mem[i+16];
505        tonal->mem[i+16] = tonal->mem[i+8];
506        tonal->mem[i+8] = tonal->mem[i];
507        tonal->mem[i] = BFCC[i];
508     }
509     for (i=0;i<9;i++)
510        features[11+i] = (float)sqrt(tonal->std[i]);
511     features[20] = info->tonality;
512     features[21] = info->activity;
513     features[22] = frame_stationarity;
514     features[23] = info->tonality_slope;
515     features[24] = tonal->lowECount;
516
517 #ifndef DISABLE_FLOAT_API
518     mlp_process(&net, features, frame_probs);
519     frame_probs[0] = .5f*(frame_probs[0]+1);
520     /* Curve fitting between the MLP probability and the actual probability */
521     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522     /* Probability of active audio (as opposed to silence) */
523     frame_probs[1] = .5f*frame_probs[1]+.5f;
524     /* Consider that silence has a 50-50 probability. */
525     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
528     {
529        /* Probability of state transition */
530        float tau;
531        /* Represents independence of the MLP probabilities, where
532           beta=1 means fully independent. */
533        float beta;
534        /* Denormalized probability of speech (p0) and music (p1) after update */
535        float p0, p1;
536        /* Probabilities for "all speech" and "all music" */
537        float s0, m0;
538        /* Probability sum for renormalisation */
539        float psum;
540        /* Instantaneous probability of speech and music, with beta pre-applied. */
541        float speech0;
542        float music0;
543
544        /* One transition every 3 minutes of active audio */
545        tau = .00005f*frame_probs[1];
546        beta = .05f;
547        if (1) {
548           /* Adapt beta based on how "unexpected" the new prob is */
549           float p, q;
550           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
551           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
552           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
553        }
554        /* p0 and p1 are the probabilities of speech and music at this frame
555           using only information from previous frame and applying the
556           state transition model */
557        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
558        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
559        /* We apply the current probability with exponent beta to work around
560           the fact that the probability estimates aren't independent. */
561        p0 *= (float)pow(1-frame_probs[0], beta);
562        p1 *= (float)pow(frame_probs[0], beta);
563        /* Normalise the probabilities to get the Marokv probability of music. */
564        tonal->music_prob = p1/(p0+p1);
565        info->music_prob = tonal->music_prob;
566
567        /* This chunk of code deals with delayed decision. */
568        psum=1e-20f;
569        /* Instantaneous probability of speech and music, with beta pre-applied. */
570        speech0 = (float)pow(1-frame_probs[0], beta);
571        music0  = (float)pow(frame_probs[0], beta);
572        if (tonal->count==1)
573        {
574           tonal->pspeech[0]=.5;
575           tonal->pmusic [0]=.5;
576        }
577        /* Updated probability of having only speech (s0) or only music (m0),
578           before considering the new observation. */
579        s0 = tonal->pspeech[0] + tonal->pspeech[1];
580        m0 = tonal->pmusic [0] + tonal->pmusic [1];
581        /* Updates s0 and m0 with instantaneous probability. */
582        tonal->pspeech[0] = s0*(1-tau)*speech0;
583        tonal->pmusic [0] = m0*(1-tau)*music0;
584        /* Propagate the transition probabilities */
585        for (i=1;i<DETECT_SIZE-1;i++)
586        {
587           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
588           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
589        }
590        /* Probability that the latest frame is speech, when all the previous ones were music. */
591        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
592        /* Probability that the latest frame is music, when all the previous ones were speech. */
593        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
594
595        /* Renormalise probabilities to 1 */
596        for (i=0;i<DETECT_SIZE;i++)
597           psum += tonal->pspeech[i] + tonal->pmusic[i];
598        psum = 1.f/psum;
599        for (i=0;i<DETECT_SIZE;i++)
600        {
601           tonal->pspeech[i] *= psum;
602           tonal->pmusic [i] *= psum;
603        }
604        psum = tonal->pmusic[0];
605        for (i=1;i<DETECT_SIZE;i++)
606           psum += tonal->pspeech[i];
607
608        /* Estimate our confidence in the speech/music decisions */
609        if (frame_probs[1]>.75)
610        {
611           if (tonal->music_prob>.9)
612           {
613              float adapt;
614              adapt = 1.f/(++tonal->music_confidence_count);
615              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
616              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
617           }
618           if (tonal->music_prob<.1)
619           {
620              float adapt;
621              adapt = 1.f/(++tonal->speech_confidence_count);
622              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
623              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
624           }
625        } else {
626           if (tonal->music_confidence_count==0)
627              tonal->music_confidence = .9f;
628           if (tonal->speech_confidence_count==0)
629              tonal->speech_confidence = .1f;
630        }
631     }
632     if (tonal->last_music != (tonal->music_prob>.5f))
633        tonal->last_transition=0;
634     tonal->last_music = tonal->music_prob>.5f;
635 #else
636     info->music_prob = 0;
637 #endif
638     /*for (i=0;i<25;i++)
639        printf("%f ", features[i]);
640     printf("\n");*/
641
642     info->bandwidth = bandwidth;
643     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
644     info->noisiness = frame_noisiness;
645     info->valid = 1;
646     RESTORE_STACK;
647 }
648
649 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
650                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
651                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
652 {
653    int offset;
654    int pcm_len;
655
656    if (analysis_pcm != NULL)
657    {
658       /* Avoid overflow/wrap-around of the analysis buffer */
659       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
660
661       pcm_len = analysis_frame_size - analysis->analysis_offset;
662       offset = analysis->analysis_offset;
663       do {
664          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
665          offset += 480;
666          pcm_len -= 480;
667       } while (pcm_len>0);
668       analysis->analysis_offset = analysis_frame_size;
669
670       analysis->analysis_offset -= frame_size;
671    }
672
673    analysis_info->valid = 0;
674    tonality_get_info(analysis, analysis_info, frame_size);
675 }