Builds the analysis files more cleanly than #including C files
[opus.git] / src / analysis.c
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40
41 extern const MLP net;
42
43 #ifndef M_PI
44 #define M_PI 3.141592653
45 #endif
46
47 float dct_table[128] = {
48         0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
49         0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000,
50         0.351851, 0.338330, 0.311806, 0.273300, 0.224292, 0.166664, 0.102631, 0.034654,
51         -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
52         0.346760, 0.293969, 0.196424, 0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
53         -0.346760, -0.293969, -0.196424, -0.068975, 0.068975, 0.196424, 0.293969, 0.346760,
54         0.338330, 0.224292, 0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
55         0.102631, 0.273300, 0.351851, 0.311806, 0.166664, -0.034654, -0.224292, -0.338330,
56         0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
57         0.326641, 0.135299, -0.135299, -0.326641, -0.326641, -0.135299, 0.135299, 0.326641,
58         0.311806, 0.034654, -0.273300, -0.338330, -0.102631, 0.224292, 0.351851, 0.166664,
59         -0.166664, -0.351851, -0.224292, 0.102631, 0.338330, 0.273300, -0.034654, -0.311806,
60         0.293969, -0.068975, -0.346760, -0.196424, 0.196424, 0.346760, 0.068975, -0.293969,
61         -0.293969, 0.068975, 0.346760, 0.196424, -0.196424, -0.346760, -0.068975, 0.293969,
62         0.273300, -0.166664, -0.338330, 0.034654, 0.351851, 0.102631, -0.311806, -0.224292,
63         0.224292, 0.311806, -0.102631, -0.351851, -0.034654, 0.338330, 0.166664, -0.273300,
64 };
65
66 float analysis_window[240] = {
67       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
68       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
69       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
70       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
71       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
72       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
73       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
74       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
75       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
76       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
77       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
78       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
79       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
80       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
81       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
82       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
83       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
84       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
85       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
86       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
87       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
88       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
89       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
90       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
91       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
92       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
93       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
94       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
95       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
96       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
97 };
98
99 static const int tbands[NB_TBANDS+1] = {
100        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
101 };
102
103 static const float tweight[NB_TBANDS+1] = {
104       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
105 };
106
107 #define NB_TONAL_SKIP_BANDS 9
108
109 #define cA 0.43157974f
110 #define cB 0.67848403f
111 #define cC 0.08595542f
112 #define cE (M_PI/2)
113 static inline float fast_atan2f(float y, float x) {
114    float x2, y2;
115    /* Should avoid underflow on the values we'll get */
116    if (ABS16(x)+ABS16(y)<1e-9)
117    {
118       x*=1e12;
119       y*=1e12;
120    }
121    x2 = x*x;
122    y2 = y*y;
123    if(x2<y2){
124       float den = (y2 + cB*x2) * (y2 + cC*x2);
125       if (den!=0)
126          return -x*y*(y2 + cA*x2) / den + copysignf(cE,y);
127       else
128          return copysignf(cE,y);
129    }else{
130       float den = (x2 + cB*y2) * (x2 + cC*y2);
131       if (den!=0)
132          return  x*y*(x2 + cA*y2) / den + copysignf(cE,y) - copysignf(cE,x*y);
133       else
134          return copysignf(cE,y) - copysignf(cE,x*y);
135    }
136 }
137
138 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int C)
139 {
140     int i, b;
141     const CELTMode *mode;
142     const kiss_fft_state *kfft;
143     kiss_fft_cpx in[480], out[480];
144     int N = 480, N2=240;
145     float * restrict A = tonal->angle;
146     float * restrict dA = tonal->d_angle;
147     float * restrict d2A = tonal->d2_angle;
148     float tonality[240];
149     float noisiness[240];
150     float band_tonality[NB_TBANDS];
151     float logE[NB_TBANDS];
152     float BFCC[8];
153     float features[100];
154     float frame_tonality;
155     float max_frame_tonality;
156     float tw_sum=0;
157     float frame_noisiness;
158     const float pi4 = M_PI*M_PI*M_PI*M_PI;
159     float slope=0;
160     float frame_stationarity;
161     float relativeE;
162     float frame_prob;
163     float alpha, alphaE, alphaE2;
164     float frame_loudness;
165     float bandwidth_mask;
166     int bandwidth=0;
167     float bandE[NB_TBANDS];
168     celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
169
170     tonal->last_transition++;
171     alpha = 1.f/IMIN(20, 1+tonal->count);
172     alphaE = 1.f/IMIN(50, 1+tonal->count);
173     alphaE2 = 1.f/IMIN(6000, 1+tonal->count);
174
175     if (tonal->count<4)
176        tonal->music_prob = .5;
177     kfft = mode->mdct.kfft[0];
178     if (C==1)
179     {
180        for (i=0;i<N2;i++)
181        {
182           float w = analysis_window[i];
183           in[i].r = MULT16_16(w, x[i]);
184           in[i].i = MULT16_16(w, x[N-N2+i]);
185           in[N-i-1].r = MULT16_16(w, x[N-i-1]);
186           in[N-i-1].i = MULT16_16(w, x[2*N-N2-i-1]);
187        }
188     } else {
189        for (i=0;i<N2;i++)
190        {
191           float w = analysis_window[i];
192           in[i].r = MULT16_16(w, x[2*i]+x[2*i+1]);
193           in[i].i = MULT16_16(w, x[2*(N-N2+i)]+x[2*(N-N2+i)+1]);
194           in[N-i-1].r = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]);
195           in[N-i-1].i = MULT16_16(w, x[2*(2*N-N2-i-1)]+x[2*(2*N-N2-i-1)+1]);
196        }
197     }
198     opus_fft(kfft, in, out);
199
200     for (i=1;i<N2;i++)
201     {
202        float X1r, X2r, X1i, X2i;
203        float angle, d_angle, d2_angle;
204        float angle2, d_angle2, d2_angle2;
205        float mod1, mod2, avg_mod;
206        X1r = out[i].r+out[N-i].r;
207        X1i = out[i].i-out[N-i].i;
208        X2r = out[i].i+out[N-i].i;
209        X2i = out[N-i].r-out[i].r;
210
211        angle = (.5/M_PI)*fast_atan2f(X1i, X1r);
212        d_angle = angle - A[i];
213        d2_angle = d_angle - dA[i];
214
215        angle2 = (.5/M_PI)*fast_atan2f(X2i, X2r);
216        d_angle2 = angle2 - angle;
217        d2_angle2 = d_angle2 - d_angle;
218
219        mod1 = d2_angle - floor(.5+d2_angle);
220        noisiness[i] = fabs(mod1);
221        mod1 *= mod1;
222        mod1 *= mod1;
223
224        mod2 = d2_angle2 - floor(.5+d2_angle2);
225        noisiness[i] += fabs(mod2);
226        mod2 *= mod2;
227        mod2 *= mod2;
228
229        avg_mod = .25*(d2A[i]+2*mod1+mod2);
230        tonality[i] = 1./(1+40*16*pi4*avg_mod)-.015;
231
232        A[i] = angle2;
233        dA[i] = d_angle2;
234        d2A[i] = mod2;
235     }
236
237     frame_tonality = 0;
238     max_frame_tonality = 0;
239     tw_sum = 0;
240     info->activity = 0;
241     frame_noisiness = 0;
242     frame_stationarity = 0;
243     if (!tonal->count)
244     {
245        for (b=0;b<NB_TBANDS;b++)
246        {
247           tonal->lowE[b] = 1e10;
248           tonal->highE[b] = -1e10;
249        }
250     }
251     relativeE = 0;
252     info->boost_amount[0]=info->boost_amount[1]=0;
253     info->boost_band[0]=info->boost_band[1]=0;
254     frame_loudness = 0;
255     bandwidth_mask = 0;
256     for (b=0;b<NB_TBANDS;b++)
257     {
258        float E=0, tE=0, nE=0;
259        float L1, L2;
260        float stationarity;
261        for (i=tbands[b];i<tbands[b+1];i++)
262        {
263           float binE = out[i].r*out[i].r + out[N-i].r*out[N-i].r
264                      + out[i].i*out[i].i + out[N-i].i*out[N-i].i;
265           E += binE;
266           tE += binE*tonality[i];
267           nE += binE*2*(.5-noisiness[i]);
268        }
269        bandE[b] = E;
270        tonal->E[tonal->E_count][b] = E;
271        frame_noisiness += nE/(1e-15+E);
272
273        frame_loudness += sqrt(E+1e-10);
274        /* Add a reasonable noise floor */
275        tonal->meanE[b] = (1-alphaE2)*tonal->meanE[b] + alphaE2*E;
276        tonal->meanRE[b] = (1-alphaE2)*tonal->meanRE[b] + alphaE2*sqrt(E);
277        /* 13 dB slope for spreading function */
278        bandwidth_mask = MAX32(.05*bandwidth_mask, E);
279        /* Checks if band looks like stationary noise or if it's below a (trivial) masking curve */
280        if (tonal->meanRE[b]*tonal->meanRE[b] < tonal->meanE[b]*.95 && E>.1*bandwidth_mask)
281           bandwidth = b;
282        logE[b] = log(E+1e-10);
283        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01);
284        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1);
285        if (tonal->highE[b] < tonal->lowE[b]+1)
286        {
287           tonal->highE[b]+=.5;
288           tonal->lowE[b]-=.5;
289        }
290        relativeE += (logE[b]-tonal->lowE[b])/(EPSILON+tonal->highE[b]-tonal->lowE[b]);
291
292        L1=L2=0;
293        for (i=0;i<NB_FRAMES;i++)
294        {
295           L1 += sqrt(tonal->E[i][b]);
296           L2 += tonal->E[i][b];
297        }
298
299        stationarity = MIN16(0.99,L1/sqrt(EPSILON+NB_FRAMES*L2));
300        stationarity *= stationarity;
301        stationarity *= stationarity;
302        frame_stationarity += stationarity;
303        /*band_tonality[b] = tE/(1e-15+E)*/;
304        band_tonality[b] = MAX16(tE/(EPSILON+E), stationarity*tonal->prev_band_tonality[b]);
305 #if 0
306        if (b>=NB_TONAL_SKIP_BANDS)
307        {
308           frame_tonality += tweight[b]*band_tonality[b];
309           tw_sum += tweight[b];
310        }
311 #else
312        frame_tonality += band_tonality[b];
313        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
314           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
315 #endif
316        max_frame_tonality = MAX16(max_frame_tonality, (1+.03*(b-NB_TBANDS))*frame_tonality);
317        slope += band_tonality[b]*(b-8);
318        /*printf("%f %f ", band_tonality[b], stationarity);*/
319        if (band_tonality[b] > info->boost_amount[1] && b>=7 && b < NB_TBANDS-1)
320        {
321           if (band_tonality[b] > info->boost_amount[0])
322           {
323              info->boost_amount[1] = info->boost_amount[0];
324              info->boost_band[1] = info->boost_band[0];
325              info->boost_amount[0] = band_tonality[b];
326              info->boost_band[0] = b;
327           } else {
328              info->boost_amount[1] = band_tonality[b];
329              info->boost_band[1] = b;
330           }
331        }
332        tonal->prev_band_tonality[b] = band_tonality[b];
333     }
334
335     frame_loudness = 20*log10(frame_loudness);
336     tonal->Etracker = MAX32(tonal->Etracker-.03, frame_loudness);
337     tonal->lowECount *= (1-alphaE);
338     if (frame_loudness < tonal->Etracker-30)
339        tonal->lowECount += alphaE;
340
341     for (i=0;i<8;i++)
342     {
343        float sum=0;
344        for (b=0;b<16;b++)
345           sum += dct_table[i*16+b]*logE[b];
346        BFCC[i] = sum;
347     }
348
349     frame_stationarity /= NB_TBANDS;
350     relativeE /= NB_TBANDS;
351     if (tonal->count<10)
352        relativeE = .5;
353     frame_noisiness /= NB_TBANDS;
354 #if 1
355     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
356 #else
357     info->activity = .5*(1+frame_noisiness-frame_stationarity);
358 #endif
359     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
360     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8);
361     tonal->prev_tonality = frame_tonality;
362     info->boost_amount[0] -= frame_tonality+.2;
363     info->boost_amount[1] -= frame_tonality+.2;
364     if (band_tonality[info->boost_band[0]] < band_tonality[info->boost_band[0]+1]+.15
365         || band_tonality[info->boost_band[0]] < band_tonality[info->boost_band[0]-1]+.15)
366        info->boost_amount[0]=0;
367     if (band_tonality[info->boost_band[1]] < band_tonality[info->boost_band[1]+1]+.15
368         || band_tonality[info->boost_band[1]] < band_tonality[info->boost_band[1]-1]+.15)
369        info->boost_amount[1]=0;
370
371     slope /= 8*8;
372     info->tonality_slope = slope;
373
374     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
375     tonal->count++;
376     info->tonality = frame_tonality;
377
378     for (i=0;i<4;i++)
379        features[i] = -0.12299*(BFCC[i]+tonal->mem[i+24]) + 0.49195*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693*tonal->mem[i+8] - 1.4349*tonal->cmean[i];
380
381     for (i=0;i<4;i++)
382        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
383
384     for (i=0;i<4;i++)
385         features[4+i] = 0.63246*(BFCC[i]-tonal->mem[i+24]) + 0.31623*(tonal->mem[i]-tonal->mem[i+16]);
386     for (i=0;i<3;i++)
387         features[8+i] = 0.53452*(BFCC[i]+tonal->mem[i+24]) - 0.26726*(tonal->mem[i]+tonal->mem[i+16]) -0.53452*tonal->mem[i+8];
388
389     if (tonal->count > 5)
390     {
391        for (i=0;i<9;i++)
392           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
393     }
394
395     for (i=0;i<8;i++)
396     {
397        tonal->mem[i+24] = tonal->mem[i+16];
398        tonal->mem[i+16] = tonal->mem[i+8];
399        tonal->mem[i+8] = tonal->mem[i];
400        tonal->mem[i] = BFCC[i];
401     }
402     for (i=0;i<9;i++)
403        features[11+i] = sqrt(tonal->std[i]);
404     features[20] = info->tonality;
405     features[21] = info->activity;
406     features[22] = frame_stationarity;
407     features[23] = info->tonality_slope;
408     features[24] = tonal->lowECount;
409
410 #ifndef FIXED_POINT
411     mlp_process(&net, features, &frame_prob);
412     /* Adds a "probability dead zone", with a cap on certainty */
413     frame_prob = .90*frame_prob*frame_prob*frame_prob;
414
415     frame_prob = .5*(frame_prob+1);
416
417     /*printf("%f\n", frame_prob);*/
418     {
419        float tau, beta;
420        float p0, p1;
421        float max_certainty;
422        /* One transition every 3 minutes */
423        tau = .00005;
424        beta = .1;
425        max_certainty = 1.f/(10+1*tonal->last_transition);
426        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
427        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
428        p0 *= pow(1-frame_prob, beta);
429        p1 *= pow(frame_prob, beta);
430        tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
431        info->music_prob = tonal->music_prob;
432        /*printf("%f %f\n", frame_prob, info->music_prob);*/
433     }
434     if (tonal->last_music != (tonal->music_prob>.5))
435        tonal->last_transition=0;
436     tonal->last_music = tonal->music_prob>.5;
437 #else
438     info->music_prob = 0;
439 #endif
440     /*for (i=0;i<25;i++)
441        printf("%f ", features[i]);
442     printf("\n");*/
443
444     /* FIXME: Can't detect SWB for now because the last band ends at 12 kHz */
445     if (bandwidth == NB_TBANDS-1 || tonal->count<100)
446     {
447        tonal->opus_bandwidth = OPUS_BANDWIDTH_FULLBAND;
448     } else {
449        int close_enough = 0;
450        if (bandE[bandwidth-1] < 3000*bandE[NB_TBANDS-1] && bandwidth < NB_TBANDS-1)
451           close_enough=1;
452        if (bandwidth<=11 || (bandwidth==12 && close_enough))
453           tonal->opus_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
454        else if (bandwidth<=13)
455           tonal->opus_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
456        else if (bandwidth<=15 || (bandwidth==16 && close_enough))
457           tonal->opus_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
458     }
459     info->noisiness = frame_noisiness;
460     info->valid = 1;
461 }