1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "quant_bands.h"
40 #include "stack_alloc.h"
43 #define M_PI 3.141592653
46 static const float dct_table[128] = {
47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 static const float analysis_window[240] = {
66 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 static const int tbands[NB_TBANDS+1] = {
99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 /*static const float tweight[NB_TBANDS+1] = {
107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 #define NB_TONAL_SKIP_BANDS 9
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
116 static OPUS_INLINE float fast_atan2f(float y, float x) {
118 /* Should avoid underflow on the values we'll get */
119 if (ABS16(x)+ABS16(y)<1e-9f)
127 float den = (y2 + cB*x2) * (y2 + cC*x2);
129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
131 return (y<0 ? -cE : cE);
133 float den = (x2 + cB*y2) * (x2 + cC*y2);
135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
141 void tonality_analysis_init(TonalityAnalysisState *tonal)
143 /* Initialize reusable fields. */
144 tonal->arch = opus_select_arch();
145 /* Clear remaining fields. */
146 tonality_analysis_reset(tonal);
149 void tonality_analysis_reset(TonalityAnalysisState *tonal)
151 /* Clear non-reusable fields. */
152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
163 pos = tonal->read_pos;
164 curr_lookahead = tonal->write_pos-tonal->read_pos;
165 if (curr_lookahead<0)
166 curr_lookahead += DETECT_SIZE;
168 if (len > 480 && pos != tonal->write_pos)
171 if (pos==DETECT_SIZE)
174 if (pos == tonal->write_pos)
178 OPUS_COPY(info_out, &tonal->info[pos], 1);
179 tonal->read_subframe += len/120;
180 while (tonal->read_subframe>=4)
182 tonal->read_subframe -= 4;
185 if (tonal->read_pos>=DETECT_SIZE)
186 tonal->read_pos-=DETECT_SIZE;
188 /* Compensate for the delay in the features themselves.
189 FIXME: Need a better estimate the 10 I just made up */
190 curr_lookahead = IMAX(curr_lookahead-10, 0);
193 /* Summing the probability of transition patterns that involve music at
194 time (DETECT_SIZE-curr_lookahead-1) */
195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196 psum += tonal->pmusic[i];
197 for (;i<DETECT_SIZE;i++)
198 psum += tonal->pspeech[i];
199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
202 info_out->music_prob = psum;
205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
208 const kiss_fft_state *kfft;
209 VARDECL(kiss_fft_cpx, in);
210 VARDECL(kiss_fft_cpx, out);
212 float * OPUS_RESTRICT A = tonal->angle;
213 float * OPUS_RESTRICT dA = tonal->d_angle;
214 float * OPUS_RESTRICT d2A = tonal->d2_angle;
215 VARDECL(float, tonality);
216 VARDECL(float, noisiness);
217 float band_tonality[NB_TBANDS];
218 float logE[NB_TBANDS];
221 float frame_tonality;
222 float max_frame_tonality;
224 float frame_noisiness;
225 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
227 float frame_stationarity;
229 float frame_probs[2];
230 float alpha, alphaE, alphaE2;
231 float frame_loudness;
232 float bandwidth_mask;
240 tonal->last_transition++;
241 alpha = 1.f/IMIN(20, 1+tonal->count);
242 alphaE = 1.f/IMIN(50, 1+tonal->count);
243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
246 tonal->music_prob = .5;
247 kfft = celt_mode->mdct.kfft[0];
249 tonal->mem_fill = 240;
250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
253 tonal->mem_fill += len;
254 /* Don't have enough to update the analysis */
258 info = &tonal->info[tonal->write_pos++];
259 if (tonal->write_pos>=DETECT_SIZE)
260 tonal->write_pos-=DETECT_SIZE;
262 ALLOC(in, 480, kiss_fft_cpx);
263 ALLOC(out, 480, kiss_fft_cpx);
264 ALLOC(tonality, 240, float);
265 ALLOC(noisiness, 240, float);
268 float w = analysis_window[i];
269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277 tonal->mem_fill = 240 + remaining;
278 opus_fft(kfft, in, out, tonal->arch);
280 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281 if (celt_isnan(out[0].r))
291 float X1r, X2r, X1i, X2i;
292 float angle, d_angle, d2_angle;
293 float angle2, d_angle2, d2_angle2;
294 float mod1, mod2, avg_mod;
295 X1r = (float)out[i].r+out[N-i].r;
296 X1i = (float)out[i].i-out[N-i].i;
297 X2r = (float)out[i].i+out[N-i].i;
298 X2i = (float)out[N-i].r-out[i].r;
300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301 d_angle = angle - A[i];
302 d2_angle = d_angle - dA[i];
304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305 d_angle2 = angle2 - angle;
306 d2_angle2 = d_angle2 - d_angle;
308 mod1 = d2_angle - (float)floor(.5+d2_angle);
309 noisiness[i] = ABS16(mod1);
313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314 noisiness[i] += ABS16(mod2);
318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
327 max_frame_tonality = 0;
331 frame_stationarity = 0;
334 for (b=0;b<NB_TBANDS;b++)
336 tonal->lowE[b] = 1e10;
337 tonal->highE[b] = -1e10;
342 for (b=0;b<NB_TBANDS;b++)
344 float E=0, tE=0, nE=0;
347 for (i=tbands[b];i<tbands[b+1];i++)
349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
352 /* FIXME: It's probably best to change the BFCC filter initial state instead */
356 tE += binE*tonality[i];
357 nE += binE*2.f*(.5f-noisiness[i]);
360 /* Check for extreme band energies that could cause NaNs later. */
361 if (!(E<1e9f) || celt_isnan(E))
369 tonal->E[tonal->E_count][b] = E;
370 frame_noisiness += nE/(1e-15f+E);
372 frame_loudness += (float)sqrt(E+1e-10f);
373 logE[b] = (float)log(E+1e-10f);
374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376 if (tonal->highE[b] < tonal->lowE[b]+1.f)
378 tonal->highE[b]+=.5f;
381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
384 for (i=0;i<NB_FRAMES;i++)
386 L1 += (float)sqrt(tonal->E[i][b]);
387 L2 += tonal->E[i][b];
390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391 stationarity *= stationarity;
392 stationarity *= stationarity;
393 frame_stationarity += stationarity;
394 /*band_tonality[b] = tE/(1e-15+E)*/;
395 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
397 if (b>=NB_TONAL_SKIP_BANDS)
399 frame_tonality += tweight[b]*band_tonality[b];
400 tw_sum += tweight[b];
403 frame_tonality += band_tonality[b];
404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408 slope += band_tonality[b]*(b-8);
409 /*printf("%f %f ", band_tonality[b], stationarity);*/
410 tonal->prev_band_tonality[b] = band_tonality[b];
416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
418 noise_floor *= 1<<(15+SIG_SHIFT);
420 noise_floor *= noise_floor;
421 for (b=0;b<NB_TOT_BANDS;b++)
424 int band_start, band_end;
425 /* Keep a margin of 300 Hz for aliasing */
426 band_start = extra_bands[b];
427 band_end = extra_bands[b+1];
428 for (i=band_start;i<band_end;i++)
430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
434 maxE = MAX32(maxE, E);
435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436 E = MAX32(E, tonal->meanE[b]);
437 /* Use a simple follower with 13 dB/Bark slope for spreading function */
438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439 /* Consider the band "active" only if all these conditions are met:
440 1) less than 10 dB below the simple follower
441 2) less than 90 dB below the peak band (maximal masking possible considering
442 both the ATH and the loudness-dependent slope of the spreading function)
443 3) above the PCM quantization noise floor
445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
450 frame_loudness = 20*(float)log10(frame_loudness);
451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452 tonal->lowECount *= (1-alphaE);
453 if (frame_loudness < tonal->Etracker-30)
454 tonal->lowECount += alphaE;
460 sum += dct_table[i*16+b]*logE[b];
464 frame_stationarity /= NB_TBANDS;
465 relativeE /= NB_TBANDS;
468 frame_noisiness /= NB_TBANDS;
470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
472 info->activity = .5*(1+frame_noisiness-frame_stationarity);
474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476 tonal->prev_tonality = frame_tonality;
479 info->tonality_slope = slope;
481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
483 info->tonality = frame_tonality;
486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
496 if (tonal->count > 5)
499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
504 tonal->mem[i+24] = tonal->mem[i+16];
505 tonal->mem[i+16] = tonal->mem[i+8];
506 tonal->mem[i+8] = tonal->mem[i];
507 tonal->mem[i] = BFCC[i];
510 features[11+i] = (float)sqrt(tonal->std[i]);
511 features[20] = info->tonality;
512 features[21] = info->activity;
513 features[22] = frame_stationarity;
514 features[23] = info->tonality_slope;
515 features[24] = tonal->lowECount;
517 #ifndef DISABLE_FLOAT_API
518 mlp_process(&net, features, frame_probs);
519 frame_probs[0] = .5f*(frame_probs[0]+1);
520 /* Curve fitting between the MLP probability and the actual probability */
521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522 /* Probability of active audio (as opposed to silence) */
523 frame_probs[1] = .5f*frame_probs[1]+.5f;
524 /* Consider that silence has a 50-50 probability. */
525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
529 /* Probability of state transition */
531 /* Represents independence of the MLP probabilities, where
532 beta=1 means fully independent. */
534 /* Denormalized probability of speech (p0) and music (p1) after update */
536 /* Probabilities for "all speech" and "all music" */
538 /* Probability sum for renormalisation */
540 /* Instantaneous probability of speech and music, with beta pre-applied. */
544 /* One transition every 3 minutes of active audio */
545 tau = .00005f*frame_probs[1];
548 /* Adapt beta based on how "unexpected" the new prob is */
550 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
551 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
552 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
554 /* p0 and p1 are the probabilities of speech and music at this frame
555 using only information from previous frame and applying the
556 state transition model */
557 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
558 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
559 /* We apply the current probability with exponent beta to work around
560 the fact that the probability estimates aren't independent. */
561 p0 *= (float)pow(1-frame_probs[0], beta);
562 p1 *= (float)pow(frame_probs[0], beta);
563 /* Normalise the probabilities to get the Marokv probability of music. */
564 tonal->music_prob = p1/(p0+p1);
565 info->music_prob = tonal->music_prob;
567 /* This chunk of code deals with delayed decision. */
569 /* Instantaneous probability of speech and music, with beta pre-applied. */
570 speech0 = (float)pow(1-frame_probs[0], beta);
571 music0 = (float)pow(frame_probs[0], beta);
574 tonal->pspeech[0]=.5;
575 tonal->pmusic [0]=.5;
577 /* Updated probability of having only speech (s0) or only music (m0),
578 before considering the new observation. */
579 s0 = tonal->pspeech[0] + tonal->pspeech[1];
580 m0 = tonal->pmusic [0] + tonal->pmusic [1];
581 /* Updates s0 and m0 with instantaneous probability. */
582 tonal->pspeech[0] = s0*(1-tau)*speech0;
583 tonal->pmusic [0] = m0*(1-tau)*music0;
584 /* Propagate the transition probabilities */
585 for (i=1;i<DETECT_SIZE-1;i++)
587 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
588 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
590 /* Probability that the latest frame is speech, when all the previous ones were music. */
591 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
592 /* Probability that the latest frame is music, when all the previous ones were speech. */
593 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
595 /* Renormalise probabilities to 1 */
596 for (i=0;i<DETECT_SIZE;i++)
597 psum += tonal->pspeech[i] + tonal->pmusic[i];
599 for (i=0;i<DETECT_SIZE;i++)
601 tonal->pspeech[i] *= psum;
602 tonal->pmusic [i] *= psum;
604 psum = tonal->pmusic[0];
605 for (i=1;i<DETECT_SIZE;i++)
606 psum += tonal->pspeech[i];
608 /* Estimate our confidence in the speech/music decisions */
609 if (frame_probs[1]>.75)
611 if (tonal->music_prob>.9)
614 adapt = 1.f/(++tonal->music_confidence_count);
615 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
616 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
618 if (tonal->music_prob<.1)
621 adapt = 1.f/(++tonal->speech_confidence_count);
622 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
623 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
626 if (tonal->music_confidence_count==0)
627 tonal->music_confidence = .9f;
628 if (tonal->speech_confidence_count==0)
629 tonal->speech_confidence = .1f;
632 if (tonal->last_music != (tonal->music_prob>.5f))
633 tonal->last_transition=0;
634 tonal->last_music = tonal->music_prob>.5f;
636 info->music_prob = 0;
639 printf("%f ", features[i]);
642 info->bandwidth = bandwidth;
643 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
644 info->noisiness = frame_noisiness;
649 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
650 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
651 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
656 if (analysis_pcm != NULL)
658 /* Avoid overflow/wrap-around of the analysis buffer */
659 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
661 pcm_len = analysis_frame_size - analysis->analysis_offset;
662 offset = analysis->analysis_offset;
664 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
668 analysis->analysis_offset = analysis_frame_size;
670 analysis->analysis_offset -= frame_size;
673 analysis_info->valid = 0;
674 tonality_get_info(analysis, analysis_info, frame_size);