1 /* Copyright (C) 2003 Epic Games
2 Written by Jean-Marc Valin
5 Preprocessor with denoising based on the algorithm by Ephraim and Malah
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
11 1. Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 3. The name of the author may not be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
35 #include "speex_preprocess.h"
39 #define max(a,b) ((a) > (b) ? (a) : (b))
40 #define min(a,b) ((a) < (b) ? (a) : (b))
43 #define M_PI 3.14159263
46 #define SQRT_M_PI_2 0.88623
47 #define LOUDNESS_EXP 2.5
54 #define LOG_MIN_MAX_1 0.86859
56 static void conj_window(float *w, int len)
61 float x=4*((float)i)/len;
77 w[i]=(.5-.5*cos(x))*(.5-.5*cos(x));
84 /* This function approximates the gain function
85 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
86 which multiplied by xi/(1+xi) is the optimal gain
87 in the loudness domain ( sqrt[amplitude] )
89 static float hypergeom_gain(float x)
93 static float table[21] = {
94 0.82157, 1.02017, 1.20461, 1.37534, 1.53363, 1.68092, 1.81865,
95 1.94811, 2.07038, 2.18638, 2.29688, 2.40255, 2.50391, 2.60144,
96 2.69551, 2.78647, 2.87458, 2.96015, 3.04333, 3.12431, 3.20326};
105 return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001);
108 SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
113 SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
114 st->frame_size = frame_size;
116 /* Round ps_size down to the nearest power of two */
119 st->ps_size = st->frame_size;
122 if (st->ps_size & ~i)
132 if (st->ps_size < 3*st->frame_size/4)
133 st->ps_size = st->ps_size * 3 / 2;
135 st->ps_size = st->frame_size;
139 N3 = 2*N - st->frame_size;
140 N4 = st->frame_size - N3;
142 st->sampling_rate = sampling_rate;
143 st->denoise_enabled = 1;
145 st->agc_level = 8000;
148 st->frame = (float*)speex_alloc(2*N*sizeof(float));
149 st->ps = (float*)speex_alloc(N*sizeof(float));
150 st->gain2 = (float*)speex_alloc(N*sizeof(float));
151 st->window = (float*)speex_alloc(2*N*sizeof(float));
152 st->noise = (float*)speex_alloc(N*sizeof(float));
153 st->old_ps = (float*)speex_alloc(N*sizeof(float));
154 st->gain = (float*)speex_alloc(N*sizeof(float));
155 st->prior = (float*)speex_alloc(N*sizeof(float));
156 st->post = (float*)speex_alloc(N*sizeof(float));
157 st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
158 st->inbuf = (float*)speex_alloc(N3*sizeof(float));
159 st->outbuf = (float*)speex_alloc(N3*sizeof(float));
160 st->echo_noise = (float*)speex_alloc(N*sizeof(float));
162 st->S = (float*)speex_alloc(N*sizeof(float));
163 st->Smin = (float*)speex_alloc(N*sizeof(float));
164 st->Stmp = (float*)speex_alloc(N*sizeof(float));
165 st->update_prob = (float*)speex_alloc(N*sizeof(float));
167 st->zeta = (float*)speex_alloc(N*sizeof(float));
171 st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
172 st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
173 st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
174 st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
175 st->noise_bandsN = st->speech_bandsN = 1;
177 conj_window(st->window, 2*N3);
178 for (i=2*N3;i<2*st->ps_size;i++)
183 for (i=N3-1;i>=0;i--)
185 st->window[i+N3+N4]=st->window[i+N3];
206 float ff=((float)i)*.5*sampling_rate/((float)N);
207 st->loudness_weight[i] = .35-.35*ff/16000+.73*exp(-.5*(ff-3800)*(ff-3800)/9e5);
208 if (st->loudness_weight[i]<.01)
209 st->loudness_weight[i]=.01;
210 st->loudness_weight[i] *= st->loudness_weight[i];
214 st->last_speech = 1000;
215 st->loudness = pow(6000,LOUDNESS_EXP);
216 st->loudness2 = 6000;
217 st->nb_loudness_adapt = 0;
219 st->fft_lookup = speex_alloc(sizeof(struct drft_lookup));
220 drft_init(st->fft_lookup,2*N);
228 void speex_preprocess_state_destroy(SpeexPreprocessState *st)
230 speex_free(st->frame);
232 speex_free(st->gain2);
233 speex_free(st->window);
234 speex_free(st->noise);
235 speex_free(st->old_ps);
236 speex_free(st->gain);
237 speex_free(st->prior);
238 speex_free(st->post);
239 speex_free(st->loudness_weight);
240 speex_free(st->echo_noise);
243 speex_free(st->Smin);
244 speex_free(st->Stmp);
245 speex_free(st->update_prob);
247 speex_free(st->noise_bands);
248 speex_free(st->noise_bands2);
249 speex_free(st->speech_bands);
250 speex_free(st->speech_bands2);
252 speex_free(st->inbuf);
253 speex_free(st->outbuf);
255 drft_clear(st->fft_lookup);
256 speex_free(st->fft_lookup);
261 static void update_noise(SpeexPreprocessState *st, float *ps, float *echo)
266 beta=1.0/st->nb_adapt;
272 for (i=0;i<st->ps_size;i++)
273 st->noise[i] = (1-beta)*st->noise[i] + beta*ps[i];
275 for (i=0;i<st->ps_size;i++)
276 st->noise[i] = (1-beta)*st->noise[i] + beta*max(0,ps[i]-echo[i]);
278 for (i=0;i<st->ps_size;i++)
284 static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post)
290 /* FIXME: Clean this up a bit */
292 float bands[NB_BANDS];
295 float tot_loudness=0;
296 float x = sqrt(mean_post);
300 tot_loudness += scale*st->ps[i] * st->loudness_weight[i];
303 for (i=0;i<NB_BANDS;i++)
306 for (j=i*N/NB_BANDS;j<(i+1)*N/NB_BANDS;j++)
310 bands[i]=log(bands[i]);
313 /*p1 = .0005+.6*exp(-.5*(x-.4)*(x-.4)*11)+.1*exp(-1.2*x);
315 p0=.1*exp(2*(x-1.5));
317 p0=.02+.1*exp(-.2*(x-1.5));
320 p0=1/(1+exp(3*(1.5-x)));
323 /*fprintf (stderr, "%f %f ", p0, p1);*/
324 /*p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
325 p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
327 st->speech_prob = p0/(p1+p0);
330 if (st->noise_bandsN < 50 || st->speech_bandsN < 50)
334 float adapt = 1./st->speech_bandsN++;
337 for (i=0;i<NB_BANDS;i++)
339 st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
340 /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
341 st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
344 float adapt = 1./st->noise_bandsN++;
347 for (i=0;i<NB_BANDS;i++)
349 st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
350 /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
351 st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
356 for (i=0;i<NB_BANDS;i++)
358 float noise_var, speech_var;
359 float noise_mean, speech_mean;
360 float tmp1, tmp2, pr;
362 /*noise_var = 1.01*st->noise_bands2[i] - st->noise_bands[i]*st->noise_bands[i];
363 speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/
364 noise_var = st->noise_bands2[i];
365 speech_var = st->speech_bands2[i];
371 /*speech_var = sqrt(speech_var*noise_var);
372 noise_var = speech_var;*/
373 if (speech_var < .05*speech_var)
374 noise_var = .05*speech_var;
375 if (speech_var < .05*noise_var)
376 speech_var = .05*noise_var;
378 if (bands[i] < st->noise_bands[i])
379 speech_var = noise_var;
380 if (bands[i] > st->speech_bands[i])
381 noise_var = speech_var;
383 speech_mean = st->speech_bands[i];
384 noise_mean = st->noise_bands[i];
385 if (noise_mean < speech_mean - 5)
386 noise_mean = speech_mean - 5;
388 tmp1 = exp(-.5*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2*M_PI*speech_var);
389 tmp2 = exp(-.5*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2*M_PI*noise_var);
390 /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/
391 /*fprintf (stderr, "%f ", (float)(bands[i]));*/
392 pr = tmp1/(1e-25+tmp1+tmp2);
393 /*if (bands[i] < st->noise_bands[i])
395 if (bands[i] > st->speech_bands[i] && pr < .995)
401 /*fprintf (stderr, "%f ", pr);*/
412 if (st->last_speech>20)
414 float tmp = sqrt(tot_loudness)/st->loudness2;
415 tmp = 1-exp(-10*tmp);
421 if (sqrt(tot_loudness) < .6*st->loudness2 && p0>15*p1)
423 if (sqrt(tot_loudness) < .45*st->loudness2 && p0>7*p1)
425 if (sqrt(tot_loudness) < .3*st->loudness2 && p0>3*p1)
427 if (sqrt(tot_loudness) < .15*st->loudness2 && p0>p1)
429 /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/
432 p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
433 p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
435 st->speech_prob = p0/(1e-25+p1+p0);
436 /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/
438 if (st->speech_prob>.35 || (st->last_speech < 20 && st->speech_prob>.1))
444 if (st->last_speech<20)
448 if (st->noise_bandsN > 50 && st->speech_bandsN > 50)
452 float adapt = 1./st->speech_bandsN++;
455 for (i=0;i<NB_BANDS;i++)
457 st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
458 /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
459 st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
462 float adapt = 1./st->noise_bandsN++;
465 for (i=0;i<NB_BANDS;i++)
467 st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
468 /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
469 st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
480 static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
486 int freq_start, freq_end;
487 float active_bands = 0;
489 freq_start = (int)(300.0*2*N/st->sampling_rate);
490 freq_end = (int)(2000.0*2*N/st->sampling_rate);
491 for (i=freq_start;i<freq_end;i++)
493 if (st->S[i] > 20*st->Smin[i]+1000)
496 active_bands /= (freq_end-freq_start+1);
498 if (active_bands > .2)
501 float rate, rate2=.2;
502 st->nb_loudness_adapt++;
503 rate=2.0/(1+st->nb_loudness_adapt);
506 if (rate < .1 && pow(loudness, LOUDNESS_EXP) > st->loudness)
508 if (rate < .2 && pow(loudness, LOUDNESS_EXP) > 3*st->loudness)
510 if (rate < .4 && pow(loudness, LOUDNESS_EXP) > 10*st->loudness)
515 loudness += scale*st->ps[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
517 loudness=sqrt(loudness);
518 /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
519 loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
520 st->loudness = (1-rate)*st->loudness + (rate)*pow(loudness, LOUDNESS_EXP);
522 st->loudness2 = (1-rate2)*st->loudness2 + rate2*pow(st->loudness, 1.0/LOUDNESS_EXP);
524 loudness = pow(st->loudness, 1.0/LOUDNESS_EXP);
526 /*fprintf (stderr, "%f %f %f\n", loudness, st->loudness2, rate);*/
529 agc_gain = st->agc_level/st->loudness2;
530 /*fprintf (stderr, "%f %f %f %f\n", active_bands, st->loudness, st->loudness2, agc_gain);*/
535 st->gain2[i] *= agc_gain;
539 static void preprocess_analysis(SpeexPreprocessState *st, float *x)
543 int N3 = 2*N - st->frame_size;
544 int N4 = st->frame_size - N3;
547 /* 'Build' input frame */
549 st->frame[i]=st->inbuf[i];
550 for (i=0;i<st->frame_size;i++)
551 st->frame[N3+i]=x[i];
555 st->inbuf[i]=x[N4+i];
559 st->frame[i] *= st->window[i];
562 drft_forward(st->fft_lookup, st->frame);
567 ps[i]=1+st->frame[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i];
571 static void update_noise_prob(SpeexPreprocessState *st)
577 st->S[i] = 100+ .8*st->S[i] + .05*st->ps[i-1]+.1*st->ps[i]+.05*st->ps[i+1];
579 if (st->nb_preprocess<1)
582 st->Smin[i] = st->Stmp[i] = st->S[i]+100;
585 if (st->nb_preprocess%80==0)
589 st->Smin[i] = min(st->Stmp[i], st->S[i]);
590 st->Stmp[i] = st->S[i];
595 st->Smin[i] = min(st->Smin[i], st->S[i]);
596 st->Stmp[i] = min(st->Stmp[i], st->S[i]);
601 st->update_prob[i] *= .2;
602 if (st->S[i] > 5*st->Smin[i])
603 st->update_prob[i] += .8;
604 /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
605 /*fprintf (stderr, "%f ", st->update_prob[i]);*/
610 int speex_preprocess(SpeexPreprocessState *st, float *x, float *echo)
617 int N3 = 2*N - st->frame_size;
618 int N4 = st->frame_size - N3;
621 float Zframe=0, Pframe;
623 preprocess_analysis(st, x);
625 update_noise_prob(st);
629 /* Noise estimation always updated for the 20 first times */
632 update_noise(st, ps, echo);
635 /* Deal with residual echo if provided */
638 st->echo_noise[i] = (.7*st->echo_noise[i] + .3* echo[i]);
640 /* Compute a posteriori SNR */
643 st->post[i] = ps[i]/(1+st->noise[i]+st->echo_noise[i]) - 1;
648 mean_post+=st->post[i];
654 /* Special case for first frame */
657 st->old_ps[i] = ps[i];
659 /* Compute a priori SNR */
661 /* A priori update rate */
663 float min_gamma=0.12;
664 gamma = 1.0/st->nb_preprocess;
666 /*Make update rate smaller when there's no speech*/
668 if (mean_post<3.5 && mean_prior < 1)
669 min_gamma *= (mean_post+.5);
673 min_gamma = .2*fabs(mean_prior - mean_post)*fabs(mean_prior - mean_post);
686 /* A priori SNR update */
687 st->prior[i] = gamma*max(0.0,st->post[i]) +
688 (1-gamma)*st->gain[i]*st->gain[i]*st->old_ps[i]/(1+st->noise[i]+st->echo_noise[i]);
690 if (st->prior[i]>100)
693 mean_prior+=st->prior[i];
701 fprintf (stderr, "%f ", st->prior[i]);
703 fprintf (stderr, "\n");
705 /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/
707 if (st->nb_preprocess>=20)
710 float noise_ener=0, sig_ener=0;
711 /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
712 /*if (mean_prior<.23 && mean_post < .5)*/
713 if (mean_prior<.23 && mean_post < .5)
717 noise_ener += st->noise[i];
720 if (noise_ener > 3*sig_ener)
732 is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);
735 if (st->consec_noise>=3)
737 update_noise(st, st->old_ps, echo);
741 if (st->update_prob[i]<.5)
742 st->noise[i] = .90*st->noise[i] + .1*st->ps[i];
748 st->zeta[i] = .7*st->zeta[i] + .3*st->prior[i];
752 int freq_start = (int)(300.0*2*N/st->sampling_rate);
753 int freq_end = (int)(2000.0*2*N/st->sampling_rate);
754 for (i=freq_start;i<freq_end;i++)
756 Zframe += st->zeta[i];
765 if (Zframe > 1.5*st->Zlast)
774 if (Zframe < st->Zpeak*ZMIN)
777 } else if (Zframe > st->Zpeak*ZMAX)
781 Pframe = log(Zframe/(st->Zpeak*ZMIN)) / log(ZMAX/ZMIN);
787 /*fprintf (stderr, "%f\n", Pframe);*/
788 /* Compute gain according to the Ephraim-Malah algorithm */
798 prior_ratio = st->prior[i]/(1.0001+st->prior[i]);
799 theta = (1+st->post[i])*prior_ratio;
804 zeta1 = .25*st->zeta[i-1] + .5*st->zeta[i] + .25*st->zeta[i+1];
810 P1 = LOG_MIN_MAX_1 * log(ZMIN_1*zeta1);
812 /*P1 = log(zeta1/ZMIN)/log(ZMAX/ZMIN);*/
814 /* FIXME: add global prop (P2) */
818 p=1/(1 + (q/(1-q))*(1+st->prior[i])*exp(-theta));
822 /* log-spectral magnitude estimator */
824 MM = 0.74082*pow(theta+1,.61)/sqrt(.0001+theta);
828 /* Optimal estimator for loudness domain */
829 MM = hypergeom_gain(theta);
832 st->gain[i] = prior_ratio * MM;
833 /*Put some (very arbitraty) limit on the gain*/
839 if (st->denoise_enabled)
841 st->gain2[i]=p*p*st->gain[i];
846 st->gain2[0]=st->gain[0]=0;
847 st->gain2[N-1]=st->gain[N-1]=0;
850 speex_compute_agc(st, mean_prior);
866 /* Apply computed gain */
869 st->frame[2*i-1] *= st->gain2[i];
870 st->frame[2*i] *= st->gain2[i];
873 /* Get rid of the DC and very low frequencies */
877 /* Nyquist frequency is mostly useless too */
880 /* Inverse FFT with 1/N scaling */
881 drft_backward(st->fft_lookup, st->frame);
884 st->frame[i] *= scale;
889 if (fabs(st->frame[i])>max_sample)
890 max_sample = fabs(st->frame[i]);
891 if (max_sample>28000)
893 float damp = 28000./max_sample;
895 st->frame[i] *= damp;
900 st->frame[i] *= st->window[i];
902 /* Perform overlap and add */
904 x[i] = st->outbuf[i] + st->frame[i];
906 x[N3+i] = st->frame[N3+i];
910 st->outbuf[i] = st->frame[st->frame_size+i];
912 /* Save old power spectrum */
914 st->old_ps[i] = ps[i];
919 void speex_preprocess_estimate_update(SpeexPreprocessState *st, float *x, float *noise)
923 int N3 = 2*N - st->frame_size;
927 preprocess_analysis(st, x);
929 update_noise_prob(st);
935 if (st->update_prob[i]<.5)
936 st->noise[i] = .90*st->noise[i] + .1*ps[i];
940 st->outbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i];
942 /* Save old power spectrum */
944 st->old_ps[i] = ps[i];
949 int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
951 SpeexPreprocessState *st;
952 st=(SpeexPreprocessState*)state;
955 case SPEEX_PREPROCESS_SET_DENOISE:
956 st->denoise_enabled = (*(int*)ptr);
958 case SPEEX_PREPROCESS_GET_DENOISE:
959 (*(int*)ptr) = st->denoise_enabled;
962 case SPEEX_PREPROCESS_SET_AGC:
963 st->agc_enabled = (*(int*)ptr);
965 case SPEEX_PREPROCESS_GET_AGC:
966 (*(int*)ptr) = st->agc_enabled;
969 case SPEEX_PREPROCESS_SET_AGC_LEVEL:
970 st->agc_level = (*(float*)ptr);
973 if (st->agc_level>32768)
976 case SPEEX_PREPROCESS_GET_AGC_LEVEL:
977 (*(float*)ptr) = st->agc_level;
980 case SPEEX_PREPROCESS_SET_VAD:
981 st->vad_enabled = (*(int*)ptr);
983 case SPEEX_PREPROCESS_GET_VAD:
984 (*(int*)ptr) = st->vad_enabled;
987 speex_warning_int("Unknown speex_preprocess_ctl request: ", request);