1 /* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
2 Copyright (C) 2004-2006 Epic Games
5 Preprocessor with denoising based on the algorithm by Ephraim and Malah
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
11 1. Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 3. The name of the author may not be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
38 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
39 short-time spectral amplitude estimator". IEEE Transactions on Acoustics,
40 Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
42 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
43 log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and
44 Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
46 I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
47 Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001.
49 Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic
50 approach to combined acoustic echo cancellation and noise reduction". IEEE
51 Transactions on Speech and Audio Processing, 2002.
53 J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
54 of simultaneous non-stationary sources". In Proceedings IEEE International
55 Conference on Acoustics, Speech, and Signal Processing, 2004.
63 #include "speex/speex_preprocess.h"
64 #include "speex/speex_echo.h"
67 #include "filterbank.h"
68 #include "math_approx.h"
71 #define M_PI 3.14159263
74 #define LOUDNESS_EXP 5.f
75 #define AMP_SCALE .001f
76 #define AMP_SCALE_1 1000.f
80 #define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15)
81 #define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15)
82 #define NOISE_SUPPRESS_DEFAULT -15
83 #define ECHO_SUPPRESS_DEFAULT -40
84 #define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
90 #define SQR(x) ((x)*(x))
91 #define SQR16(x) (MULT16_16((x),(x)))
92 #define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
95 static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b)
101 if (b>=QCONST32(1,23))
106 if (b>=QCONST32(1,19))
111 if (b>=QCONST32(1,15))
117 return PDIV32_16(a,b);
121 static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b)
123 if (SHR32(a,15) >= b)
127 if (b>=QCONST32(1,23))
132 if (b>=QCONST32(1,19))
137 if (b>=QCONST32(1,15))
143 return DIV32_16(a,b);
146 #define SNR_SCALING 256.f
147 #define SNR_SCALING_1 0.0039062f
150 #define FRAC_SCALING 32767.f
151 #define FRAC_SCALING_1 3.0518e-05
154 #define EXPIN_SCALING 2048.f
155 #define EXPIN_SCALING_1 0.00048828f
156 #define EXPIN_SHIFT 11
157 #define EXPOUT_SCALING_1 1.5259e-05
159 #define NOISE_SHIFT 7
163 #define DIV32_16_Q8(a,b) ((a)/(b))
164 #define DIV32_16_Q15(a,b) ((a)/(b))
165 #define SNR_SCALING 1.f
166 #define SNR_SCALING_1 1.f
168 #define FRAC_SCALING 1.f
169 #define FRAC_SCALING_1 1.f
171 #define NOISE_SHIFT 0
173 #define EXPIN_SCALING 1.f
174 #define EXPIN_SCALING_1 1.f
175 #define EXPOUT_SCALING_1 1.f
179 /** Speex pre-processor state. */
180 struct SpeexPreprocessState_ {
182 int frame_size; /**< Number of samples processed each time */
183 int ps_size; /**< Number of points in the power spectrum */
184 int sampling_rate; /**< Sampling rate of the input/output */
191 int dereverb_enabled;
192 spx_word16_t reverb_decay;
193 spx_word16_t reverb_level;
194 spx_word16_t speech_prob_start;
195 spx_word16_t speech_prob_continue;
198 int echo_suppress_active;
199 SpeexEchoState *echo_state;
201 /* DSP-related arrays */
202 spx_word16_t *frame; /**< Processing frame (2*ps_size) */
203 spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */
204 spx_word32_t *ps; /**< Current power spectrum */
205 spx_word16_t *gain2; /**< Adjusted gains */
206 spx_word16_t *gain_floor; /**< Minimum gain allowed */
207 spx_word16_t *window; /**< Analysis/Synthesis window */
208 spx_word32_t *noise; /**< Noise estimate */
209 spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
210 spx_word32_t *old_ps; /**< Power spectrum for last frame */
211 spx_word16_t *gain; /**< Ephraim Malah gain */
212 spx_word16_t *prior; /**< A-priori SNR */
213 spx_word16_t *post; /**< A-posteriori SNR */
215 spx_word32_t *S; /**< Smoothed power spectrum */
216 spx_word32_t *Smin; /**< See Cohen paper */
217 spx_word32_t *Stmp; /**< See Cohen paper */
218 int *update_prob; /**< Propability of speech presence for noise update */
220 spx_word16_t *zeta; /**< Smoothed a priori SNR */
221 spx_word32_t *echo_noise;
222 spx_word32_t *residual_echo;
225 spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */
226 spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */
228 /* AGC stuff, only for floating point for now */
232 float loudness_accum;
233 float *loudness_weight; /**< Perceptual loudness curve */
234 float loudness; /**< Loudness estimate */
235 float agc_gain; /**< Current AGC gain */
236 int nb_loudness_adapt; /**< Number of frames used for loudness adaptation so far */
237 float max_gain; /**< Maximum gain allowed */
238 float max_increase_step; /**< Maximum increase in gain from one frame to another */
239 float max_decrease_step; /**< Maximum decrease in gain from one frame to another */
240 float prev_loudness; /**< Loudness of previous frame */
241 float init_max; /**< Current gain limit during initialisation */
243 int nb_adapt; /**< Number of frames used for adaptation so far */
245 int min_count; /**< Number of frames processed so far */
246 void *fft_lookup; /**< Lookup table for the FFT */
253 static void conj_window(spx_word16_t *w, int len)
260 spx_word16_t x = DIV32_16(MULT16_16(32767,i),len);
262 spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
265 if (x<QCONST16(1.f,13))
267 } else if (x<QCONST16(2.f,13))
269 x=QCONST16(2.f,13)-x;
271 } else if (x<QCONST16(3.f,13))
273 x=x-QCONST16(2.f,13);
276 x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
278 x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
279 tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(QCONST32(x,2))));
281 tmp=SUB16(Q15_ONE,tmp);
282 w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
288 /* This function approximates the gain function
289 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
290 which multiplied by xi/(1+xi) is the optimal gain
291 in the loudness domain ( sqrt[amplitude] )
292 Input in Q11 format, output in Q15
294 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
299 static const spx_word16_t table[21] = {
300 6730, 8357, 9868, 11267, 12563, 13770, 14898,
301 15959, 16961, 17911, 18816, 19682, 20512, 21311,
302 22082, 22827, 23549, 24250, 24931, 25594, 26241};
307 return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
308 frac = SHL32(xx-SHL32(ind,10),5);
309 return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7);
312 static inline spx_word16_t qcurve(spx_word16_t x)
315 return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x))));
318 /* Compute the gain floor based on different floors for the background noise and residual echo */
319 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
323 if (noise_suppress > effective_echo_suppress)
325 spx_word16_t noise_gain, gain_ratio;
326 noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1)));
327 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1)));
329 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
331 gain_floor[i] = MULT16_16_Q15(noise_gain,
332 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
333 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
335 spx_word16_t echo_gain, gain_ratio;
336 echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1)));
337 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1)));
339 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
341 gain_floor[i] = MULT16_16_Q15(echo_gain,
342 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
343 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
348 /* This function approximates the gain function
349 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
350 which multiplied by xi/(1+xi) is the optimal gain
351 in the loudness domain ( sqrt[amplitude] )
353 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
358 static const float table[21] = {
359 0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
360 1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
361 2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
362 x = EXPIN_SCALING_1*xx;
363 integer = floor(2*x);
368 return FRAC_SCALING*(1+.1296/x);
370 return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
373 static inline spx_word16_t qcurve(spx_word16_t x)
375 return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
378 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
384 noise_floor = exp(.2302585f*noise_suppress);
385 echo_floor = exp(.2302585f*effective_echo_suppress);
387 /* Compute the gain floor based on different floors for the background noise and residual echo */
389 gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
393 SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
398 SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
399 st->frame_size = frame_size;
401 /* Round ps_size down to the nearest power of two */
404 st->ps_size = st->frame_size;
407 if (st->ps_size & ~i)
417 if (st->ps_size < 3*st->frame_size/4)
418 st->ps_size = st->ps_size * 3 / 2;
420 st->ps_size = st->frame_size;
424 N3 = 2*N - st->frame_size;
425 N4 = st->frame_size - N3;
427 st->sampling_rate = sampling_rate;
428 st->denoise_enabled = 1;
430 st->dereverb_enabled = 0;
431 st->reverb_decay = 0;
432 st->reverb_level = 0;
433 st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
434 st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
435 st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
437 st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
438 st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
440 st->echo_state = NULL;
442 st->nbands = NB_BANDS;
444 st->bank = filterbank_new(M, sampling_rate, N, 1);
446 st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
447 st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
448 st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
450 st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
451 st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
452 st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
453 st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
454 st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
455 st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
456 st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
457 st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
458 st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
459 st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
460 st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
461 st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
463 st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
464 st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
465 st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
466 st->update_prob = (int*)speex_alloc(N*sizeof(int));
468 st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
469 st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
471 conj_window(st->window, 2*N3);
472 for (i=2*N3;i<2*st->ps_size;i++)
473 st->window[i]=Q15_ONE;
477 for (i=N3-1;i>=0;i--)
479 st->window[i+N3+N4]=st->window[i+N3];
485 st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
486 st->reverb_estimate[i]=0;
489 st->post[i]=SHL16(1, SNR_SHIFT);
490 st->prior[i]=SHL16(1, SNR_SHIFT);
494 st->update_prob[i] = 1;
502 st->agc_level = 8000;
503 st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
506 float ff=((float)i)*.5*sampling_rate/((float)N);
507 /*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/
508 st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
509 if (st->loudness_weight[i]<.01f)
510 st->loudness_weight[i]=.01f;
511 st->loudness_weight[i] *= st->loudness_weight[i];
513 /*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/
514 st->loudness = 1e-15;
516 st->nb_loudness_adapt = 0;
518 st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate);
519 st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate);
520 st->prev_loudness = 1;
525 st->fft_lookup = spx_fft_init(2*N);
532 void speex_preprocess_state_destroy(SpeexPreprocessState *st)
534 speex_free(st->frame);
537 speex_free(st->gain2);
538 speex_free(st->gain_floor);
539 speex_free(st->window);
540 speex_free(st->noise);
541 speex_free(st->reverb_estimate);
542 speex_free(st->old_ps);
543 speex_free(st->gain);
544 speex_free(st->prior);
545 speex_free(st->post);
547 speex_free(st->loudness_weight);
549 speex_free(st->echo_noise);
550 speex_free(st->residual_echo);
553 speex_free(st->Smin);
554 speex_free(st->Stmp);
555 speex_free(st->update_prob);
556 speex_free(st->zeta);
558 speex_free(st->inbuf);
559 speex_free(st->outbuf);
561 spx_fft_destroy(st->fft_lookup);
562 filterbank_destroy(st->bank);
566 /* FIXME: The AGC doesn't work yet with fixed-point*/
568 static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft)
578 loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
580 loudness=sqrt(loudness);
581 /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
582 loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
585 st->nb_loudness_adapt++;
586 /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
587 rate = .03*Pframe*Pframe;
588 st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
589 st->loudness_accum = (1-rate)*st->loudness_accum + rate;
590 if (st->init_max < st->max_gain && st->nb_adapt > 20)
591 st->init_max *= 1.f + .1f*Pframe*Pframe;
593 /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
595 target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
597 if ((Pframe>.5 && st->nb_adapt > 20) || target_gain < st->agc_gain)
599 if (target_gain > st->max_increase_step*st->agc_gain)
600 target_gain = st->max_increase_step*st->agc_gain;
601 if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
602 target_gain = st->max_decrease_step*st->agc_gain;
603 if (target_gain > st->max_gain)
604 target_gain = st->max_gain;
605 if (target_gain > st->init_max)
606 target_gain = st->init_max;
608 st->agc_gain = target_gain;
610 /*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/
613 ft[i] *= st->agc_gain;
614 st->prev_loudness = loudness;
618 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
622 int N3 = 2*N - st->frame_size;
623 int N4 = st->frame_size - N3;
624 spx_word32_t *ps=st->ps;
626 /* 'Build' input frame */
628 st->frame[i]=st->inbuf[i];
629 for (i=0;i<st->frame_size;i++)
630 st->frame[N3+i]=x[i];
634 st->inbuf[i]=x[N4+i];
638 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
642 spx_word16_t max_val=0;
644 max_val = MAX16(max_val, ABS16(st->frame[i]));
645 st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
647 st->frame[i] = SHL16(st->frame[i], st->frame_shift);
652 spx_fft(st->fft_lookup, st->frame, st->ft);
655 ps[0]=MULT16_16(st->ft[0],st->ft[0]);
657 ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
659 st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
661 filterbank_compute_bank32(st->bank, ps, ps+N);
664 static void update_noise_prob(SpeexPreprocessState *st)
671 st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1])
672 + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]);
673 st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]);
674 st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]);
679 st->Smin[i] = st->Stmp[i] = 0;
682 if (st->nb_adapt < 100)
684 else if (st->nb_adapt < 1000)
686 else if (st->nb_adapt < 10000)
690 if (st->min_count > min_range)
695 st->Smin[i] = MIN32(st->Stmp[i], st->S[i]);
696 st->Stmp[i] = st->S[i];
701 st->Smin[i] = MIN32(st->Smin[i], st->S[i]);
702 st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);
707 if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > ADD32(st->Smin[i],EXTEND32(20)))
708 st->update_prob[i] = 1;
710 st->update_prob[i] = 0;
711 /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
712 /*fprintf (stderr, "%f ", st->update_prob[i]);*/
717 #define NOISE_OVERCOMPENS 1.
719 void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
721 int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
723 return speex_preprocess_run(st, x);
726 int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
731 int N3 = 2*N - st->frame_size;
732 int N4 = st->frame_size - N3;
733 spx_word32_t *ps=st->ps;
736 spx_word16_t beta, beta_1;
737 spx_word16_t effective_echo_suppress;
742 beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
743 beta_1 = Q15_ONE-beta;
745 /* Deal with residual echo if provided */
748 speex_echo_get_residual(st->echo_state, st->residual_echo, N);
750 /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
751 if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
754 st->residual_echo[i] = 0;
758 st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
759 filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
762 st->echo_noise[i] = 0;
764 preprocess_analysis(st, x);
766 update_noise_prob(st);
768 /* Noise estimation always updated for the 10 first frames */
769 /*if (st->nb_adapt<10)
772 st->update_prob[i] = 0;
776 /* Update the noise estimate for the frequencies where it can be */
779 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
780 st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
782 filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
784 /* Special case for first frame */
787 st->old_ps[i] = ps[i];
789 /* Compute a posteriori SNR */
794 /* Total noise estimate including residual echo and reverberation */
795 spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
797 /* A posteriori SNR = ps/noise - 1*/
798 st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
799 st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
801 /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
802 gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
804 /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
805 st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
806 st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
809 /*print_vec(st->post, N+M, "");*/
811 /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
812 st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
814 st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
815 MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
816 for (i=N-1;i<N+M;i++)
817 st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
819 /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
822 Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
823 Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
825 effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
827 compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
829 /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
830 Technically this is actually wrong because the EM gaim assumes a slightly different probability
834 /* See EM and Cohen papers*/
836 /* Gain from hypergeometric function */
838 /* Weiner filter gain */
839 spx_word16_t prior_ratio;
840 /* a priority probability of speech presence based on Bark sub-band alone */
842 /* Speech absence a priori probability (considering sub-band and frame) */
848 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
849 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
851 MM = hypergeom_gain(theta);
852 /* Gain with bound */
853 st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
854 /* Save old Bark power spectrum */
855 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
857 P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
858 q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
860 theta = MIN32(theta, EXTEND32(32767));
861 /*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
862 tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
863 /*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
864 st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
866 st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
869 /* Convert the EM gains and speech prob to linear frequency */
870 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
871 filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
873 /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
876 filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
878 /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
883 spx_word16_t prior_ratio;
888 /* Wiener filter gain */
889 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
890 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
892 /* Optimal estimator for loudness domain */
893 MM = hypergeom_gain(theta);
894 /* EM gain with bound */
895 g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
896 /* Interpolated speech probability of presence */
899 /* Constrain the gain to be close to the Bark scale gain */
900 if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
901 g = MULT16_16(3,st->gain[i]);
904 /* Save old power spectrum */
905 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
907 /* Apply gain floor */
908 if (st->gain[i] < st->gain_floor[i])
909 st->gain[i] = st->gain_floor[i];
911 /* Exponential decay model for reverberation (unused) */
912 /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
914 /* Take into account speech probability of presence (loudness domain MMSE estimator) */
915 /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
916 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
917 st->gain2[i]=SQR16_Q15(tmp);
919 /* Use this if you want a log-domain MMSE estimator instead */
920 /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
926 spx_word16_t p = st->gain2[i];
927 st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);
928 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
929 st->gain2[i]=SQR16_Q15(tmp);
931 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
934 /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
935 if (!st->denoise_enabled)
938 st->gain2[i]=Q15_ONE;
941 /* Apply computed gain */
944 st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
945 st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
947 st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
948 st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
950 /*FIXME: This *will* not work for fixed-point */
953 speex_compute_agc(st, Pframe, st->ft);
956 /* Inverse FFT with 1/N scaling */
957 spx_ifft(st->fft_lookup, st->ft, st->frame);
958 /* Scale back to original (lower) amplitude */
960 st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
962 /*FIXME: This *will* not work for fixed-point */
968 if (fabs(st->frame[i])>max_sample)
969 max_sample = fabs(st->frame[i]);
970 if (max_sample>28000.f)
972 float damp = 28000.f/max_sample;
974 st->frame[i] *= damp;
979 /* Synthesis window (for WOLA) */
981 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
983 /* Perform overlap and add */
985 x[i] = st->outbuf[i] + st->frame[i];
987 x[N3+i] = st->frame[N3+i];
991 st->outbuf[i] = st->frame[st->frame_size+i];
993 /* FIXME: This VAD is a kludge */
996 if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue))
1010 void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
1013 int N = st->ps_size;
1014 int N3 = 2*N - st->frame_size;
1016 spx_word32_t *ps=st->ps;
1021 preprocess_analysis(st, x);
1023 update_noise_prob(st);
1027 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT))
1029 st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT));
1034 st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]);
1036 /* Save old power spectrum */
1038 st->old_ps[i] = ps[i];
1041 st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]);
1045 int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
1048 SpeexPreprocessState *st;
1049 st=(SpeexPreprocessState*)state;
1052 case SPEEX_PREPROCESS_SET_DENOISE:
1053 st->denoise_enabled = (*(spx_int32_t*)ptr);
1055 case SPEEX_PREPROCESS_GET_DENOISE:
1056 (*(spx_int32_t*)ptr) = st->denoise_enabled;
1059 case SPEEX_PREPROCESS_SET_AGC:
1060 st->agc_enabled = (*(spx_int32_t*)ptr);
1062 case SPEEX_PREPROCESS_GET_AGC:
1063 (*(spx_int32_t*)ptr) = st->agc_enabled;
1066 case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1067 st->agc_level = (*(float*)ptr);
1068 if (st->agc_level<1)
1070 if (st->agc_level>32768)
1071 st->agc_level=32768;
1073 case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1074 (*(float*)ptr) = st->agc_level;
1076 case SPEEX_PREPROCESS_SET_AGC_INCREMENT:
1077 st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1079 case SPEEX_PREPROCESS_GET_AGC_INCREMENT:
1080 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size);
1082 case SPEEX_PREPROCESS_SET_AGC_DECREMENT:
1083 st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1085 case SPEEX_PREPROCESS_GET_AGC_DECREMENT:
1086 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size);
1088 case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN:
1089 st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr));
1091 case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN:
1092 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain));
1095 case SPEEX_PREPROCESS_SET_VAD:
1096 speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
1097 st->vad_enabled = (*(spx_int32_t*)ptr);
1099 case SPEEX_PREPROCESS_GET_VAD:
1100 (*(spx_int32_t*)ptr) = st->vad_enabled;
1103 case SPEEX_PREPROCESS_SET_DEREVERB:
1104 st->dereverb_enabled = (*(spx_int32_t*)ptr);
1105 for (i=0;i<st->ps_size;i++)
1106 st->reverb_estimate[i]=0;
1108 case SPEEX_PREPROCESS_GET_DEREVERB:
1109 (*(spx_int32_t*)ptr) = st->dereverb_enabled;
1112 case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1113 st->reverb_level = (*(float*)ptr);
1115 case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1116 (*(float*)ptr) = st->reverb_level;
1119 case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1120 st->reverb_decay = (*(float*)ptr);
1122 case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1123 (*(float*)ptr) = st->reverb_decay;
1126 case SPEEX_PREPROCESS_SET_PROB_START:
1127 *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr));
1128 st->speech_prob_start = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100);
1130 case SPEEX_PREPROCESS_GET_PROB_START:
1131 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
1134 case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1135 *(spx_int32_t*)ptr = MIN32(Q15_ONE,MAX32(0, *(spx_int32_t*)ptr));
1136 st->speech_prob_continue = DIV32_16(MULT16_16(32767,*(spx_int32_t*)ptr), 100);
1138 case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1139 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
1142 case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
1143 st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
1145 case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
1146 (*(spx_int32_t*)ptr) = st->noise_suppress;
1148 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
1149 st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
1151 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
1152 (*(spx_int32_t*)ptr) = st->echo_suppress;
1154 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
1155 st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
1157 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
1158 (*(spx_int32_t*)ptr) = st->echo_suppress_active;
1160 case SPEEX_PREPROCESS_SET_ECHO_STATE:
1161 st->echo_state = (SpeexEchoState*)ptr;
1163 case SPEEX_PREPROCESS_GET_ECHO_STATE:
1164 ptr = (void*)st->echo_state;
1168 speex_warning_int("Unknown speex_preprocess_ctl request: ", request);