Oops. Thanks to Jim Crichton for pointing out that the complexity could end up
[speexdsp.git] / libspeex / cb_search.c
index a86582a..5c68826 100644 (file)
-/*-----------------------------------------------------------------------*\
+/* Copyright (C) 2002-2006 Jean-Marc Valin 
+   File: cb_search.c
 
-    FILE........: GAINSHAPE.C
-    TYPE........: C Module
-    AUTHOR......: David Rowe
-    COMPANY.....: Voicetronix
-    DATE CREATED: 19/2/02
-
-    General gain-shape codebook search.
-
-\*-----------------------------------------------------------------------*/
-
-/* Modified by Jean-Marc Valin 2002
-
-   This library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
    
-   This library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
    
-   You should have received a copy of the GNU Lesser General Public
-   License along with this library; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
 
-
-#include <stdlib.h>
-#include <cb_search.h>
+#include "cb_search.h"
 #include "filters.h"
-#include <math.h>
-#include <stdio.h>
 #include "stack_alloc.h"
 #include "vq.h"
-#include "matrix.h"
-
-extern float exc_gains_wb2_table[];
-/*---------------------------------------------------------------------------*\
-                                                                             
- void overlap_cb_search()                                                            
-                                                                             
- Searches a gain/shape codebook consisting of overlapping entries for the    
- closest vector to the target.  Gives identical results to search() above   
- buts uses fast end correction algorithm for the synthesis of response       
- vectors.                                                                    
-                                                                             
-\*---------------------------------------------------------------------------*/
-
-float overlap_cb_search(
-float target[],                        /* target vector */
-float ak[],                    /* LPCs for this subframe */
-float awk1[],                  /* Weighted LPCs for this subframe */
-float awk2[],                  /* Weighted LPCs for this subframe */
-float codebook[],              /* overlapping codebook */
-int   entries,                 /* number of overlapping entries to search */
-float *gain,                   /* gain of optimum entry */
-int   *index,                  /* index of optimum entry */
-int   p,                        /* number of LPC coeffs */
-int   nsf                       /* number of samples in subframe */
-)
+#include "misc.h"
+
+#ifdef _USE_SSE
+#include "cb_search_sse.h"
+#elif defined(ARM4_ASM) || defined(ARM5E_ASM)
+#include "cb_search_arm4.h"
+#elif defined(BFIN_ASM)
+#include "cb_search_bfin.h"
+#endif
+
+#ifndef OVERRIDE_COMPUTE_WEIGHTED_CODEBOOK
+static void compute_weighted_codebook(const signed char *shape_cb, const spx_word16_t *r, spx_word16_t *resp, spx_word16_t *resp2, spx_word32_t *E, int shape_cb_size, int subvect_size, char *stack)
 {
-  float *resp;                 /* zero state response to current entry */
-  float *h;                    /* impulse response of synthesis filter */
-  float *impulse;              /* excitation vector containing one impulse */
-  float d,e,g,score;           /* codebook searching variables */
-  float bscore;                        /* score of "best" vector so far */
-  int i,k;                     /* loop variables */
-
-  /* Initialise */
-  
-  resp = (float*)malloc(sizeof(float)*nsf);
-  h = (float*)malloc(sizeof(float)*nsf);
-  impulse = (float*)malloc(sizeof(float)*nsf);
-
-  for(i=0; i<nsf; i++)
-    impulse[i] = 0.0;
-   
-  *gain = 0.0;
-  *index = 0;
-  bscore = 0.0;
-  impulse[0] = 1.0;
-
-  /* Calculate impulse response of  A(z/g2) / ( A(z)*(z/g1) ) */
-  residue_zero(impulse, awk1, h, nsf, p);
-  syn_filt_zero(h, ak, h, nsf, p);
-  syn_filt_zero(h, awk2, h, nsf,p);
-  
-  /* Calculate codebook zero-response */
-  residue_zero(&codebook[entries-1],awk1,resp,nsf,p);
-  syn_filt_zero(resp,ak,resp,nsf,p);
-  syn_filt_zero(resp,awk2,resp,nsf,p);
-    
-  /* Search codebook backwards using end correction for synthesis */
-  
-  for(k=entries-1; k>=0; k--) {
-
-    d = 0.0; e = 0.0;
-    for(i=0; i<nsf; i++) {
-      d += target[i]*resp[i];
-      e += resp[i]*resp[i];
-    }
-    g = d/(e+.1);
-    score = g*d;
-    /*printf ("score: %f %f %f %f\n", target[0],d,e,score);*/
-    if (score >= bscore) {
-      bscore = score;
-      *gain = g;
-      *index = k;
-    }
-    
-    /* Synthesise next entry */
-    
-    if (k) {
-      for(i=nsf-1; i>=1; i--)
-        resp[i] = resp[i-1] + codebook[k-1]*h[i];
-      resp[0] = codebook[k-1]*h[0];
-    }
-  }
-
-  free(resp);
-  free(h);
-  free(impulse);
-  return bscore;
+   int i, j, k;
+   VARDECL(spx_word16_t *shape);
+   ALLOC(shape, subvect_size, spx_word16_t);
+   for (i=0;i<shape_cb_size;i++)
+   {
+      spx_word16_t *res;
+      
+      res = resp+i*subvect_size;
+      for (k=0;k<subvect_size;k++)
+         shape[k] = (spx_word16_t)shape_cb[i*subvect_size+k];
+      E[i]=0;
+
+      /* Compute codeword response using convolution with impulse response */
+      for(j=0;j<subvect_size;j++)
+      {
+         spx_word32_t resj=0;
+         spx_word16_t res16;
+         for (k=0;k<=j;k++)
+            resj = MAC16_16(resj,shape[k],r[j-k]);
+#ifdef FIXED_POINT
+         res16 = EXTRACT16(SHR32(resj, 13));
+#else
+         res16 = 0.03125f*resj;
+#endif
+         /* Compute codeword energy */
+         E[i]=MAC16_16(E[i],res16,res16);
+         res[j] = res16;
+         /*printf ("%d\n", (int)res[j]);*/
+      }
+   }
+
 }
+#endif
+
+#ifndef OVERRIDE_TARGET_UPDATE
+static inline void target_update(spx_word16_t *t, spx_word16_t g, spx_word16_t *r, int len)
+{
+   int n;
+   for (n=0;n<len;n++)
+      t[n] = SUB16(t[n],PSHR32(MULT16_16(g,r[n]),13));
+}
+#endif
+
 
 
-void split_cb_search(
-float target[],                        /* target vector */
-float ak[],                    /* LPCs for this subframe */
-float awk1[],                  /* Weighted LPCs for this subframe */
-float awk2[],                  /* Weighted LPCs for this subframe */
-void *par,                      /* Codebook/search parameters*/
+static void split_cb_search_shape_sign_N1(
+spx_word16_t target[],                 /* target vector */
+spx_coef_t ak[],                       /* LPCs for this subframe */
+spx_coef_t awk1[],                     /* Weighted LPCs for this subframe */
+spx_coef_t awk2[],                     /* Weighted LPCs for this subframe */
+const void *par,                      /* Codebook/search parameters*/
 int   p,                        /* number of LPC coeffs */
 int   nsf,                      /* number of samples in subframe */
-float *exc,
-FrameBits *bits,
-float *stack
+spx_sig_t *exc,
+spx_word16_t *r,
+SpeexBits *bits,
+char *stack,
+int   update_target
 )
 {
-   int i,j;
-   float *resp, *E, *Ee;
-   float *t, *r, *e;
-   float *gains;
-   int *ind;
-   float *shape_cb, *gain_cb;
-   int shape_cb_size, gain_cb_size, subvect_size, nb_subvect;
-   split_cb_params *params;
-
-   params = (split_cb_params *) par;
+   int i,j,m,q;
+   VARDECL(spx_word16_t *resp);
+#ifdef _USE_SSE
+   VARDECL(__m128 *resp2);
+   VARDECL(__m128 *E);
+#else
+   spx_word16_t *resp2;
+   VARDECL(spx_word32_t *E);
+#endif
+   VARDECL(spx_word16_t *t);
+   VARDECL(spx_sig_t *e);
+   const signed char *shape_cb;
+   int shape_cb_size, subvect_size, nb_subvect;
+   const split_cb_params *params;
+   int best_index;
+   spx_word32_t best_dist;
+   int have_sign;
+   
+   params = (const split_cb_params *) par;
    subvect_size = params->subvect_size;
    nb_subvect = params->nb_subvect;
    shape_cb_size = 1<<params->shape_bits;
    shape_cb = params->shape_cb;
-   gain_cb_size = 1<<params->gain_bits;
-   gain_cb = params->gain_cb;
-   resp = PUSH(stack, shape_cb_size*subvect_size);
-   E = PUSH(stack, shape_cb_size);
-   Ee = PUSH(stack, shape_cb_size);
-   t = PUSH(stack, nsf);
-   r = PUSH(stack, nsf);
-   e = PUSH(stack, nsf);
-   gains = PUSH(stack, nb_subvect);
-   ind = (int*)PUSH(stack, nb_subvect);
+   have_sign = params->have_sign;
+   ALLOC(resp, shape_cb_size*subvect_size, spx_word16_t);
+#ifdef _USE_SSE
+   ALLOC(resp2, (shape_cb_size*subvect_size)>>2, __m128);
+   ALLOC(E, shape_cb_size>>2, __m128);
+#else
+   resp2 = resp;
+   ALLOC(E, shape_cb_size, spx_word32_t);
+#endif
+   ALLOC(t, nsf, spx_word16_t);
+   ALLOC(e, nsf, spx_sig_t);
    
-
+   /* FIXME: Do we still need to copy the target? */
    for (i=0;i<nsf;i++)
       t[i]=target[i];
-   for (i=0;i<shape_cb_size;i++)
-   {
-      float *res = resp+i*subvect_size;
-      residue_zero(shape_cb+i*subvect_size, awk1, res, subvect_size, p);
-      syn_filt_zero(res, ak, res, subvect_size, p);
-      syn_filt_zero(res, awk2, res, subvect_size,p);
-      E[i]=0;
-      for(j=0;j<subvect_size;j++)
-         E[i]+=res[j]*res[j];
-      Ee[i]=0;
-      for(j=0;j<subvect_size;j++)
-         Ee[i]+=shape_cb[i*subvect_size+j]*shape_cb[i*subvect_size+j];
-      
-   }
-   for (i=0;i<nb_subvect;i++)
-   {
-      int best_index=0;
-      float g, corr, best_gain=0, score, best_score=-1;
-      for (j=0;j<shape_cb_size;j++)
-      {
-         corr=xcorr(resp+j*subvect_size,t+subvect_size*i,subvect_size);
-         score=corr*corr/(.001+E[j]);
-         g = corr/(.001+E[j]);
-         if (score>best_score)
-         {
-            best_index=j;
-            best_score=score;
-            best_gain=corr/(.001+E[j]);
-         }
-      }
-      frame_bits_pack(bits,best_index,params->shape_bits);
-      if (best_gain>0)
-         frame_bits_pack(bits,0,1);
-      else
-          frame_bits_pack(bits,1,1);        
-      ind[i]=best_index;
-      gains[i]=best_gain*Ee[ind[i]];
 
-      for (j=0;j<nsf;j++)
-         e[j]=0;
-      for (j=0;j<subvect_size;j++)
-         e[subvect_size*i+j]=best_gain*shape_cb[best_index*subvect_size+j];
-      residue_zero(e, awk1, r, nsf, p);
-      syn_filt_zero(r, ak, r, nsf, p);
-      syn_filt_zero(r, awk2, r, nsf,p);
-      for (j=0;j<nsf;j++)
-         t[j]-=r[j];
-   }
+   compute_weighted_codebook(shape_cb, r, resp, resp2, E, shape_cb_size, subvect_size, stack);
 
+   for (i=0;i<nb_subvect;i++)
    {
-      int best_vq_index=0, max_index;
-      float max_gain=0, log_max, min_dist=0, *sign;
-
-      if (gain_cb) /*If no gain codebook, do not quantize (for testing/debugging) */
+      spx_word16_t *x=t+subvect_size*i;
+      /*Find new n-best based on previous n-best j*/
+      if (have_sign)
+         vq_nbest_sign(x, resp2, subvect_size, shape_cb_size, E, 1, &best_index, &best_dist, stack);
+      else
+         vq_nbest(x, resp2, subvect_size, shape_cb_size, E, 1, &best_index, &best_dist, stack);
+      
+      speex_bits_pack(bits,best_index,params->shape_bits+have_sign);
+      
       {
-         sign = PUSH(stack, nb_subvect);
-         for (i=0;i<nb_subvect;i++)
+         int rind;
+         spx_word16_t *res;
+         spx_word16_t sign=1;
+         rind = best_index;
+         if (rind>=shape_cb_size)
          {
-            if (gains[i]<0)
-            {
-               gains[i]=-gains[i];
-               sign[i]=-1;
-            } else {
-               sign[i]=1;
-            }
+            sign=-1;
+            rind-=shape_cb_size;
          }
-         for (i=0;i<nb_subvect;i++)
-            if (gains[i]>max_gain)
-               max_gain=gains[i];
-         log_max=log(max_gain+1);
-         max_index = (int)(floor(.5+log_max-3));
-         if (max_index>7)
-            max_index=7;
-         if (max_index<0)
-            max_index=0;
-         max_gain=1/exp(max_index+3.0);
-         for (i=0;i<nb_subvect;i++)
-            gains[i]*=max_gain;
-         frame_bits_pack(bits,max_index,3);
-
-         /*Vector quantize gains[i]*/
-         if (nb_subvect<=5)
+         res = resp+rind*subvect_size;
+         if (sign>0)
+            for (m=0;m<subvect_size;m++)
+               t[subvect_size*i+m] = SUB16(t[subvect_size*i+m], res[m]);
+         else
+            for (m=0;m<subvect_size;m++)
+               t[subvect_size*i+m] = ADD16(t[subvect_size*i+m], res[m]);
+
+#ifdef FIXED_POINT
+         if (sign)
          {
-         best_vq_index = vq_index(gains, gain_cb, nb_subvect, gain_cb_size);
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect;i++)
-            gains[i]= sign[i]*gain_cb[best_vq_index*nb_subvect+i]/max_gain/(Ee[ind[i]]+.001);
-         } else
+            for (j=0;j<subvect_size;j++)
+               e[subvect_size*i+j]=SHL32(EXTEND32(shape_cb[rind*subvect_size+j]),SIG_SHIFT-5);
+         } else {
+            for (j=0;j<subvect_size;j++)
+               e[subvect_size*i+j]=NEG32(SHL32(EXTEND32(shape_cb[rind*subvect_size+j]),SIG_SHIFT-5));
+         }
+#else
+         for (j=0;j<subvect_size;j++)
+            e[subvect_size*i+j]=sign*0.03125*shape_cb[rind*subvect_size+j];
+#endif
+      
+      }
+            
+      for (m=0;m<subvect_size;m++)
+      {
+         spx_word16_t g;
+         int rind;
+         spx_word16_t sign=1;
+         rind = best_index;
+         if (rind>=shape_cb_size)
          {
-            float tmp[5];
-            int best_vq_index2;
-         best_vq_index = vq_index(gains, gain_cb, nb_subvect/2, gain_cb_size);
-         for (i=0;i<5;i++)
-            tmp[i]=gains[i]-gain_cb[best_vq_index*nb_subvect/2+i];
-         best_vq_index2 = vq_index(tmp, exc_gains_wb2_table, nb_subvect/2, 256);
-
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect/2;i++)
-            gains[i]= sign[i]*(gain_cb[best_vq_index*nb_subvect/2+i]+exc_gains_wb2_table[best_vq_index2*nb_subvect/2+i])/max_gain/(Ee[ind[i]]+.001);
-
-
-         best_vq_index = vq_index(gains+5, gain_cb, nb_subvect/2, gain_cb_size);
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         for (i=0;i<5;i++)
-            tmp[i]=gains[i+5]-gain_cb[best_vq_index*nb_subvect/2+i];
-         best_vq_index2 = vq_index(tmp, exc_gains_wb2_table, nb_subvect/2, 256);
-
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect/2;i++)
-            gains[i+5]= sign[i+5]*(gain_cb[best_vq_index*nb_subvect/2+i]+exc_gains_wb2_table[best_vq_index2*nb_subvect/2+i])/max_gain/(Ee[ind[i+5]]+.001);
+            sign=-1;
+            rind-=shape_cb_size;
          }
-
-    
-
-         POP(stack);
-      } else {
-         printf ("exc: ");
-         for (i=0;i<nb_subvect;i++)
-            printf ("%f ", gains[i]);
-         printf ("\n");
-         for (i=0;i<nb_subvect;i++)
-            gains[i]= gains[i]/(Ee[ind[i]]+.001);
+         
+         q=subvect_size-m;
+#ifdef FIXED_POINT
+         g=sign*shape_cb[rind*subvect_size+m];
+#else
+         g=sign*0.03125*shape_cb[rind*subvect_size+m];
+#endif
+         target_update(t+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
       }
-
-      for (i=0;i<nb_subvect;i++)
-         for (j=0;j<subvect_size;j++)
-            exc[subvect_size*i+j]+=gains[i]*shape_cb[ind[i]*subvect_size+j];
-
    }
 
-   /*TODO: Perform joint optimization of gains*/
+   /* Update excitation */
+   /* FIXME: We could update the excitation directly above */
+   for (j=0;j<nsf;j++)
+      exc[j]=ADD32(exc[j],e[j]);
    
-   for (i=0;i<nsf;i++)
-      target[i]=t[i];
-
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
+   /* Update target: only update target if necessary */
+   if (update_target)
+   {
+      VARDECL(spx_sig_t *r2);
+      ALLOC(r2, nsf, spx_sig_t);
+      syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
+      for (j=0;j<nsf;j++)
+         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
+   }
 }
 
 
 
-
-void split_cb_search_wb(
-float target[],                        /* target vector */
-float ak[],                    /* LPCs for this subframe */
-float awk1[],                  /* Weighted LPCs for this subframe */
-float awk2[],                  /* Weighted LPCs for this subframe */
-void *par,                      /* Codebook/search parameters*/
+void split_cb_search_shape_sign(
+spx_word16_t target[],                 /* target vector */
+spx_coef_t ak[],                       /* LPCs for this subframe */
+spx_coef_t awk1[],                     /* Weighted LPCs for this subframe */
+spx_coef_t awk2[],                     /* Weighted LPCs for this subframe */
+const void *par,                      /* Codebook/search parameters*/
 int   p,                        /* number of LPC coeffs */
 int   nsf,                      /* number of samples in subframe */
-float *exc,
-FrameBits *bits,
-float *stack
+spx_sig_t *exc,
+spx_word16_t *r,
+SpeexBits *bits,
+char *stack,
+int   complexity,
+int   update_target
 )
 {
-   int i,j;
-   float *resp, *E, *Ee;
-   float *t, *r, *e, *tresp;
-   float *gains;
-   int *ind;
-   float *shape_cb, *gain_cb;
-   int shape_cb_size, gain_cb_size, subvect_size, nb_subvect;
-   split_cb_params *params;
-
-   params = (split_cb_params *) par;
+   int i,j,k,m,n,q;
+   VARDECL(spx_word16_t *resp);
+#ifdef _USE_SSE
+   VARDECL(__m128 *resp2);
+   VARDECL(__m128 *E);
+#else
+   spx_word16_t *resp2;
+   VARDECL(spx_word32_t *E);
+#endif
+   VARDECL(spx_word16_t *t);
+   VARDECL(spx_sig_t *e);
+   VARDECL(spx_sig_t *r2);
+   VARDECL(spx_word16_t *tmp);
+   VARDECL(spx_word32_t *ndist);
+   VARDECL(spx_word32_t *odist);
+   VARDECL(int *itmp);
+   VARDECL(spx_word16_t **ot2);
+   VARDECL(spx_word16_t **nt2);
+   spx_word16_t **ot, **nt;
+   VARDECL(int **nind);
+   VARDECL(int **oind);
+   VARDECL(int *ind);
+   const signed char *shape_cb;
+   int shape_cb_size, subvect_size, nb_subvect;
+   const split_cb_params *params;
+   int N=2;
+   VARDECL(int *best_index);
+   VARDECL(spx_word32_t *best_dist);
+   VARDECL(int *best_nind);
+   VARDECL(int *best_ntarget);
+   int have_sign;
+   N=complexity;
+   if (N>10)
+      N=10;
+   /* Complexity isn't as important for the codebooks as it is for the pitch */
+   N=(2*N)/3;
+   if (N<1)
+      N=1;
+   if (N==1)
+   {
+      split_cb_search_shape_sign_N1(target,ak,awk1,awk2,par,p,nsf,exc,r,bits,stack,update_target);
+      return;
+   }
+   ALLOC(ot2, N, spx_word16_t*);
+   ALLOC(nt2, N, spx_word16_t*);
+   ALLOC(oind, N, int*);
+   ALLOC(nind, N, int*);
+
+   params = (const split_cb_params *) par;
    subvect_size = params->subvect_size;
    nb_subvect = params->nb_subvect;
    shape_cb_size = 1<<params->shape_bits;
    shape_cb = params->shape_cb;
-   gain_cb_size = 1<<params->gain_bits;
-   gain_cb = params->gain_cb;
-   resp = PUSH(stack, shape_cb_size*subvect_size);
-   tresp = PUSH(stack, shape_cb_size*nsf);
-   E = PUSH(stack, shape_cb_size);
-   Ee = PUSH(stack, shape_cb_size);
-   t = PUSH(stack, nsf);
-   r = PUSH(stack, nsf);
-   e = PUSH(stack, nsf);
-   gains = PUSH(stack, nb_subvect);
-   ind = (int*)PUSH(stack, nb_subvect);
-   
+   have_sign = params->have_sign;
+   ALLOC(resp, shape_cb_size*subvect_size, spx_word16_t);
+#ifdef _USE_SSE
+   ALLOC(resp2, (shape_cb_size*subvect_size)>>2, __m128);
+   ALLOC(E, shape_cb_size>>2, __m128);
+#else
+   resp2 = resp;
+   ALLOC(E, shape_cb_size, spx_word32_t);
+#endif
+   ALLOC(t, nsf, spx_word16_t);
+   ALLOC(e, nsf, spx_sig_t);
+   ALLOC(r2, nsf, spx_sig_t);
+   ALLOC(ind, nb_subvect, int);
 
-   for (i=0;i<nsf;i++)
-      t[i]=target[i];
-   for (i=0;i<shape_cb_size;i++)
+   ALLOC(tmp, 2*N*nsf, spx_word16_t);
+   for (i=0;i<N;i++)
    {
-      float *res = resp+i*subvect_size;
-      residue_zero(shape_cb+i*subvect_size, awk1, res, subvect_size, p);
-      syn_filt_zero(res, ak, res, subvect_size, p);
-      syn_filt_zero(res, awk2, res, subvect_size,p);
-      E[i]=0;
-      for(j=0;j<subvect_size;j++)
-         E[i]+=res[j]*res[j];
-      Ee[i]=0;
-      for(j=0;j<subvect_size;j++)
-         Ee[i]+=shape_cb[i*subvect_size+j]*shape_cb[i*subvect_size+j];
-      
+      ot2[i]=tmp+2*i*nsf;
+      nt2[i]=tmp+(2*i+1)*nsf;
    }
-#if 0
+   ot=ot2;
+   nt=nt2;
+   ALLOC(best_index, N, int);
+   ALLOC(best_dist, N, spx_word32_t);
+   ALLOC(best_nind, N, int);
+   ALLOC(best_ntarget, N, int);
+   ALLOC(ndist, N, spx_word32_t);
+   ALLOC(odist, N, spx_word32_t);
+   
+   ALLOC(itmp, 2*N*nb_subvect, int);
+   for (i=0;i<N;i++)
    {
-      int half,k;
-   for (half=0;half<2;half++)
+      nind[i]=itmp+2*i*nb_subvect;
+      oind[i]=itmp+(2*i+1)*nb_subvect;
+   }
+   
+   for (i=0;i<nsf;i++)
+      t[i]=target[i];
+
+   for (j=0;j<N;j++)
+      speex_move(&ot[j][0], t, nsf*sizeof(spx_word16_t));
+
+   /* Pre-compute codewords response and energy */
+   compute_weighted_codebook(shape_cb, r, resp, resp2, E, shape_cb_size, subvect_size, stack);
+
+   for (j=0;j<N;j++)
+      odist[j]=0;
+   
+   /*For all subvectors*/
+   for (i=0;i<nb_subvect;i++)
    {
-      int nb_half=nb_subvect/2;
-      int max_subvect=0;
-      float max_energy=0;
-      syn_filt_zero(t+half*nsf/2, awk1, r, nsf/2, p);
-      residue_zero(r, ak, r, nsf/2, p);
-      residue_zero(r, awk2, r, nsf/2,p);
-      for (i=0;i<nb_half;i++)
+      /*"erase" nbest list*/
+      for (j=0;j<N;j++)
+         ndist[j]=VERY_LARGE32;
+
+      /*For all n-bests of previous subvector*/
+      for (j=0;j<N;j++)
       {
-         float energy=0;
-         for (k=0;k<subvect_size;k++)
-            energy+=r[subvect_size*i+k]*r[subvect_size*i+k];
-         if (energy>max_energy)
+         spx_word16_t *x=ot[j]+subvect_size*i;
+         spx_word32_t tener = 0;
+         for (m=0;m<subvect_size;m++)
+            tener = MAC16_16(tener, x[m],x[m]);
+#ifdef FIXED_POINT
+         tener = SHR32(tener,1);
+#else
+         tener *= .5;
+#endif
+         /*Find new n-best based on previous n-best j*/
+         if (have_sign)
+            vq_nbest_sign(x, resp2, subvect_size, shape_cb_size, E, N, best_index, best_dist, stack);
+         else
+            vq_nbest(x, resp2, subvect_size, shape_cb_size, E, N, best_index, best_dist, stack);
+
+         /*For all new n-bests*/
+         for (k=0;k<N;k++)
          {
-            max_subvect=i;
-            max_energy=energy;
+            /* Compute total distance (including previous sub-vectors */
+            spx_word32_t err = ADD32(ADD32(odist[j],best_dist[k]),tener);
+            
+            /*update n-best list*/
+            if (err<ndist[N-1])
+            {
+               for (m=0;m<N;m++)
+               {
+                  if (err < ndist[m])
+                  {
+                     for (n=N-1;n>m;n--)
+                     {
+                        ndist[n] = ndist[n-1];
+                        best_nind[n] = best_nind[n-1];
+                        best_ntarget[n] = best_ntarget[n-1];
+                     }
+                     ndist[m] = err;
+                     best_nind[n] = best_index[k];
+                     best_ntarget[n] = j;
+                     break;
+                  }
+               }
+            }
          }
+         if (i==0)
+            break;
       }
-      printf ("max_energy: %d %f\n", max_subvect, max_energy);
-      
-      for (i=0;i<nb_half;i++)
+      for (j=0;j<N;j++)
       {
-         int nb_times=1;
-         if (i==max_subvect)
-            nb_times++;
-
-         for (k=0;k<nb_times;k++)
-         {
-         int best_index=0;
-         float g, corr, best_gain=0, score, best_score=-1;
-         for (j=0;j<shape_cb_size;j++)
+         /*previous target (we don't care what happened before*/
+         for (m=(i+1)*subvect_size;m<nsf;m++)
+            nt[j][m]=ot[best_ntarget[j]][m];
+         
+         /* New code: update the rest of the target only if it's worth it */
+         for (m=0;m<subvect_size;m++)
          {
-            corr=xcorr(resp+j*subvect_size,t+subvect_size*(i+half*nb_half),subvect_size);
-            score=corr*corr/(.001+E[j]);
-            g = corr/(.001+E[j]);
-            if (score>best_score)
+            spx_word16_t g;
+            int rind;
+            spx_word16_t sign=1;
+            rind = best_nind[j];
+            if (rind>=shape_cb_size)
             {
-               best_index=j;
-               best_score=score;
-               best_gain=corr/(.001+E[j]);
+               sign=-1;
+               rind-=shape_cb_size;
             }
+
+            q=subvect_size-m;
+#ifdef FIXED_POINT
+            g=sign*shape_cb[rind*subvect_size+m];
+#else
+            g=sign*0.03125*shape_cb[rind*subvect_size+m];
+#endif
+            target_update(nt[j]+subvect_size*(i+1), g, r+q, nsf-subvect_size*(i+1));
          }
-         for (j=0;j<nsf;j++)
-            e[j]=0;
-         for (j=0;j<subvect_size;j++)
-            e[subvect_size*(i+half*nb_half)+j]=best_gain*shape_cb[best_index*subvect_size+j];
-         residue_zero(e, awk1, r, nsf, p);
-         syn_filt_zero(r, ak, r, nsf, p);
-         syn_filt_zero(r, awk2, r, nsf,p);
-         for (j=0;j<nsf;j++)
-            t[j]-=r[j];
-         for (j=0;j<nsf;j++)
-            exc[j]+=e[j];
-         }
+
+         for (q=0;q<nb_subvect;q++)
+            nind[j][q]=oind[best_ntarget[j]][q];
+         nind[j][i]=best_nind[j];
       }
+
+      /*update old-new data*/
+      /* just swap pointers instead of a long copy */
+      {
+         spx_word16_t **tmp2;
+         tmp2=ot;
+         ot=nt;
+         nt=tmp2;
+      }
+      for (j=0;j<N;j++)
+         for (m=0;m<nb_subvect;m++)
+            oind[j][m]=nind[j][m];
+      for (j=0;j<N;j++)
+         odist[j]=ndist[j];
    }
+
+   /*save indices*/
+   for (i=0;i<nb_subvect;i++)
+   {
+      ind[i]=nind[0][i];
+      speex_bits_pack(bits,ind[i],params->shape_bits+have_sign);
    }
-#else
+   
+   /* Put everything back together */
    for (i=0;i<nb_subvect;i++)
    {
-      int best_index=0;
-      float g, corr, best_gain=0, score, best_score=-1;
-      for (j=0;j<shape_cb_size;j++)
+      int rind;
+      spx_word16_t sign=1;
+      rind = ind[i];
+      if (rind>=shape_cb_size)
       {
-         corr=xcorr(resp+j*subvect_size,t+subvect_size*i,subvect_size);
-         score=corr*corr/(.001+E[j]);
-         g = corr/(.001+E[j]);
-         if (score>best_score)
-         {
-            best_index=j;
-            best_score=score;
-            best_gain=corr/(.001+E[j]);
-         }
+         sign=-1;
+         rind-=shape_cb_size;
       }
-      frame_bits_pack(bits,best_index,params->shape_bits);
-      if (best_gain>0)
-         frame_bits_pack(bits,0,1);
-      else
-          frame_bits_pack(bits,1,1);        
-      ind[i]=best_index;
-      gains[i]=best_gain;
-
-      for (j=0;j<nsf;j++)
-         e[j]=0;
-      for (j=0;j<subvect_size;j++)
-         e[subvect_size*i+j]=best_gain*shape_cb[best_index*subvect_size+j];
-      residue_zero(e, awk1, r, nsf, p);
-      syn_filt_zero(r, ak, r, nsf, p);
-      syn_filt_zero(r, awk2, r, nsf,p);
-      for (j=0;j<nsf;j++)
-         tresp[i*nsf+j]=r[j];
-      for (j=0;j<nsf;j++)
-         t[j]-=r[j];
-      /*for (j=0;j<nsf;j++)
-        exc[j]+=e[j];*/
-   }
-   {
-      float A[10][10];
-      float b[10];
-      float c[10];
-      for (i=0;i<10;i++)
-         for (j=0;j<10;j++)
-            A[i][j]=xcorr(tresp+i*nsf, tresp+j*nsf, nsf);
-      for (i=0;i<10;i++)
-         b[i]=xcorr(target,tresp+i*nsf,nsf);
-      for (i=0;i<10;i++)
-         A[i][i]+=.01;
-
-
-      solve(&A[0][0],b,c, 10);
-      for (i=0;i<10;i++)
-        gains[i]*=c[i];
-      
-      for (i=0;i<10;i++)
-         gains[i]*=Ee[ind[i]];
-
-
-
-   {
-      int best_vq_index=0, max_index;
-      float max_gain=0, log_max, min_dist=0, *sign;
-
-      if (gain_cb) /*If no gain codebook, do not quantize (for testing/debugging) */
+#ifdef FIXED_POINT
+      if (sign==1)
       {
-         sign = PUSH(stack, nb_subvect);
-         for (i=0;i<nb_subvect;i++)
-         {
-            if (gains[i]<0)
-            {
-               gains[i]=-gains[i];
-               sign[i]=-1;
-            } else {
-               sign[i]=1;
-            }
-         }
-         for (i=0;i<nb_subvect;i++)
-            if (gains[i]>max_gain)
-               max_gain=gains[i];
-         log_max=log(max_gain+1);
-         max_index = (int)(floor(.5+log_max-3));
-         if (max_index>7)
-            max_index=7;
-         if (max_index<0)
-            max_index=0;
-         max_gain=1/exp(max_index+3.0);
-         for (i=0;i<nb_subvect;i++)
-            gains[i]*=max_gain;
-         frame_bits_pack(bits,max_index,3);
-
-         /*Vector quantize gains[i]*/
-         if (nb_subvect<=5)
-         {
-         best_vq_index = vq_index(gains, gain_cb, nb_subvect, gain_cb_size);
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect;i++)
-            gains[i]= sign[i]*gain_cb[best_vq_index*nb_subvect+i]/max_gain/(Ee[ind[i]]+.001);
-         } else
-         {
-            float tmp[5];
-            int best_vq_index2;
-         best_vq_index = vq_index(gains, gain_cb, nb_subvect/2, gain_cb_size);
-         for (i=0;i<5;i++)
-            tmp[i]=gains[i]-gain_cb[best_vq_index*nb_subvect/2+i];
-         best_vq_index2 = vq_index(tmp, exc_gains_wb2_table, nb_subvect/2, 256);
-
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect/2;i++)
-            gains[i]= sign[i]*(gain_cb[best_vq_index*nb_subvect/2+i]+exc_gains_wb2_table[best_vq_index2*nb_subvect/2+i])/max_gain/(Ee[ind[i]]+.001);
-
-
-         best_vq_index = vq_index(gains+5, gain_cb, nb_subvect/2, gain_cb_size);
-         frame_bits_pack(bits,best_vq_index,params->gain_bits);
-         for (i=0;i<5;i++)
-            tmp[i]=gains[i+5]-gain_cb[best_vq_index*nb_subvect/2+i];
-         best_vq_index2 = vq_index(tmp, exc_gains_wb2_table, nb_subvect/2, 256);
-
-         printf ("best_gains_vq_index %d %f %d\n", best_vq_index, min_dist, max_index);
-         for (i=0;i<nb_subvect/2;i++)
-            gains[i+5]= sign[i+5]*(gain_cb[best_vq_index*nb_subvect/2+i]+exc_gains_wb2_table[best_vq_index2*nb_subvect/2+i])/max_gain/(Ee[ind[i+5]]+.001);
-         }
-         
-         
+         for (j=0;j<subvect_size;j++)
+            e[subvect_size*i+j]=SHL32(EXTEND32(shape_cb[rind*subvect_size+j]),SIG_SHIFT-5);
       } else {
-         
-         for (i=0;i<10;i++)
-            gains[i]/=Ee[ind[i]]+.001;
-         
-      }
-   }
-      for (i=0;i<10;i++)
          for (j=0;j<subvect_size;j++)
-            e[subvect_size*i+j]=gains[i]*shape_cb[ind[i]*subvect_size+j];
-
-      for (j=0;j<nsf;j++)
-         exc[j]+=e[j];
-      residue_zero(e, awk1, r, nsf, p);
-      syn_filt_zero(r, ak, r, nsf, p);
-      syn_filt_zero(r, awk2, r, nsf,p);
-      for (j=0;j<nsf;j++)
-         target[j]-=r[j];
-
-   }
+            e[subvect_size*i+j]=NEG32(SHL32(EXTEND32(shape_cb[rind*subvect_size+j]),SIG_SHIFT-5));
+      }
+#else
+      for (j=0;j<subvect_size;j++)
+         e[subvect_size*i+j]=sign*0.03125*shape_cb[rind*subvect_size+j];
 #endif
-
-   /*TODO: Perform joint optimization of gains*/
+   }   
+   /* Update excitation */
+   for (j=0;j<nsf;j++)
+      exc[j]=ADD32(exc[j],e[j]);
    
-   /*for (i=0;i<nsf;i++)
-      target[i]=t[i];
-   */
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
+   /* Update target: only update target if necessary */
+   if (update_target)
+   {
+      syn_percep_zero(e, ak, awk1, awk2, r2, nsf,p, stack);
+      for (j=0;j<nsf;j++)
+         target[j]=SUB16(target[j],EXTRACT16(PSHR32(r2[j],8)));
+   }
 }
 
 
-
-
-void split_cb_unquant(
-float *exc,
-void *par,                      /* non-overlapping codebook */
+void split_cb_shape_sign_unquant(
+spx_sig_t *exc,
+const void *par,                      /* non-overlapping codebook */
 int   nsf,                      /* number of samples in subframe */
-FrameBits *bits,
-float *stack
+SpeexBits *bits,
+char *stack,
+spx_int32_t *seed
 )
 {
    int i,j;
-   int *ind;
-   float *gains;
-   float *sign;
-   int max_gain_ind, vq_gain_ind;
-   float max_gain, *Ee;
-   float *shape_cb, *gain_cb;
-   int shape_cb_size, gain_cb_size, subvect_size, nb_subvect;
-   split_cb_params *params;
-
-   params = (split_cb_params *) par;
+   VARDECL(int *ind);
+   VARDECL(int *signs);
+   const signed char *shape_cb;
+   int shape_cb_size, subvect_size, nb_subvect;
+   const split_cb_params *params;
+   int have_sign;
+
+   params = (const split_cb_params *) par;
    subvect_size = params->subvect_size;
    nb_subvect = params->nb_subvect;
    shape_cb_size = 1<<params->shape_bits;
    shape_cb = params->shape_cb;
-   gain_cb_size = 1<<params->gain_bits;
-   gain_cb = params->gain_cb;
-   
-   ind = (int*)PUSH(stack, nb_subvect);
-   gains = PUSH(stack, nb_subvect);
-   sign = PUSH(stack, nb_subvect);
-   Ee=PUSH(stack, nb_subvect);
+   have_sign = params->have_sign;
 
+   ALLOC(ind, nb_subvect, int);
+   ALLOC(signs, nb_subvect, int);
+
+   /* Decode codewords and gains */
    for (i=0;i<nb_subvect;i++)
    {
-      ind[i] = frame_bits_unpack_unsigned(bits, params->shape_bits);
-      if (frame_bits_unpack_unsigned(bits, 1))
-         sign[i]=-1;
+      if (have_sign)
+         signs[i] = speex_bits_unpack_unsigned(bits, 1);
       else
-         sign[i]=1;
-      Ee[i]=.001;
+         signs[i] = 0;
+      ind[i] = speex_bits_unpack_unsigned(bits, params->shape_bits);
+   }
+   /* Compute decoded excitation */
+   for (i=0;i<nb_subvect;i++)
+   {
+      spx_word16_t s=1;
+      if (signs[i])
+         s=-1;
+#ifdef FIXED_POINT
+      if (s==1)
+      {
+         for (j=0;j<subvect_size;j++)
+            exc[subvect_size*i+j]=SHL32(EXTEND32(shape_cb[ind[i]*subvect_size+j]),SIG_SHIFT-5);
+      } else {
+         for (j=0;j<subvect_size;j++)
+            exc[subvect_size*i+j]=NEG32(SHL32(EXTEND32(shape_cb[ind[i]*subvect_size+j]),SIG_SHIFT-5));
+      }
+#else
       for (j=0;j<subvect_size;j++)
-         Ee[i]+=shape_cb[ind[i]*subvect_size+j]*shape_cb[ind[i]*subvect_size+j];
+         exc[subvect_size*i+j]+=s*0.03125*shape_cb[ind[i]*subvect_size+j];      
+#endif
    }
-   max_gain_ind = frame_bits_unpack_unsigned(bits, 3);
-   vq_gain_ind = frame_bits_unpack_unsigned(bits, params->gain_bits);
-   printf ("unquant gains ind: %d %d\n", max_gain_ind, vq_gain_ind);
+}
 
-   max_gain=exp(max_gain_ind+3.0);
-   for (i=0;i<nb_subvect;i++)
-      gains[i] = sign[i]*gain_cb[vq_gain_ind*nb_subvect+i]*max_gain/Ee[i];
-   
-   printf ("unquant gains: ");
-   for (i=0;i<nb_subvect;i++)
-      printf ("%f ", gains[i]);
-   printf ("\n");
+void noise_codebook_quant(
+spx_word16_t target[],                 /* target vector */
+spx_coef_t ak[],                       /* LPCs for this subframe */
+spx_coef_t awk1[],                     /* Weighted LPCs for this subframe */
+spx_coef_t awk2[],                     /* Weighted LPCs for this subframe */
+const void *par,                      /* Codebook/search parameters*/
+int   p,                        /* number of LPC coeffs */
+int   nsf,                      /* number of samples in subframe */
+spx_sig_t *exc,
+spx_word16_t *r,
+SpeexBits *bits,
+char *stack,
+int   complexity,
+int   update_target
+)
+{
+   int i;
+   VARDECL(spx_sig_t *tmp);
+   ALLOC(tmp, nsf, spx_sig_t);
+   for (i=0;i<nsf;i++)
+      tmp[i]=PSHR32(EXTEND32(target[i]),SIG_SHIFT);
+   residue_percep_zero(tmp, ak, awk1, awk2, tmp, nsf, p, stack);
 
-   for (i=0;i<nb_subvect;i++)
-      for (j=0;j<subvect_size;j++)
-         exc[subvect_size*i+j]+=gains[i]*shape_cb[ind[i]*subvect_size+j];
-   
-   POP(stack);
-   POP(stack);
-   POP(stack);
-   POP(stack);
+   for (i=0;i<nsf;i++)
+      exc[i]+=tmp[i];
+   for (i=0;i<nsf;i++)
+      target[i]=0;
+}
+
+
+void noise_codebook_unquant(
+spx_sig_t *exc,
+const void *par,                      /* non-overlapping codebook */
+int   nsf,                      /* number of samples in subframe */
+SpeexBits *bits,
+char *stack,
+spx_int32_t *seed
+)
+{
+   int i;
+   /* FIXME: This is bad, but I don't think the function ever gets called anyway */
+   for (i=0;i<nsf;i++)
+      exc[i]=SHL32(EXTEND32(speex_rand(1, seed)),SIG_SHIFT);
 }