Remove redundant mid-only flag when side VAD flag is set.
[opus.git] / silk / enc_API.c
index 3fa2715..1984a6d 100644 (file)
@@ -119,44 +119,6 @@ opus_int silk_QueryEncoder(
     return ret;
 }
 
-static void stereo_crossmix(const opus_int16 *in, opus_int16 *out, int channel, int len, int to_mono, int id)
-{
-   int i;
-   opus_int16                            delta, g1, g2;
-   const opus_int16                     *x1, *x2;
-
-   x1 = in+channel;
-   x2 = in+(1-channel);
-   g1 = to_mono ? 16384: 8192;
-   g2 = to_mono ? 0 : 8192;
-
-   /* We want to finish at 0.5 */
-   delta = (16384+(len>>1))/(len);
-   if (to_mono) {
-      delta = -delta;
-   }
-
-   i=0;
-   if (to_mono != 2)
-   {
-      if ( id==0 ) {
-         for ( ; i < len>>1; i++ ) {
-            out[ i ] = silk_RSHIFT_ROUND( silk_SMLABB( silk_SMULBB( x1[ 2*i ], g1 ), x2[ 2*i ], g2 ), 14 );
-            g1 += delta;
-            g2 -= delta;
-         }
-      }
-   }
-   if (to_mono) {
-      for ( ; i < len; i++ ) {
-         out[ i ] = silk_RSHIFT( (opus_int32)x1[ 2*i ] + (opus_int32)x2[ 2*i ], 1 );
-      }
-   } else {
-      for ( ; i < len; i++ ) {
-         out[ i ] = x1[ 2*i ];
-      }
-   }
-}
 
 /**************************/
 /* Encode frame with Silk */
@@ -176,8 +138,8 @@ opus_int silk_Encode(
     opus_int   speech_act_thr_for_switch_Q8;
     opus_int32 TargetRate_bps, MStargetRates_bps[ 2 ], channelRate_bps, LBRR_symbol;
     silk_encoder *psEnc = ( silk_encoder * )encState;
-    opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ ];
-    opus_int transition;
+    opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ + MAX_ENCODER_DELAY];
+    opus_int transition, delay;
 
     psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded = psEnc->state_Fxx[ 1 ].sCmn.nFramesEncoded = 0;
 
@@ -192,7 +154,10 @@ opus_int silk_Encode(
         ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ] );
         silk_memset( psEnc->sStereo.pred_prev_Q13, 0, sizeof( psEnc->sStereo.pred_prev_Q13 ) );
         silk_memset( psEnc->sStereo.sSide, 0, sizeof( psEnc->sStereo.sSide ) );
-        silk_memset( psEnc->sStereo.mid_side_amp_Q0, 0, sizeof( psEnc->sStereo.mid_side_amp_Q0 ) );
+        psEnc->sStereo.mid_side_amp_Q0[ 0 ] = 0;
+        psEnc->sStereo.mid_side_amp_Q0[ 1 ] = 1;
+        psEnc->sStereo.mid_side_amp_Q0[ 2 ] = 0;
+        psEnc->sStereo.mid_side_amp_Q0[ 3 ] = 1;
         psEnc->sStereo.width_prev_Q14 = 0;
         psEnc->sStereo.smth_width_Q14 = SILK_FIX_CONST( 1, 14 );
         if( psEnc->nChannelsAPI == 2 ) {
@@ -257,9 +222,11 @@ opus_int silk_Encode(
                 psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i ] = 0;
             }
         }
+        psEnc->state_Fxx[ n ].sCmn.inDTX = psEnc->state_Fxx[ n ].sCmn.useDTX;
     }
     silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
 
+    delay = psEnc->state_Fxx[ 0 ].sCmn.delay;
     /* Input buffering/resampling and encoding */
     while( 1 ) {
         nSamplesToBuffer  = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
@@ -268,18 +235,16 @@ opus_int silk_Encode(
         /* Resample and write to buffer */
         if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 2 ) {
             int id = psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded;
-            if ( encControl->toMono > 0) {
-                stereo_crossmix( samplesIn, buf, 0, nSamplesFromInput, encControl->toMono, id );
-            } else if( psEnc->nPrevChannelsInternal == 1 || encControl->toMono == -1 ) {
-                stereo_crossmix( samplesIn, buf, 0, nSamplesFromInput, 0, id );
-            } else {
-                for( n = 0; n < nSamplesFromInput; n++ ) {
-                    buf[ n ] = samplesIn[ 2 * n ];
-                }
+            for( n = 0; n < nSamplesFromInput; n++ ) {
+                buf[ n+delay ] = samplesIn[ 2 * n ];
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[ MAX_ENCODER_DELAY - delay ], delay * sizeof(opus_int16));
             /* Making sure to start both resamplers from the same state when switching from mono to stereo */
-            if(psEnc->nPrevChannelsInternal == 1 && id==0)
-               silk_memcpy(&psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+            if(psEnc->nPrevChannelsInternal == 1 && id==0) {
+               silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+               silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.delayBuf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf, MAX_ENCODER_DELAY*sizeof(opus_int16));
+            }
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf + nSamplesFromInput + delay - MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
 
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
@@ -287,30 +252,46 @@ opus_int silk_Encode(
 
             nSamplesToBuffer  = psEnc->state_Fxx[ 1 ].sCmn.frame_length - psEnc->state_Fxx[ 1 ].sCmn.inputBufIx;
             nSamplesToBuffer  = silk_min( nSamplesToBuffer, 10 * nBlocksOf10ms * psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
-            if ( encControl->toMono > 0) {
-                stereo_crossmix( samplesIn, buf, 1, nSamplesFromInput, encControl->toMono, id );
-            } else if( psEnc->nPrevChannelsInternal == 1  || encControl->toMono == -1) {
-                stereo_crossmix( samplesIn, buf, 1, nSamplesFromInput, 0, id );
-            } else {
-                for( n = 0; n < nSamplesFromInput; n++ ) {
-                    buf[ n ] = samplesIn[ 2 * n + 1 ];
-                }
+            for( n = 0; n < nSamplesFromInput; n++ ) {
+                buf[ n + delay ] = samplesIn[ 2 * n + 1 ];
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 1 ].sCmn.delayBuf[ MAX_ENCODER_DELAY - delay ], delay * sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            silk_memcpy(psEnc->state_Fxx[ 1 ].sCmn.delayBuf, buf + nSamplesFromInput + delay - MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
+
             psEnc->state_Fxx[ 1 ].sCmn.inputBufIx += nSamplesToBuffer;
         } else if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 1 ) {
             /* Combine left and right channels before resampling */
             for( n = 0; n < nSamplesFromInput; n++ ) {
-                buf[ n ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ],  1 );
+                buf[ n + delay ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ],  1 );
+            }
+            if(psEnc->nPrevChannelsInternal == 2 && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded==0) {
+               for ( n = 0; n<MAX_ENCODER_DELAY; n++ )
+                  psEnc->state_Fxx[ 0 ].sCmn.delayBuf[ n ] = silk_RSHIFT(psEnc->state_Fxx[ 0 ].sCmn.delayBuf[ n ]+(opus_int32)psEnc->state_Fxx[ 1 ].sCmn.delayBuf[ n ], 1);
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[ MAX_ENCODER_DELAY - delay ], delay * sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            /* On the first mono frame, average the results for the two resampler states */
+            if (psEnc->nPrevChannelsInternal == 2 && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded==0) {
+               ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state,
+                   &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+               for ( n = 0; n < psEnc->state_Fxx[ 0 ].sCmn.frame_length; n++ ) {
+                  psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx+n+2 ] =
+                        silk_RSHIFT(psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx+n+2 ]
+                                  + psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx+n+2 ], 1);
+               }
+            }
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf + nSamplesFromInput + delay - MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
             psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
         } else {
             silk_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 );
+            silk_memcpy(buf + delay, samplesIn, nSamplesFromInput*sizeof(opus_int16));
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[ MAX_ENCODER_DELAY - delay ], delay * sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
-                &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], samplesIn, nSamplesFromInput );
+                &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf + nSamplesFromInput + delay - MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
             psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
         }
 
@@ -350,6 +331,8 @@ opus_int silk_Encode(
                 for( i = 0; i < psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket; i++ ) {
                     for( n = 0; n < encControl->nChannelsInternal; n++ ) {
                         if( psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i ] ) {
+                            opus_int condCoding;
+
                             if( encControl->nChannelsInternal == 2 && n == 0 ) {
                                 silk_stereo_encode_pred( psRangeEnc, psEnc->sStereo.predIx[ i ] );
                                 /* For LBRR data there's no need to code the mid-only flag if the side-channel LBRR flag is set */
@@ -357,7 +340,13 @@ opus_int silk_Encode(
                                     silk_stereo_encode_mid_only( psRangeEnc, psEnc->sStereo.mid_only_flags[ i ] );
                                 }
                             }
-                            silk_encode_indices( &psEnc->state_Fxx[ n ].sCmn, psRangeEnc, i, 1 );
+                            /* Use conditional coding if a previous frame is available */
+                            if( i > 0 && psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i - 1 ] ) {
+                                condCoding = CODE_CONDITIONALLY;
+                            } else {
+                                condCoding = CODE_INDEPENDENTLY;
+                            }
+                            silk_encode_indices( &psEnc->state_Fxx[ n ].sCmn, psRangeEnc, i, 1, condCoding );
                             silk_encode_pulses( psRangeEnc, psEnc->state_Fxx[ n ].sCmn.indices_LBRR[i].signalType, psEnc->state_Fxx[ n ].sCmn.indices_LBRR[i].quantOffsetType,
                                 psEnc->state_Fxx[ n ].sCmn.pulses_LBRR[ i ], psEnc->state_Fxx[ n ].sCmn.frame_length );
                         }
@@ -394,17 +383,39 @@ opus_int silk_Encode(
             if( encControl->nChannelsInternal == 2 ) {
                 silk_stereo_LR_to_MS( &psEnc->sStereo, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ 2 ], &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ 2 ],
                     psEnc->sStereo.predIx[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ], &psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ],
-                    MStargetRates_bps, TargetRate_bps, psEnc->state_Fxx[ 0 ].sCmn.speech_activity_Q8,
+                    MStargetRates_bps, TargetRate_bps, psEnc->state_Fxx[ 0 ].sCmn.speech_activity_Q8, encControl->toMono,
                     psEnc->state_Fxx[ 0 ].sCmn.fs_kHz, psEnc->state_Fxx[ 0 ].sCmn.frame_length );
-                if (!prefillFlag) {
+                if( psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] == 0 ) {
+                    /* Reset side channel encoder memory for first frame with side coding */
+                    if( psEnc->prev_decode_only_middle == 1 ) {
+                        silk_memset( &psEnc->state_Fxx[ 1 ].sShape,               0, sizeof( psEnc->state_Fxx[ 1 ].sShape ) );
+                        silk_memset( &psEnc->state_Fxx[ 1 ].sPrefilt,             0, sizeof( psEnc->state_Fxx[ 1 ].sPrefilt ) );
+                        silk_memset( &psEnc->state_Fxx[ 1 ].sCmn.sNSQ,            0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.sNSQ ) );
+                        silk_memset( psEnc->state_Fxx[ 1 ].sCmn.prev_NLSFq_Q15,   0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.prev_NLSFq_Q15 ) );
+                        silk_memset( &psEnc->state_Fxx[ 1 ].sCmn.sLP.In_LP_State, 0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.sLP.In_LP_State ) );
+                        silk_memset( &psEnc->state_Fxx[ 1 ].sCmn.inputBuf,        0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.inputBuf ) );
+                        psEnc->state_Fxx[ 1 ].sCmn.prevLag                = 100;
+                        psEnc->state_Fxx[ 1 ].sCmn.sNSQ.lagPrev           = 100;
+                        psEnc->state_Fxx[ 1 ].sShape.LastGainIndex        = 10;
+                        psEnc->state_Fxx[ 1 ].sCmn.prevSignalType         = TYPE_NO_VOICE_ACTIVITY;
+                        psEnc->state_Fxx[ 1 ].sCmn.sNSQ.prev_inv_gain_Q16 = 65536;
+                    }
+                    silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ] );
+                } else {
+                    psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] = 0;
+                }
+                if( !prefillFlag ) {
                     silk_stereo_encode_pred( psRangeEnc, psEnc->sStereo.predIx[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] );
-                    silk_stereo_encode_mid_only( psRangeEnc, psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] );
+                    if( psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] == 0 ) {
+                        silk_stereo_encode_mid_only( psRangeEnc, psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] );
+                    }
                 }
             } else {
                 /* Buffering */
                 silk_memcpy( psEnc->state_Fxx[ 0 ].sCmn.inputBuf, psEnc->sStereo.sMid, 2 * sizeof( opus_int16 ) );
                 silk_memcpy( psEnc->sStereo.sMid, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.frame_length ], 2 * sizeof( opus_int16 ) );
             }
+            silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ] );
 
             /* Encode */
             for( n = 0; n < encControl->nChannelsInternal; n++ ) {
@@ -415,16 +426,29 @@ opus_int silk_Encode(
                 }
 
                 if( channelRate_bps > 0 ) {
+                    opus_int condCoding;
+
                     silk_control_SNR( &psEnc->state_Fxx[ n ].sCmn, channelRate_bps );
 
-                    if( ( ret = silk_encode_frame_Fxx( &psEnc->state_Fxx[ n ], nBytesOut, psRangeEnc ) ) != 0 ) {
+                    /* Use independent coding if no previous frame is available */
+                    if( psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded - n <= 0 ) {
+                        condCoding = CODE_INDEPENDENTLY;
+                    } else if( n > 0 && psEnc->prev_decode_only_middle ) {
+                        /* If we skipped a side frame in this packet, we don't
+                           need LTP scaling; the LTP state is well-defined. */
+                        condCoding = CODE_INDEPENDENTLY_NO_LTP_SCALING;
+                    } else {
+                        condCoding = CODE_CONDITIONALLY;
+                    }
+                    if( ( ret = silk_encode_frame_Fxx( &psEnc->state_Fxx[ n ], nBytesOut, psRangeEnc, condCoding ) ) != 0 ) {
                         silk_assert( 0 );
                     }
-                    psEnc->state_Fxx[ n ].sCmn.nFramesEncoded++;
                 }
                 psEnc->state_Fxx[ n ].sCmn.controlled_since_last_payload = 0;
                 psEnc->state_Fxx[ n ].sCmn.inputBufIx = 0;
+                psEnc->state_Fxx[ n ].sCmn.nFramesEncoded++;
             }
+            psEnc->prev_decode_only_middle = psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded - 1 ];
 
             /* Insert VAD and FEC flags at beginning of bitstream */
             if( *nBytesOut > 0 && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded == psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket) {
@@ -437,8 +461,9 @@ opus_int silk_Encode(
                     flags  = silk_LSHIFT( flags, 1 );
                     flags |= psEnc->state_Fxx[ n ].sCmn.LBRR_flag;
                 }
-                if (!prefillFlag)
+                if( !prefillFlag ) {
                     ec_enc_patch_initial_bits( psRangeEnc, flags, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal );
+                }
 
                 /* Return zero bytes if all channels DTXed */
                 if( psEnc->state_Fxx[ 0 ].sCmn.inDTX && ( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 1 ].sCmn.inDTX ) ) {
@@ -468,12 +493,13 @@ opus_int silk_Encode(
             break;
         }
     }
+
     psEnc->nPrevChannelsInternal = encControl->nChannelsInternal;
 
     encControl->allowBandwidthSwitch = psEnc->allowBandwidthSwitch;
     encControl->inWBmodeWithoutVariableLP = psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == 16 && psEnc->state_Fxx[ 0 ].sCmn.sLP.mode == 0;
     encControl->internalSampleRate = silk_SMULBB( psEnc->state_Fxx[ 0 ].sCmn.fs_kHz, 1000 );
-    encControl->stereoWidth_Q14 = psEnc->sStereo.width_prev_Q14;
+    encControl->stereoWidth_Q14 = encControl->toMono ? 0 : psEnc->sStereo.smth_width_Q14;
     if( prefillFlag ) {
         encControl->payloadSize_ms = tmp_payloadSize_ms;
         encControl->complexity = tmp_complexity;