From 17c5966045b463fde45418000b03c95eb5cd7e09 Mon Sep 17 00:00:00 2001
From: JeanMarc Valin
Date: Fri, 17 Feb 2012 16:09:21 0500
Subject: [PATCH] Last updates for draft 11
 Draft updates
 Updated code to produce and check test vectors
 Making sure that the test vectors pass at all rates as well as for mono and stereo

Makefile.draft  2 +
celt/bands.c  8 +
celt/celt.c  2 +
configure.ac  2 +
doc/build_draft.sh  11 +
doc/draftietfcodecopus.xml  1648 ++++++++++++++++++++++
silk/dec_API.c  20 +
silk/decoder_set_fs.c  16 
src/opus_compare.c  70 ++
src/opus_decoder.c  2 +
src/opus_demo.c  164 ++++
tests/run_vectors.sh  43 +
12 files changed, 1176 insertions(+), 812 deletions()
diff git a/Makefile.draft b/Makefile.draft
index 0f084a5..501f76e 100644
 a/Makefile.draft
+++ b/Makefile.draft
@@ 20,7 +20,7 @@ CFLAGS := Drestrict= $(CFLAGS)
###################### END OF OPTIONS ######################
CFLAGS += DOPUS_VERSION='"0.9.8"'
+CFLAGS += DOPUS_VERSION='"0.9.9"'
include silk_sources.mk
include celt_sources.mk
include opus_sources.mk
diff git a/celt/bands.c b/celt/bands.c
index 1d49386..68b3626 100644
 a/celt/bands.c
+++ b/celt/bands.c
@@ 238,22 +238,22 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
celt_norm *X;
opus_val16 prev1;
opus_val16 prev2;
 opus_val16 Ediff;
+ opus_val32 Ediff;
opus_val16 r;
int renormalize=0;
prev1 = prev1logE[c*m>nbEBands+i];
prev2 = prev2logE[c*m>nbEBands+i];
 if (CnbEBands+i]);
prev2 = MAX16(prev2,prev2logE[m>nbEBands+i]);
}
 Ediff = logE[c*m>nbEBands+i]MIN16(prev1,prev2);
+ Ediff = EXTEND32(logE[c*m>nbEBands+i])EXTEND32(MIN16(prev1,prev2));
Ediff = MAX16(0, Ediff);
#ifdef FIXED_POINT
if (Ediff < 16384)
 r = 2*MIN16(16383,SHR32(celt_exp2(Ediff),1));
+ r = 2*MIN16(16383,SHR32(celt_exp2(EXTRACT16(Ediff)),1));
else
r = 0;
if (LM==3)
diff git a/celt/celt.c b/celt/celt.c
index 6c1eb6b..ddf65fa 100644
 a/celt/celt.c
+++ b/celt/celt.c
@@ 2392,7 +2392,7 @@ int celt_decode_with_ec(CELTDecoder * restrict st, const unsigned char *data, in
dec = &_dec;
}
 if (Cmode>nbEBands;i++)
oldBandE[i]=MAX16(oldBandE[i],oldBandE[st>mode>nbEBands+i]);
diff git a/configure.ac b/configure.ac
index 112b99d..c54bade 100644
 a/configure.ac
+++ b/configure.ac
@@ 9,7 +9,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
OPUS_MAJOR_VERSION=0
OPUS_MINOR_VERSION=9
OPUS_MICRO_VERSION=8
+OPUS_MICRO_VERSION=9
OPUS_EXTRA_VERSION=
OPUS_VERSION="$OPUS_MAJOR_VERSION.$OPUS_MINOR_VERSION.$OPUS_MICRO_VERSION$OPUS_EXTRA_VERSION"
diff git a/doc/build_draft.sh b/doc/build_draft.sh
index 4d95574..7809ee8 100755
 a/doc/build_draft.sh
+++ b/doc/build_draft.sh
@@ 50,6 +50,17 @@ cat opus_source.tar.gz base64  tr d '\n'  fold w 64  \
#echo '' >> opus_compare_escaped.c
#echo '' >> opus_compare_escaped.c
+echo '' >> testvectors_sha1
+
echo running xml2rfc
xml2rfc draftietfcodecopus.xml draftietfcodecopus.html &
xml2rfc draftietfcodecopus.xml
diff git a/doc/draftietfcodecopus.xml b/doc/draftietfcodecopus.xml
index a6739a1..448c2e9 100644
 a/doc/draftietfcodecopus.xml
+++ b/doc/draftietfcodecopus.xml
@@ 2,7 +2,7 @@

+Definition of the Opus Audio Codec
@@ 53,7 +53,7 @@

+
General
@@ 65,7 +65,7 @@ This document defines the Opus interactive speech and audio codec.
Opus is designed to handle a wide range of interactive audio applications,
including Voice over IP, videoconferencing, ingame chat, and even live,
distributed music performances.
It scales from low bitrate narrowband speech at 6 kb/s to very high quality
+It scales from low bitrate narrowband speech at 6 kb/s to very high quality
stereo music at 510 kb/s.
Opus uses both linear prediction (LP) and the Modified Discrete Cosine
Transform (MDCT) to achieve good compression of both speech and music.
@@ 78,7 +78,7 @@ Opus uses both linear prediction (LP) and the Modified Discrete Cosine
The Opus codec is a realtime interactive audio codec designed to meet the requirements
described in .
+described in .
It is composed of a linear
prediction (LP)based layer and a Modified Discrete Cosine Transform
(MDCT)based layer.
@@ 96,11 +96,11 @@ The primary normative part of this specification is provided by the source code
in .
Only the decoder portion of this software is normative, though a
significant amount of code is shared by both the encoder and decoder.

The decoder contains significant amounts of integer and fixedpoint arithmetic
 which must be performed exactly, including all rounding considerations, so any
 useful specification must make extensive use of domainspecific symbolic
 language to adequately define these operations.
+ provides a decoder conformance test.
+The decoder contains a great deal of integer and fixedpoint arithmetic which
+ must be performed exactly, including all rounding considerations, so any
+ useful specification requires domainspecific symbolic language to adequately
+ define these operations.
Additionally, any
conflict between the symbolic representation and the included reference
implementation must be resolved. For the practical reasons of compatibility and
@@ 112,7 +112,6 @@ For these reasons this RFC uses the reference implementation as the sole
symbolic representation of the codec.

While the symbolic representation is unambiguous and complete it is not
always the easiest way to understand the codec's operation. For this reason
this document also describes significant parts of the codec in English and
@@ 150,8 +149,8 @@ E.g., the text will explicitly indicate any shifts required after a
Expressions, where included in the text, follow C operator rules and
 precedence, with the exception that the syntax "x**y" is used to indicate x
 raised to the power y.
+ precedence, with the exception that the syntax "x**y" indicates x raised to
+ the power y.
The text also makes use of the following functions:
@@ 279,7 +278,8 @@ The LP layer is based on the
.
It supports NB, MB, or WB audio and frame sizes from 10 ms to 60 ms,
and requires an additional 5 ms lookahead for noise shaping estimation.
 A small additional delay (up to 1.2 ms) may be required for sampling rate conversion.
+A small additional delay (up to 1.5 ms) may be required for sampling rate
+ conversion.
Like Vorbis and many other modern codecs, SILK is inherently designed for
variablebitrate (VBR) coding, though the encoder can also produce
constantbitrate (CBR) streams.
@@ 360,70 +360,75 @@ Although the LP layer is VBR, the bit allocation of the MDCT layer can produce
The Opus codec includes a number of control parameters which can be changed dynamically during
regular operation of the codec, without interrupting the audio stream from the encoder to the decoder.
These parameters only affect the encoder since any impact they have on the bitstream is signalled
inband such that a decoder can decode any Opus stream without any outofband signalling. Any Opus
+These parameters only affect the encoder since any impact they have on the bitstream is signaled
+inband such that a decoder can decode any Opus stream without any outofband signaling. Any Opus
implementation can add or modify these control parameters without affecting interoperability. The most
important encoder control parameters in the reference encoder are listed below.

+
Opus supports all bitrates from 6 kb/s to 510 kb/s. All other parameters being
equal, higher bitrate results in higher quality. For a frame size of 20 ms, these
+Opus supports all bitrates from 6 kb/s to 510 kb/s. All other parameters being
+equal, higher bitrate results in higher quality. For a frame size of 20 ms, these
are the bitrate "sweet spots" for Opus in various configurations:
8-12 kb/s for narrowband speech
16-20 kb/s for wideband speech
28-40 kb/s for fullband speech
48-64 kb/s for fullband mono music
64-128 kb/s for fullband stereo music
+8-12 kb/s for NB speech,
+16-20 kb/s for WB speech,
+28-40 kb/s for FB speech,
+48-64 kb/s for FB mono music, and
+64-128 kb/s for FB stereo music.

+
Opus can transmit either mono or stereo audio within one stream. When
decoding a mono stream in stereo, the left and right channels will be
identical and when decoding a stereo channel in mono, the mono output
will be the average of the encoded left and right channels. In some cases
it is desirable to encode a stereo input stream in mono (e.g. because the
bitrate is insufficient for good quality stereo). The number of channels
encoded can be selected in realtime, but by default the reference encoder
attempts to make the best decision possible given the current bitrate.
+Opus can transmit either mono or stereo frames within a single stream.
+When decoding a mono frame in a stereo decoder, the left and right channels are
+ identical, and when decoding a stereo frame in a mono decoder, the mono output
+ is the average of the left and right channels.
+In some cases, it is desirable to encode a stereo input stream in mono (e.g.,
+ because the bitrate is too low to encode stereo with sufficient quality).
+The number of channels encoded can be selected in realtime, but by default the
+ reference encoder attempts to make the best decision possible given the
+ current bitrate.

+
The audio bandwidths supported by Opus are listed in
. Just like for the number of channels,
any decoder can decode audio encoded at any bandwidth. For example, any Opus
decoder operating at 8 kHz can decode a fullband Opus stream and any Opus decoder
operating at 48 kHz can decode a narrowband stream. Similarly, the reference encoder
can take a 48 kHz input signal and encode it in narrowband. The higher the audio
bandwidth, the higher the required bitrate to achieve acceptable quality.
+The audio bandwidths supported by Opus are listed in
+ .
+Just like for the number of channels, any decoder can decode audio encoded at
+ any bandwidth.
+For example, any Opus decoder operating at 8 kHz can decode a FB Opus
+ frame, and any Opus decoder operating at 48 kHz can decode a NB frame.
+Similarly, the reference encoder can take a 48 kHz input signal and
+ encode it as NB.
+The higher the audio bandwidth, the higher the required bitrate to achieve
+ acceptable quality.
The audio bandwidth can be explicitly specified in realtime, but by default
the reference encoder attempts to make the best bandwidth decision possible given
the current bitrate.
+ the reference encoder attempts to make the best bandwidth decision possible
+ given the current bitrate.

+
Opus can encode frames of 2.5, 5, 10, 20, 40 or 60 ms. It can also combine
multiple frames into packets of up to 120 ms. Because of the overhead from
IP/UDP/RTP headers, sending fewer packets per second reduces the
bitrate, but increases latency and sensitivity to packet losses as
losing one packet constitutes a loss of a bigger chunk of audio
signal. Increasing the frame duration also slightly improves coding
efficiency, but the gain becomes small for frame sizes above 20 ms. For
this reason, 20 ms frames tend to be a good choice for most applications.
+Opus can encode frames of 2.5, 5, 10, 20, 40 or 60 ms.
+It can also combine multiple frames into packets of up to 120 ms.
+For realtime applications, sending fewer packets per second reduces the
+ bitrate, since it reduces the overhead from IP, UDP, and RTP headers.
+However, it increases latency and sensitivity to packet losses, as losing one
+ packet constitutes a loss of a bigger chunk of audio.
+Increasing the frame duration also slightly improves coding efficiency, but the
+ gain becomes small for frame sizes above 20 ms.
+For this reason, 20 ms frames are a good choice for most applications.

+
There are various aspects of the Opus encoding process where tradeoffs
can be made between CPU complexity and quality/bitrate. In the reference
@@ 431,16 +436,17 @@ encoder, the complexity is selected using an integer from 0 to 10, where
0 is the lowest complexity and 10 is the highest. Examples of
computations for which such tradeoffs may occur are:
the filter order of the pitch analysis whitening filter the shortterm noise shaping filter;
+The order of the pitch analysis whitening filter,
+The order of the shortterm noise shaping filter, The number of states in delayed decision quantization of the
residual signal;
+residual signal, and The use of certain bitstream features such as variable timefrequency
resolution and pitch postfilter.
+resolution and the pitch postfilter.

+
Audio codecs often exploit interframe correlations to reduce the
bitrate at a cost in error propagation: after losing one packet
@@ 451,21 +457,21 @@ choose a tradeoff between bitrate and amount of error propagation.

+
 Another mechanism providing robustness against packet loss is the in
 band Forward Error Correction (FEC). Packets that are determined to
+ Another mechanism providing robustness against packet loss is the inband
+ Forward Error Correction (FEC). Packets that are determined to
contain perceptually important speech information, such as onsets or
transients, are encoded again at a lower bitrate and this reencoded
information is added to a subsequent packet.

+
Opus is more efficient when operating with variable bitrate (VBR), which is
the default. However, in some (rare) applications, constant bitrate (CBR)
is required. There are two main reasons to operate in CBR mode:
+the default. However, in some (rare) applications, constant bitrate (CBR)
+is required. There are two main reasons to operate in CBR mode:
When the transport only supports a fixed size for each compressed frameWhen security is important and the input audio
@@ 480,7 +486,7 @@ CBR due to the bit reservoir).

+
Discontinuous Transmission (DTX) reduces the bitrate during silence
or background noise. When DTX is enabled, only one frame is encoded
@@ 573,8 +579,8 @@ For example, configuration 0 has a 10 ms frame size and configuration 3
One additional bit, labeled "s", is used to signal mono vs. stereo, with 0
 indicating mono and 1 indicating stereo.
+One additional bit, labeled "s", signals mono vs. stereo, with 0 indicating
+ mono and 1 indicating stereo.
@@ 606,20 +612,23 @@ This section describes how frames are packed according to each possible value
When a packet contains multiple VBR frames (i.e., code 2 or 3), the compressed
 length of one or more of these frames is indicated with a one or two byte
+ length of one or more of these frames is indicated with a one or twobyte
sequence, with the meaning of the first byte as follows:
0: No frame (discontinuous transmission (DTX) or lost packet)

1...251: Length of the frame in bytes; 252...255: A second byte is needed. The total length is (len[1]*4)+len[0]
+The special length 0 indicates that no frame is available, either because it
+ was dropped during transmission by some intermediary or because the encoder
+ chose not to transmit it.
+A length of 0 is valid for any Opus frame in any mode.
+
+
+
The maximum representable length is 255*4+255=1275 bytes.
For 20 ms frames, this represents a bitrate of 510 kb/s, which is
approximately the highest useful rate for lossily compressed fullband stereo
@@ 691,7 +700,7 @@ The number of payload bytes available for compressed data, N1, MUST be even
For code 2 packets, the TOC byte is followed by a one or two byte sequence
+For code 2 packets, the TOC byte is followed by a one or twobyte sequence
indicating the length of the first frame (marked N1 in the figure below),
followed by N1 bytes of compressed data for the first frame.
The remaining N-N1-2 or N-N1-3 bytes are the compressed data for the
@@ 703,7 +712,7 @@ For example, a 1byte code 2 packet is always invalid, and a 2byte code 2
The length of the first frame, N1, MUST also be no larger than the size of the
payload remaining after decoding that length for all code 2 packets.
This makes, for example, a 2byte code 2 packet with a second byte in the range
 1...250 invalid as well (the only valid 2byte code 2 packet is one where the
+ 1...251 invalid as well (the only valid 2byte code 2 packet is one where the
length of both frames is zero).
 where 0 <= n < k1.
+ where 0 <= n < k.
Here, rc_Q30[k] are the reflection coefficients.
div_Q30[k] is the denominator for each iteration, and gain_Qb1[k] is its
multiplicative inverse (with b1[k] fractional bits, where b1[k] ranges from
@@ 3551,11 +3593,11 @@ If the resulting value is zero, it falls back to the absolute coding procedure
Otherwise, the final primary pitch lag is then
 where lag_prev is the primary pitch lag from the most recent frame in the same
 channel and delta_lag_index is the value just decoded.
+ where previous_lag is the primary pitch lag from the most recent frame in the
+ same channel and delta_lag_index is the value just decoded.
This allows a per-frame change in the pitch lag of -8 to +11 samples.
The decoder does no clamping at this point, so this value can fall outside the
range of 2 ms to 18 ms, and the decoder must use this unclamped
@@ 3953,7 +3995,7 @@ Frames that do not code the scaling parameter use the default factor of 15565
As described in , SILK uses a
linear congruential generator (LCG) to inject pseudorandom noise into the
 quantized excitation
+ quantized excitation.
To ensure synchronization of this process between the encoder and decoder, each
SILK frame stores a 2bit seed after the LTP parameters (if any).
The encoder may consider the choice of seed during quantization, and the
@@ 4238,8 +4280,10 @@ After the decoder reads the pulse locations for all blocks, it reads the LSBs
(if any) for each block in turn.
Inside each block, it reads all the LSBs for each coefficient in turn, even
those where no pulses were allocated, before proceeding to the next one.
They are coded from most significant to least significant, and they all use the
 PDF in .
+For 10 ms MB frames, it reads LSBs even for the extra 8 samples in
+ the last block.
+The LSBs are coded from most significant to least significant, and they all use
+ the PDF in .
@@ 4348,13 +4392,13 @@ The constant quantization offset varies depending on the signal type and
title="Excitation Quantization Offsets">
Signal TypeQuantization Offset Type
Quantization Offset (Q25)
InactiveLow100
InactiveHigh240
UnvoicedLow100
UnvoicedHigh240
VoicedLow32
VoicedHigh100
+Quantization Offset (Q23)
+InactiveLow25
+InactiveHigh60
+UnvoicedLow25
+UnvoicedHigh60
+VoicedLow8
+VoicedHigh25
@@ 4367,23 +4411,22 @@ Additionally, let seed be the current pseudorandom seed, which is initialized
to the value decoded from for the first sample in
the current SILK frame, and updated for each subsequent sample according to
the procedure below.
Finally, let offset_Q25 be the quantization offset from
+Finally, let offset_Q23 be the quantization offset from
.
Then the following procedure produces the final reconstructed excitation value,
 e_Q25[i]:
+ e_Q23[i]:
When e_raw[i] is zero, sign() returns 0 by the definition in
 , so the 80 term does not get added.
 offset does not get added.
The final e_Q25[i] value may require more than 16 bits per sample, but will not
 require more than 25, including the sign.
+ , so the factor of 20 does not get added.
+The final e_Q23[i] value may require more than 16 bits per sample, but will not
+ require more than 23, including the sign.
@@ 4439,31 +4482,24 @@ The LTP filter requires LPC residual values from before the current subframe as
However, since the LPCs may have changed, it obtains this residual by
"rewhitening" the corresponding output signal using the LPCs from the current
subframe.
Let e_Q25[i] be the excitation, and out[i] be the fully reconstructed output
 signal from previous subframes (see ), or
 zeros in the first subframe for this channel after either
+Let out[i] for
+ (j - pitch_lags[s] - d_LPC - 2) <= i < j
+ be the fully reconstructed output signal from the last
+ (pitch_lags[s] + d_LPC + 2) samples of previous subframes
+ (see ), where pitch_lags[s] is the pitch
+ lag for the current subframe from .
+During reconstruction of the first subframe for this channel after either
An uncoded regular SILK frame in the side channel, or
A decoder reset (see ).
+An uncoded regular SILK frame (if this is the side channel), or
+A decoder reset (see ),



Let LTP_scale_Q14 be the LTP scaling parameter from
 for the first two subframes in any SILK
 frame, as well as the last two subframes in a 20 ms SILK frame where
 w_Q2 == 4.
Otherwise let LTP_scale_Q14 be 16384 (corresponding to 1.0).
Then, for i such that
 (j  pitch_lags[s]  d_LPC  2) <= i < j,
 where pitch_lags[s] is the pitch lag for the current subframe from
 , out[i] is rewhitened into an LPC residual,
+ out[] is rewhitened into an LPC residual,
res[i], via
This requires storage to buffer up to 306 values of out[i] from previous
subframes.
This corresponds to WB with a maximum of 18 ms * 16 kHz
 samples of pitch lag, plus 2 samples for the width of the LTP filter, plus 16
 samples for d_LPC.
+This corresponds to WB with a maximum pitch lag of
+ 18 ms * 16 kHz samples, plus 16 samples for d_LPC, plus 2
+ samples for the width of the LTP filter.
Let b_Q7[k] be the coefficients of the LTP filter taken from the
 codebook entry in one of
+Let e_Q23[i] for j <= i < (j + n) be the
+ excitation for the current subframe, and b_Q7[k] for
+ 0 <= k < 5 be the coefficients of the LTP filter
+ taken from the codebook entry in one of
Tables
through
corresponding to the index decoded for the current subframe in
@@ 4490,11 +4528,11 @@ Then for i such that j <= i < (j + n),
the LPC residual is
@@ 4505,9 +4543,9 @@ For unvoiced frames, the LPC residual for
copy of the excitation signal, i.e.,
@@ 4518,11 +4556,12 @@ res[i] = 
LPC synthesis uses the shortterm LPC filter to predict the next output
coefficient.
For i such that (j - d_LPC) <= i < j, let
 lpc[i] be the result of LPC synthesis from the previous subframe, or zeros in
 the first subframe for this channel after either
+ lpc[i] be the result of LPC synthesis from the last d_LPC samples of the
+ previous subframe, or zeros in the first subframe for this channel after
+ either
An uncoded regular SILK frame in the side channel, or
A decoder reset (see ).
+An uncoded regular SILK frame (if this is the side channel), or
+A decoder reset (see ).
Then for i such that j <= i < (j + n), the
result of LPC synthesis for the current subframe is
@@ 4616,7 +4655,7 @@ Then for i such that j <= i < (j + n2),
right[i] = clamp(-1.0, (1 - w1)*mid[i-1] - side[i-1] - w0*p0, 1.0) .
]]>
These formulas require twp samples prior to index j, the start of the
+These formulas require two samples prior to index j, the start of the
frame, for the mid channel, and one prior sample for the side channel.
For the first frame after a decoder reset, zeros are used instead.
@@ 4641,7 +4680,7 @@ However, a minimum amount of delay is imposed to allow the resampler to
operate, and this delay is normative, so that the corresponding delay can be
applied to the MDCT layer in the encoder.
A decoder is always free to use a resampler which requires more delay than
 allowed for here (e.g., to improve quality), but then it most delay the output
+ allowed for here (e.g., to improve quality), but it must then delay the output
of the MDCT layer by this extra amount.
Keeping as much delay as possible on the encoder side allows an encoder which
knows it will never use any of the SILK or Hybrid modes to skip this delay.
@@ 4653,27 +4692,42 @@ By contrast, if it were all applied by the decoder, then a decoder which
gives the maximum resampler delay
in samples at 48 kHz for each SILK audio bandwidth.
The reference implementation is able to resample to any of the supported
 output sampling rates (8, 12, 16, 24, or 48 kHz) within or near this
 delay constraint.
Because the actual output rate may not be 48 kHz, it may not be possible
to achieve exactly these delays while using a whole number of input or output
samples.
+The reference implementation is able to resample to any of the supported
+ output sampling rates (8, 12, 16, 24, or 48 kHz) within or near this
+ delay constraint.
Some resampling filters (including those used by the reference implementation)
 may add a delay that is not itself an exact integer at either rate.
However, such deviations are unlikely to be perceptible.
+ may add a delay that is not an exact integer, or is not linearphase, and so
+ cannot be represented by a single delay at all frequencies.
+However, such deviations are unlikely to be perceptible, and the comparison
+ tool described in is designed to be relatively
+ insensitive to them.
The delays listed here are the ones that should be targeted by the encoder.
Audio Bandwidth
Delay in Samples at 48 kHz
NB18
MB32
WB24
+Delay in milliseconds
+NB0.538
+MB0.692
+WB0.706
+
+NB is given a smaller decoder delay allocation than MB and WB to allow a
+ higherorder filter when resampling to 8 kHz in both the encoder and
+ decoder.
+This implies that the audio content of two SILK frames operating at different
+ bandwidths is not perfectly aligned in time.
+This is not an issue for any transitions described in
+ , because they all involve a SILK decoder reset.
+When the decoder is reset, any samples remaining in the resampling buffer
+ are discarded, and the resampler is reinitialized with silence.
+
+
@@ 4699,9 +4753,9 @@ An overview of the decoder is given in .
 ^ 
++   
 Range   ++ v
 Decoder +  Bit  ++
++  Allocation  2^x 
  ++ ++
+ Decoder +  Bit  ++
+++  Allocation  2**x 
+  ++ ++
  
 v v ++
 ++ ++ ++  pitch 
@@ 4717,7 +4771,8 @@ An overview of the decoder is given in .
The decoder is based on the following symbols and sets of symbols:

+Symbol(s)PDFCondition
@@ 4742,7 +4797,6 @@ The decoder is based on the following symbols and sets of symbols:
residualanticollapse{1, 1}/2finalize
Order of the symbols in the CELT section of the bitstream.
@@ 4860,7 +4914,7 @@ Intraband masking is the strongest of the perceptual masking effects. This stru
means that the ideal allocation is more consistent from frame to frame than
it is for other codecs without an equivalent structure.
Because the bit allocation is used to drive the decoding of the rangecoder
+Because the bit allocation drives the decoding of the rangecoder
stream, it MUST be recovered exactly so that identical coding decisions are
made in the encoder and decoder. Any deviation from the reference's resulting
bit allocation will result in corrupted output, though implementers are
@@ 5010,7 +5064,7 @@ decode the trim value using the inverse CDF {127, 126, 124, 119, 109, 87, 41, 19
the allocation process, then one anticollapse bit is reserved in the allocation process so it can
be decoded later. Following the anti-collapse reservation, one bit is reserved for skip if available.
For stereo frames, bits are reserved for intensity stereo and for dual stereo. Intensity stereo
+For stereo frames, bits are reserved for intensity stereo and for dual stereo. Intensity stereo
requires ilog2(end-start) bits. Those bits are reserved if there are enough bits left. Following this, one
bit is reserved for dual stereo if available.
@@ 5092,7 +5146,7 @@ and the whole balance are applied, respectively.
Decoding of PVQ vectors is implemented in decode_pulses() (cwrs.c).
The unique codeword index is decoded as a uniformlydistributed integer value between 0 and
V(N,K)-1, where V(N,K) is the number of possible combinations of K pulses in
+V(N,K)-1, where V(N,K) is the number of possible combinations of K pulses in
N samples. The index is then converted to a vector in the same way specified in
. The indexing is based on the calculation of V(N,K)
(denoted N(L,K) in ).
@@ 5167,7 +5221,7 @@ R(x_N2, X_N1), ..., R(x_1, x_2).
If the decoded vector represents more
than one time block, then the following process is applied separately on each time block.
Also, if each block represents 8 samples or more, then another ND rotation, by
+Also, if each block represents 8 samples or more, then another ND rotation, by
(pi/2-theta), is applied before the rotation described above. This
extra rotation is applied in an interleaved manner with a stride equal to round(sqrt(N/nb_blocks))
@@ 5193,13 +5247,14 @@ of stereo audio.
The timefrequency (TF) parameters are used to control the timefrequency resolution tradeoff
in each coded band. For each band, there are two possible TF choices. For the first
band coded, the PDF is {3, 1}/4 for frames marked as transient and {15, 1}/16 for
the other frames. For subsequent bands, the TF choice is coded relative to the
+the other frames. For subsequent bands, the TF choice is coded relative to the
previous TF choice with probability {15, 1}/16 for transient frames and {31, 1}/32
otherwise. The mapping between the decoded TF choices and the adjustment in TF
resolution is shown in the tables below.

+Frame size (ms)01
@@ 5207,10 +5262,10 @@ resolution is shown in the tables below.
50110022002
TF adjustments for nontransient frames and tf_select=0

+Frame size (ms)01
@@ 5218,11 +5273,11 @@ resolution is shown in the tables below.
50210032003
TF adjustments for nontransient frames and tf_select=1

+Frame size (ms)01
@@ 5230,10 +5285,10 @@ resolution is shown in the tables below.
51010202030
TF adjustments for transient frames and tf_select=0

+Frame size (ms)01
@@ 5241,7 +5296,6 @@ resolution is shown in the tables below.
51110112011
TF adjustments for transient frames and tf_select=1
@@ 5250,9 +5304,9 @@ while a positive TF adjustment means that the frequency resolution is increased.
Changes in TF resolution are implemented using the Hadamard transform. To increase
the time resolution by N, N "levels" of the Hadamard transform are applied to the
decoded vector for each interleaved MDCT vector. To increase the frequency resolution
(assumes a transient frame), then N levels of the Hadamard transform are applied
+(assumes a transient frame), then N levels of the Hadamard transform are applied
across the interleaved MDCT vector. In the case of increased
time resolution the decoder uses the "sequency order" because the input vector
+time resolution the decoder uses the "sequency order" because the input vector
is sorted in time.
@@ 5286,18 +5340,18 @@ multiplied by the square root of the decoded energy. This is done by denormalise
The inverse MDCT implementation has no special characteristics. The
input is N frequencydomain samples and the output is 2*N timedomain
samples, while scaling by 1/2. A "lowoverlap" window is used to reduce the algorithmic delay.
+samples, while scaling by 1/2. A "lowoverlap" window reduces the algorithmic delay.
It is derived from a basic (full overlap) 240sample version of the window used by the Vorbis codec:
The lowoverlap window is created by zeropadding the basic window and inserting ones in the
middle, such that the resulting window still satisfies power complementarity. The IMDCT and
+The lowoverlap window is created by zeropadding the basic window and inserting ones in the
+middle, such that the resulting window still satisfies power complementarity. The IMDCT and
windowing are performed by mdct_backward (mdct.c).
@@ 5419,8 +5473,6 @@ periodic, and if so what the period is, using the OPUS_GET_PITCH() request.


Switching between the Opus coding modes, audio bandwidths, and channel counts
requires careful consideration to avoid audible glitches.
@@ 5446,7 +5498,7 @@ However, other transitions between SILKonly packets or between NB or MB SILK
new sample rate.
These switches SHOULD be delayed by the encoder until quiet periods or
transients, where the inevitable glitches will be less audible. Additionally,
 the bitstream MAY include redundant side information ("redundancy"), in the
+ the bitstream MAY include redundant side information ("redundancy"), in the
form of additional CELT frames embedded in each of the Opus frames around the
transition.
@@ 5468,7 +5520,7 @@ To avoid or reduces glitches during these problematic mode transitions, and
A transition between coding the lower frequencies with the LP model and the
 MDCT model or a transition that involves changing the SILK bandwidth
+ MDCT model or a transition that involves changing the SILK bandwidth
is only normatively specified when it includes redundancy.
For those without redundancy, it is RECOMMENDED that the decoder use a
concealment technique (e.g., make use of a PLC algorithm) to "fill in" the
@@ 5618,7 +5670,6 @@ If the redundancy belongs at the beginning (in a CELTonly to SILKonly or
Hybrid transition), the final reconstructed output uses the first 2.5 ms
of audio output by the decoder for the redundant frame as-is, discarding
the corresponding output from the SILKonly or Hybrid portion of the frame.

The remaining 2.5 ms is crosslapped with the decoded SILK/Hybrid signal
using the CELT's powercomplementary MDCT window to ensure a smooth
transition.
@@ 5661,8 +5712,8 @@ When switching from CELTonly mode to SILKonly or Hybrid mode with redundancy,
illustrates all of the normative
transitions involving a mode change, an audio bandwidth change, or both.
Each one uses an S, H, or C to represent an Opus frames in the corresponding
 modes.
+Each one uses an S, H, or C to represent an Opus frame in the corresponding
+ mode.
In addition, an R indicates the presence of redundancy in the Opus frame it is
crosslapped with.
Its location in the first or last 5 ms is assumed to correspond to whether
@@ 5673,9 +5724,11 @@ Finally, a c indicates the contents of the CELT overlap buffer after the
S > S ;S > S > S
 & &
+SILK to SILK with Redundancy: S > S > S
+ &
!R > R
+ &
+ ;S > S > S
NB or MB SILK to Hybrid with Redundancy: S > S > S
&
@@ 5687,9 +5740,11 @@ SILK to CELT with Redundancy: S > S > S
&
!R > C > C > C
Hybrid to NB or MB SILK with Redundancy: H > H > H ;S > S > S
 & &
+Hybrid to NB or MB SILK with Redundancy: H > H > H
+ &
!R > R
+ &
+ ;S > S > S
Hybrid to WB SILK: H > H > H > c
\ +
@@ 5759,6 +5814,7 @@ Key:
S SILKonly frame ; SILK decoder reset
H Hybrid frame  CELT and SILK decoder resets
C CELTonly frame ! CELT decoder reset
+c CELT overlap + Direct mixing
P Packet Loss Concealment & Windowed crosslap
]]>
@@ 5782,25 +5838,25 @@ Encoders SHOULD NOT use other transitions, e.g., those that involve redundancy
Just like the decoder, the Opus encoder also normally consists of two main blocks: the
SILK encoder and the CELT encoder. However, unlike the case of the decoder, a valid
(though potentially suboptimal) Opus encoder is not required to support all modes and
may thus only include a SILK encoder module or a CELT encoder module.
+may thus only include a SILK encoder module or a CELT encoder module.
The output bitstream of the Opus encoding contains bits from the SILK and CELT
 encoders, though these are not separable due to the use of a range coder.
+ encoders, though these are not separable due to the use of a range coder.
A block diagram of the encoder is illustrated below.

+
 rate >encoder+
 ++  conversion   
  Optional   ++ ++  ++
> highpass + +> Range 
 + filter +  ++ ++ encoder>
 ++   Delay   CELT  +>  bit
 +>compensation>encoder+ ++ stream
    
 ++ ++
+ ++ ++
+  Sample   SILK +
+ +> Rate > Encoder  V
+ ++   Conversion    ++
+  Optional   ++ ++  Range 
+> Highpass +  Encoder >
+ + Filter +  ++ ++   Bit
+ ++   Delay   CELT  ++ stream
+ +> Compensation > Encoder  ^
+    +
+ ++ ++
]]>
@@ 5813,7 +5869,7 @@ In the reference implementation, the frame size is selected by the application,
other configuration parameters (number of channels, bandwidth, mode) are automatically
selected (unless explicitly overridden by the application) and depend on the following:
Requested bitrate
+Requested bitrate; Input sampling rate; Type of signal (speech vs music); Frame size in use
@@ 5822,150 +5878,277 @@ selected (unless explicitly overridden by the application) depend on the followi
The type of signal currently needs to be provided by the application (though it can be
changed in realtime). An Opus encoder implementation could also do automatic detection,
but since Opus is an interactive codec, such an implementation would likely have to either
delay the signal (for noninteractive application) or delay the mode switching decisions (for
+delay the signal (for noninteractive applications) or delay the mode switching decisions (for
interactive applications).
When the encoder is configured for voice over IP applications, the input signal is
+When the encoder is configured for voice over IP applications, the input signal is
filtered by a highpass filter to remove the lowest part of the spectrum
that contains little speech energy and may contain background noise. This is a second order
Auto Regressive Moving Average (ARMA) filter with a cutoff frequency around 50 Hz.
In the future, a music detector may also be used to lower the cutoff frequency when the
+In the future, a music detector may also be used to lower the cutoff frequency when the
input signal is detected to be music rather than speech.

+
The range coder also acts as the bitpacker for Opus. It is
used in three different ways, to encode:
+The range coder acts as the bitpacker for Opus.
+It is used in three different ways: to encode
entropycoded symbols with a fixed probability model using ec_encode(), (entenc.c)
integers from 0 to 2**M1 using ec_enc_uint() or ec_enc_bits(), (entenc.c)
integers from 0 to N1 (where N is not a power of two) using ec_enc_uint(). (entenc.c)
+
+Entropycoded symbols with a fixed probability model using ec_encode()
+ (entenc.c),
+
+
+Integers from 0 to (2**M - 1) using ec_enc_uint() or ec_enc_bits()
+ (entenc.c),
+
+Integers from 0 to (ft - 1) (where ft is not a power of two) using
+ ec_enc_uint() (entenc.c).
+
The range encoder maintains an internal state vector composed of the
fourtuple (low,rng,rem,ext) representing the low end of the current
range, the size of the current range, a single buffered output octet,
and a count of additional carrypropagating output octets. Both rng
and low are 32bit unsigned integer values, rem is an octet value or
the special value 1, and ext is an integer with at least 16 bits.
This state vector is initialized at the start of each each frame to
the value (0,2**31,1,0). The reference implementation reuses the
'val' field of the entropy coder structure to hold low, in order to
allow the same structure to be used for encoding and decoding, but
we maintain the distinction here for clarity.
+The range encoder maintains an internal state vector composed of the fourtuple
+ (val, rng, rem, ext) representing the low end of the current
+ range, the size of the current range, a single buffered output octet, and a
+ count of additional carrypropagating output octets.
+Both val and rng are 32-bit unsigned integer values, rem is an octet value
+ less than 255 or the special value -1, and ext is an unsigned integer with at
+ least 11 bits.
+This state vector is initialized at the start of each frame to the value
+ (0, 2**31, -1, 0).
+After encoding a sequence of symbols, the value of rng in the encoder should
+ exactly match the value of rng in the decoder after decoding the same sequence
+ of symbols.
+This is a powerful tool for detecting errors in either an encoder or decoder
+ implementation.
+The value of val, on the other hand, represents different things in the encoder
+ and decoder, and is not expected to match.
+
+
+
+The decoder has no analog for rem and ext.
+These are used to perform carry propagation in the renormalization loop below.
+Each iteration of this loop produces 9 bits of output, consisting of 8 data
+ bits and a carry flag.
+The encoder cannot determine the final value of the output octets until it
+ propagates these carry flags.
+Therefore the reference implementation buffers a single nonpropagating output
+ octet (i.e., one less than 255) in rem and keeps a count of additional
+ propagating (i.e., 255) output octets in ext.
+An implementation may choose to use any mathematically equivalent scheme to
+ perform carry propagation.
 The main encoding function is ec_encode() (entenc.c),
 which takes as an argument a threetuple (fl,fh,ft)
 describing the range of the symbol to be encoded in the current
 context, with 0 <= fl < fh <= ft <= 65535. The values of this tuple
 are derived from the probability model for the symbol. Let f(i) be
 the frequency of the i'th symbol in the current context. Then the
 threetuple corresponding to the k'th symbol is given by

+The main encoding function is ec_encode() (entenc.c), which encodes symbol k in
+ the current context using the same threetuple (fl[k], fh[k], ft)
+ as the decoder to describe the range of the symbol (see
+ ).
 ec_encode() updates the state of the encoder as follows. If fl is
 greater than zero, then low = low + rng  (rng/ft)*(ftfl) and
 rng = (rng/ft)*(fhfl). Otherwise, low is unchanged and
 rng = rng  (rng/ft)*(fhfl). The divisions here are exact integer
 division. After this update, the range is normalized.
+ec_encode() updates the state of the encoder as follows.
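+If fl[k] is greater than zero, then
+
+  val = val + rng - (rng/ft)*(ft - fl[k]) ,
+  rng = (rng/ft)*(fh[k] - fl[k]) .
+
+Otherwise, val is unchanged and
+
+  rng = rng - (rng/ft)*(ft - fh[k]) .
+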
+The divisions here are exact integer division.
+
+
+
+
+After this update, the range is normalized using a procedure very similar to
+ that of , implemented by
+ ec_enc_normalize() (entenc.c).
+The following process is repeated until rng > 2**23.
+First, the top 9 bits of val, (val>>23), are sent to the carry buffer,
+ described in .
+Then, the encoder sets
+
+  val = (val<<8) & 0x7FFFFFFF ,
+  rng = rng<<8 .
+
+
+
+
 To normalize the range, the following process is repeated until
 rng > 2**23. First, the top 9 bits of low, (low>>23), are placed into
 a carry buffer. Then, low is set to . This process is carried out by
 ec_enc_normalize() (entenc.c).
+The function ec_enc_carry_out() (entenc.c) implements carry propagation and
+ output buffering.
+It takes as input a 9bit value, c, consisting of 8 data bits and an additional
+ carry bit.
+If c is equal to the value 255, then ext is simply incremented, and no other
+ state updates are performed.
+Otherwise, let b = (c>>8) be the carry bit.
+Then,
+
+
+If the buffered octet rem contains a value other than -1, the encoder outputs
+ the octet (rem + b).
+Otherwise, if rem is -1, no octet is output.
 The 9 bits produced in each iteration of the normalization loop
 consist of 8 data bits and a carry flag. The final value of the
 output bits is not determined until carry propagation is accounted
 for. Therefore the reference implementation buffers a single
 (nonpropagating) output octet and keeps a count of additional
 propagating (0xFF) output octets. An implementation may choose to use
 any mathematically equivalent scheme to perform carry propagation.
+If ext is nonzero, then the encoder outputs ext octets, all with a value of 0
+ if b is set, or 255 if b is unset, and sets ext to 0.
+
+
+rem is set to the 8 data bits:
+
+  rem = c & 255 .
+
+
+
+
+
+
+
+
+
 The function ec_enc_carry_out() (entenc.c) performs
 this buffering. It takes a 9bit input value, c, from the normalization:
 8 bits of output and a carry bit. If c is 0xFF, then ext is incremented
 and no octets are output. Otherwise, if rem is not the special value
 1, then the octet (rem+(c>>8)) is output. Then ext octets are output
 with the value 0 if the carry bit is set, or 0xFF if it is not, and
 rem is set to the lower 8 bits of c. After this, ext is set to zero.
+The reference implementation uses three additional encoding methods that are
+ exactly equivalent to the above, but make assumptions and simplifications that
+ allow for a more efficient implementation.
+
+
 In the reference implementation, a special version of ec_encode()
 called ec_encode_bin() (entenc.c) is defined to
 take a twotuple (fl,ftb), where , but avoids using division.
+The first is ec_encode_bin() (entenc.c), defined using the parameter ftb
+ instead of ft.
+It is mathematically equivalent to calling ec_encode() with
+ ft = (1<<ftb), but avoids using division.
+
+
+
+
+The next is ec_enc_bit_logp() (entenc.c), which encodes a single binary symbol.
+The context is described by a single parameter, logp, which is the absolute
+ value of the base2 logarithm of the probability of a "1".
+It is mathematically equivalent to calling ec_encode() with the 3tuple
+ (fl[k] = 0, fh[k] = (1<<logp) - 1,
+ ft = (1<<logp)) if k is 0 and with
+ (fl[k] = (1<<logp) - 1,
+ fh[k] = ft = (1<<logp)) if k is 1.
+The implementation requires no multiplications or divisions.
+
+
+The last is ec_enc_icdf() (entenc.c), which encodes a single binary symbol with
+ a tablebased context of up to 8 bits.
+This uses the same icdf table as ec_dec_icdf() from
+ .
+The function is mathematically equivalent to calling ec_encode() with
+ fl[k] = (1<<ftb) - icdf[k-1] (or 0 if
+ k == 0), fh[k] = (1<<ftb) - icdf[k], and
+ ft = (1<<ftb).
+This only saves a few arithmetic operations over ec_encode_bin(), but allows
+ the encoder to use the same icdf tables as the decoder.
+
+
+
+
+
 The CELT layer also allows directly encoding a series of raw bits, outside
 of the range coder, implemented in ec_enc_bits() (entenc.c).
 The raw bits are packed at the end of the packet, starting by storing the
 least significant bit of the value to be packed in the least significant bit
 of the last byte, filling up to the most significant bit in
 the last byte, and then continuing in the least significant bit of the
 penultimate byte, and so on.
 This packing may continue into the last byte output by the range coder,
 though the format should render it impossible to overwrite any set bit
 produced by the range coder when the procedure in
 is followed to finalize the stream.
+The raw bits used by the CELT layer are packed at the end of the buffer using
+ ec_enc_bits() (entenc.c).
+Because the raw bits may continue into the last byte output by the range coder
+ if there is room in the loworder bits, the encoder must be prepared to merge
+ these values into a single octet.
+The procedure in does this in a way that
+ ensures both the range coded data and the raw bits can be decoded
+ successfully.
 The function ec_enc_uint() is based on ec_encode() and encodes one of N
 equiprobable symbols, each with a frequency of 1, where N may be as large as
 2**321. Because ec_encode() is limited to a total frequency of 2**161, this
 is done by encoding a series of symbols in smaller contexts.
+The function ec_enc_uint() (entenc.c) encodes one of ft equiprobable symbols in
+ the range 0 to (ft - 1), inclusive, each with a frequency of 1,
+ where ft may be as large as (2**32 - 1).
+Like the decoder (see ), it splits up the
+ value into a range coded symbol representing up to 8 of the high bits, and, if
+ necessary, raw bits representing the remainder of the value.
+
+
+ec_enc_uint() takes a twotuple (t, ft), where t is the value to be
+ encoded, 0 <= t < ft, and ft is not necessarily a
+ power of two.
+Let ftb = ilog(ft - 1), i.e., the number of bits required
+ to store (ft - 1) in two's complement notation.
+If ftb is 8 or less, then t is encoded directly using ec_encode() with the
+ threetuple (t, t + 1, ft).
 ec_enc_uint() (entenc.c) takes a twotuple (fl,ft),
 where ft is not necessarily a power of two. Let ftb be the location
 of the highest 1 bit in the two'scomplement representation of
 (ft1), or 1 if no bits are set. If ftb>8, then the top 8 bits of fl
 are encoded using ec_encode() with the threetuple
 (fl>>ftb8,(fl>>ftb8)+1,(ft1>>ftb8)+1), and the remaining bits
 are encoded as raw bits. Otherwise, fl is encoded with ec_encode() directly
 using the threetuple (fl,fl+1,ft).
+If ftb is greater than 8, then the top 8 bits of t are encoded using the
+ three-tuple (t>>(ftb - 8),
+ (t>>(ftb - 8)) + 1,
+ ((ft - 1)>>(ftb - 8)) + 1), and the
+ remaining bits,
+ (t & ((1<<(ftb - 8)) - 1)),
+ are encoded as raw bits with ec_enc_bits().
 After all symbols are encoded, the stream must be finalized by
 outputting a value inside the current range. Let end be the integer
 in the interval [low,low+rng) with the largest number of trailing
 zero bits, b, such that end+(1<<b)1 is also in the interval
 [low,low+rng). Then while end is not zero, the top 9 bits of end, e.g.,
 >23), are sent to the carry buffer, and end is replaced by
 (end<<8&0x7FFFFFFF). Finally, if the value in carry buffer, rem, is]]>
 neither zero nor the special value 1, or the carry count, ext, is
 greater than zero, then 9 zero bits are sent to the carry buffer.
 After the carry buffer is finished outputting octets, the rest of the
 output buffer (if any) is padded with zero bits, until it reaches the raw
 bits. Finally, rem is set to the
 special value 1. This process is implemented by ec_enc_done()
 (entenc.c).
+After all symbols are encoded, the stream must be finalized by outputting a
+ value inside the current range.
+Let end be the integer in the interval [val, val + rng) with the
+ largest number of trailing zero bits, b, such that
+ (end + (1<<b) - 1) is also in the interval
+ [val, val + rng).
+This choice of end allows the maximum number of trailing bits to be set to
+ arbitrary values while still ensuring the range coded part of the buffer can
+ be decoded correctly.
+Then, while end is not zero, the top 9 bits of end, i.e., (end>>23), are
+ passed to the carry buffer in accordance with the procedure in
+ , and end is updated via
+
+  end = (end<<8) & 0x7FFFFFFF .
+
+
+Finally, if the buffered output octet, rem, is neither zero nor the special
+ value -1, or the carry count, ext, is greater than zero, then 9 zero bits are
+ sent to the carry buffer to flush it to the output buffer.
+When outputting the final byte from the range coder, if it would overlap any
+ raw bits already packed into the end of the output buffer, they should be ORed
+ into the same byte.
+The bit allocation routines in the CELT layer should ensure that this can be
+ done without corrupting the range coder data so long as end is chosen as
+ described above.
+If there is any space between the end of the range coder data and the end of
+ the raw bits, it is padded with zero bits.
+This entire process is implemented by ec_enc_done() (entenc.c).
@@ 5989,30 +6172,29 @@ fl=sum(f(i),i
 In many respects the SILK encoder mirrors the SILK decoder described
 in .
 Details such as the quantization and range coder tables can be found
 there, while this section describes the highlevel design choices that
+ In many respects the SILK encoder mirrors the SILK decoder described
+ in .
+ Details such as the quantization and range coder tables can be found
+ there, while this section describes the highlevel design choices that
were made.
The diagram below shows the basic modules of the SILK encoder.

+
 Rate > Mixing > Core >
 input Conversion    Encoder  bitstream
 ++ ++ ++
+ ++ ++ ++
+  Sample   Stereo   SILK 
+> Rate > Mixing > Core >
+Input Conversion    Encoder  Bitstream
+ ++ ++ ++
]]>
Silk Encoder.
The input signal's sampling rate is adjusted by a sample rate conversion
module so that it matches the SILK internal sampling rate.
+module so that it matches the SILK internal sampling rate.
The input to the sample rate converter is delayed by a number of samples
depending on the sample rate ratio, such that the overall delay is constant
for all input and output sample rates.
@@ 6026,17 +6208,17 @@ It converts a stereo left/right signal into an adaptive
mid/side representation.
The first step is to compute nonadaptive mid/side signals
as half the sum and difference between left and right signals.
The side signal is then minimized in energy by subtracting a
+The side signal is then minimized in energy by subtracting a
prediction of it based on the mid signal.
This prediction works well when the left and right signals
exhibit linear dependency, for instance for an amplitudepanned
input signal.
Like in the decoder, the prediction coefficients are linearly
interpolated during the first 8 ms of the frame.
 The mid signal is always encoded, whereas the residual
+ The mid signal is always encoded, whereas the residual
side signal is only encoded if it has sufficient
 energy compared to the mid signal's energy.
 If it has not,
+ energy compared to the mid signal's energy.
+ If it has not,
the "mid_only_flag" is set without encoding the side signal.
@@ 6045,13 +6227,13 @@ the side signal is encoded.
For each frame, two predictor coefficients are computed, one
that predicts between lowpassed mid and side channels, and
one that predicts between highpassed mid and side channels.
The lowpass filter is a simple threetap filter
+The lowpass filter is a simple threetap filter
and creates a delay of one sample.
The highpass filtered signal is the difference between
the mid signal delayed by one sample and the lowpassed
signal. Instead of explicitly computing the highpassed
signal, it is computationally more efficient to transform
the prediction coefficients before applying them to the
+the prediction coefficients before applying them to the
filtered mid signal, as follows
@@ 6077,7 +6259,7 @@ For simplicity, the core encoder is referred to simply as the encoder in
the remainder of this section. An overview of the encoder is given in
.

+
Silk Core Encoder.
The input signal is processed by a Voice Activity Detector (VAD) to produce
a measure of voice activity, spectral tilt, and signaltonoise estimates for
each frame. The VAD uses a sequence of halfband filterbanks to split the
signal into four subbands: 0...Fs/16, Fs/16...Fs/8, Fs/8...Fs/4, and
Fs/4...Fs/2, where Fs is the sampling frequency (8, 12, 16, or 24 kHz).
The lowest subband, from 0  Fs/16, is highpass filtered with a firstorder
moving average (MA) filter (with transfer function H(z) = 1z**(1)) to
reduce the energy at the lowest frequencies. For each frame, the signal
energy per subband is computed.
In each subband, a noise level estimator tracks the background noise level
and a SignaltoNoise Ratio (SNR) value is computed as the logarithm of the
ratio of energy to noise level.
Using these intermediate variables, the following parameters are calculated
+The input signal is processed by a Voice Activity Detector (VAD) to produce
+a measure of voice activity, spectral tilt, and signaltonoise estimates for
+each frame. The VAD uses a sequence of halfband filterbanks to split the
+signal into four subbands: 0...Fs/16, Fs/16...Fs/8, Fs/8...Fs/4, and
+Fs/4...Fs/2, where Fs is the sampling frequency (8, 12, 16, or 24 kHz).
+The lowest subband, from 0 - Fs/16, is high-pass filtered with a first-order
+moving average (MA) filter (with transfer function H(z) = 1 - z**(-1)) to
+reduce the energy at the lowest frequencies. For each frame, the signal
+energy per subband is computed.
+In each subband, a noise level estimator tracks the background noise level
+and a SignaltoNoise Ratio (SNR) value is computed as the logarithm of the
+ratio of energy to noise level.
+Using these intermediate variables, the following parameters are calculated
for use in other SILK modules:
@@ 6165,12 +6346,12 @@ Smoothed subband SNRs. Temporally smoothed subband SNR values.
Speech activity level. Based on the average SNR and a weighted average of the
+Speech activity level. Based on the average SNR and a weighted average of the
subband energies.
Spectral tilt. A weighted average of the subband SNRs, with positive weights
+Spectral tilt. A weighted average of the subband SNRs, with positive weights
for the low subbands and negative weights for the high subbands.
@@ 6179,9 +6360,10 @@ for the low subbands and negative weights for the high subbands.
The input signal is processed by the open loop pitch estimator shown in
+The input signal is processed by the open loop pitch estimator shown in
.

+
Block diagram of the pitch estimator.
The pitch analysis finds a binary voiced/unvoiced classification, and, for
frames classified as voiced, four pitch lags per frame  one for each
5 ms subframe  and a pitch correlation indicating the periodicity of
the signal.
The input is first whitened using a Linear Prediction (LP) whitening filter,
where the coefficients are computed through standard Linear Prediction Coding
(LPC) analysis. The order of the whitening filter is 16 for best results, but
is reduced to 12 for medium complexity and 8 for low complexity modes.
The whitened signal is analyzed to find pitch lags for which the time
correlation is high.
+The pitch analysis finds a binary voiced/unvoiced classification, and, for
+frames classified as voiced, four pitch lags per frame - one for each
+5 ms subframe - and a pitch correlation indicating the periodicity of
+the signal.
+The input is first whitened using a Linear Prediction (LP) whitening filter,
+where the coefficients are computed through standard Linear Prediction Coding
+(LPC) analysis. The order of the whitening filter is 16 for best results, but
+is reduced to 12 for medium complexity and 8 for low complexity modes.
+The whitened signal is analyzed to find pitch lags for which the time
+correlation is high.
The analysis consists of three stages for reducing the complexity:
In the first stage, the whitened signal is downsampled to 4 kHz
(from 8 kHz) and the current frame is correlated to a signal delayed
by a range of lags, starting from a shortest lag corresponding to
+In the first stage, the whitened signal is downsampled to 4 kHz
+(from 8 kHz) and the current frame is correlated to a signal delayed
+by a range of lags, starting from a shortest lag corresponding to
500 Hz, to a longest lag corresponding to 56 Hz.
The second stage operates on an 8 kHz signal (downsampled from 12, 16,
or 24 kHz) and measures time correlations only near the lags
corresponding to those that had sufficiently high correlations in the first
stage. The resulting correlations are adjusted for a small bias towards
short lags to avoid ending up with a multiple of the true pitch lag.
+The second stage operates on an 8 kHz signal (downsampled from 12, 16,
+or 24 kHz) and measures time correlations only near the lags
+corresponding to those that had sufficiently high correlations in the first
+stage. The resulting correlations are adjusted for a small bias towards
+short lags to avoid ending up with a multiple of the true pitch lag.
The highest adjusted correlation is compared to a threshold depending on:
@@ 6250,13 +6431,13 @@ The speech activity level
The spectral tilt.
If the threshold is exceeded, the current frame is classified as voiced and
the lag with the highest adjusted correlation is stored for a final pitch
+If the threshold is exceeded, the current frame is classified as voiced and
+the lag with the highest adjusted correlation is stored for a final pitch
analysis of the highest precision in the third stage.
The last stage operates directly on the whitened input signal to compute time
correlations for each of the four subframes independently in a narrow range
+The last stage operates directly on the whitened input signal to compute time
+correlations for each of the four subframes independently in a narrow range
around the lag with highest correlation from the second stage.
@@ 6265,44 +6446,45 @@ around the lag with highest correlation from the second stage.
The noise shaping analysis finds gains and filter coefficients used in the
prefilter and noise shaping quantizer. These parameters are chosen such that
+The noise shaping analysis finds gains and filter coefficients used in the
+prefilter and noise shaping quantizer. These parameters are chosen such that
they will fulfill several requirements:
Balancing quantization noise and bitrate.
The quantization gains determine the step size between reconstruction levels
of the excitation signal. Therefore, increasing the quantization gain
amplifies quantization noise, but also reduces the bitrate by lowering
+Balancing quantization noise and bitrate.
+The quantization gains determine the step size between reconstruction levels
+of the excitation signal. Therefore, increasing the quantization gain
+amplifies quantization noise, but also reduces the bitrate by lowering
the entropy of the quantization indices.
Spectral shaping of the quantization noise; the noise shaping quantizer is
capable of reducing quantization noise in some parts of the spectrum at the
cost of increased noise in other parts without substantially changing the
bitrate.
By shaping the noise such that it follows the signal spectrum, it becomes
less audible. In practice, best results are obtained by making the shape
+Spectral shaping of the quantization noise; the noise shaping quantizer is
+capable of reducing quantization noise in some parts of the spectrum at the
+cost of increased noise in other parts without substantially changing the
+bitrate.
+By shaping the noise such that it follows the signal spectrum, it becomes
+less audible. In practice, best results are obtained by making the shape
of the noise spectrum slightly flatter than the signal spectrum.
Deemphasizing spectral valleys; by using different coefficients in the
analysis and synthesis part of the prefilter and noise shaping quantizer,
the levels of the spectral valleys can be decreased relative to the levels
of the spectral peaks such as speech formants and harmonics.
This reduces the entropy of the signal, which is the difference between the
+Deemphasizing spectral valleys; by using different coefficients in the
+analysis and synthesis part of the prefilter and noise shaping quantizer,
+the levels of the spectral valleys can be decreased relative to the levels
+of the spectral peaks such as speech formants and harmonics.
+This reduces the entropy of the signal, which is the difference between the
coded signal and the quantization noise, thus lowering the bitrate.
Matching the levels of the decoded speech formants to the levels of the
original speech formants; an adjustment gain and a first order tilt
coefficient are computed to compensate for the effect of the noise
+Matching the levels of the decoded speech formants to the levels of the
+original speech formants; an adjustment gain and a first order tilt
+coefficient are computed to compensate for the effect of the noise
shaping quantization on the level and spectral tilt.

+
Noise shaping and spectral deemphasis illustration.
 shows an example of an
input signal spectrum (1).
After deemphasis and level matching, the spectrum has deeper valleys (2).
The quantization noise spectrum (3) more or less follows the input signal
spectrum, while having slightly less pronounced peaks.
The entropy, which provides a lower bound on the bitrate for encoding the
excitation signal, is proportional to the area between the deemphasized
spectrum (2) and the quantization noise spectrum (3). Without deemphasis,
the entropy is proportional to the area between input spectrum (1) and
+ shows an example of an
+input signal spectrum (1).
+After deemphasis and level matching, the spectrum has deeper valleys (2).
+The quantization noise spectrum (3) more or less follows the input signal
+spectrum, while having slightly less pronounced peaks.
+The entropy, which provides a lower bound on the bitrate for encoding the
+excitation signal, is proportional to the area between the deemphasized
+spectrum (2) and the quantization noise spectrum (3). Without deemphasis,
+the entropy is proportional to the area between input spectrum (1) and
quantization noise (3)  clearly higher.
The transformation from input signal to deemphasized signal can be
+The transformation from input signal to deemphasized signal can be
described as a filtering operation with a filter
@@ 6365,9 +6546,9 @@ Wana(z) = (1  \ (a_ana(k) * z )*(1  z * \ b_ana(k) * z ),
]]>
is the analysis part of the deemphasis filter, consisting of the shortterm
shaping filter with coefficients a_ana(k), and the longterm shaping filter
with coefficients b_ana(k) and pitch lag L.
+is the analysis part of the deemphasis filter, consisting of the shortterm
+shaping filter with coefficients a_ana(k), and the longterm shaping filter
+with coefficients b_ana(k) and pitch lag L.
The parameter d determines the number of longterm shaping filter taps.
@@ 6386,19 +6567,19 @@ Wsyn(z) = (1  \ (a_syn(k) * z )*(1  z * \ b_syn(k) * z ).
All noise shaping parameters are computed and applied per subframe of 5 ms.
First, an LPC analysis is performed on a windowed signal block of 15 ms.
The signal block has a lookahead of 5 ms relative to the current subframe,
and the window is an asymmetric sine window. The LPC analysis is done with the
+All noise shaping parameters are computed and applied per subframe of 5 ms.
+First, an LPC analysis is performed on a windowed signal block of 15 ms.
+The signal block has a lookahead of 5 ms relative to the current subframe,
+and the window is an asymmetric sine window. The LPC analysis is done with the
autocorrelation method, with an order of between 8, in lowestcomplexity mode,
and 16, for best quality.
+and 16, for best quality.
Optionally the LPC analysis and noise shaping filters are warped by replacing
the delay elements by firstorder allpass filters.
This increases the frequency resolution at low frequencies and reduces it at
+This increases the frequency resolution at low frequencies and reduces it at
high ones, which better matches the human auditory system and improves
quality.
+quality.
The warped analysis and filtering comes at a cost in complexity
and is therefore only done in higher complexity modes.
@@ 6408,10 +6589,10 @@ from the LPC analysis and multiplying it by a value inversely proportional
to the coding quality control parameter and the pitch correlation.
Next the two sets of shortterm noise shaping coefficients a_ana(k) and
a_syn(k) are obtained by applying different amounts of bandwidth expansion to the
coefficients found in the LPC analysis.
This bandwidth expansion moves the roots of the LPC polynomial towards the
+Next the two sets of shortterm noise shaping coefficients a_ana(k) and
+a_syn(k) are obtained by applying different amounts of bandwidth expansion to the
+coefficients found in the LPC analysis.
+This bandwidth expansion moves the roots of the LPC polynomial towards the
origin, using the formulas
@@ 6424,7 +6605,7 @@ origin, using the formulas
]]>
where a(k) is the k'th LPC coefficient, and the bandwidth expansion factors
+where a(k) is the k'th LPC coefficient, and the bandwidth expansion factors
g_ana and g_syn are calculated as
@@ 6435,13 +6616,13 @@ g_syn = 0.95 + 0.01*C,
]]>
where C is the coding quality control parameter between 0 and 1.
Applying more bandwidth expansion to the analysis part than to the synthesis
+where C is the coding quality control parameter between 0 and 1.
+Applying more bandwidth expansion to the analysis part than to the synthesis
part gives the desired deemphasis of spectral valleys in between formants.
The longterm shaping is applied only during voiced frames.
+The longterm shaping is applied only during voiced frames.
It uses three filter taps, described by
@@ 6452,11 +6633,11 @@ b_syn = F_syn * [0.25, 0.5, 0.25].
]]>
For unvoiced frames these coefficients are set to 0. The multiplication factors
F_ana and F_syn are chosen between 0 and 1, depending on the coding quality
control parameter, as well as the calculated pitch correlation and smoothed
subband SNR of the lowest subband. By having F_ana less than F_syn,
the pitch harmonics are emphasized relative to the valleys in between the
+For unvoiced frames these coefficients are set to 0. The multiplication factors
+F_ana and F_syn are chosen between 0 and 1, depending on the coding quality
+control parameter, as well as the calculated pitch correlation and smoothed
+subband SNR of the lowest subband. By having F_ana less than F_syn,
+the pitch harmonics are emphasized relative to the valleys in between the
harmonics.
@@ 6465,7 +6646,7 @@ The tilt coefficient c_tilt is for unvoiced frames chosen as
@@ 6480,15 +6661,15 @@ c_tilt = 0.25 + 0.2625 * V
for voiced frames, where V is the voice activity level between 0 and 1.
The adjustment gain G serves to correct any level mismatch between the original
and decoded signals that might arise from the noise shaping and deemphasis.
This gain is computed as the ratio of the prediction gain of the shortterm
analysis and synthesis filter coefficients. The prediction gain of an LPC
synthesis filter is the square root of the output energy when the filter is
excited by a unitenergy impulse on the input.
An efficient way to compute the prediction gain is by first computing the
reflection coefficients from the LPC coefficients through the stepdown
algorithm, and extracting the prediction gain from the reflection coefficients
+The adjustment gain G serves to correct any level mismatch between the original
+and decoded signals that might arise from the noise shaping and deemphasis.
+This gain is computed as the ratio of the prediction gain of the shortterm
+analysis and synthesis filter coefficients. The prediction gain of an LPC
+synthesis filter is the square root of the output energy when the filter is
+excited by a unitenergy impulse on the input.
+An efficient way to compute the prediction gain is by first computing the
+reflection coefficients from the LPC coefficients through the stepdown
+algorithm, and extracting the prediction gain from the reflection coefficients
as
@@ 6504,22 +6685,22 @@ where r_k is the k'th reflection coefficient.
Initial values for the quantization gains are computed as the squareroot of
the residual energy of the LPC analysis, adjusted by the coding quality control
parameter.
These quantization gains are later adjusted based on the results of the
+Initial values for the quantization gains are computed as the squareroot of
+the residual energy of the LPC analysis, adjusted by the coding quality control
+parameter.
+These quantization gains are later adjusted based on the results of the
prediction analysis.
The prediction analysis is performed in one of two ways depending on how
the pitch estimator classified the frame.
The processing for voiced and unvoiced speech is described in
 and
 , respectively.
 Inputs to this function include the prewhitened signal from the
+The prediction analysis is performed in one of two ways depending on how
+the pitch estimator classified the frame.
+The processing for voiced and unvoiced speech is described in
+ and
+ , respectively.
+ Inputs to this function include the prewhitened signal from the
pitch estimator (see ).
@@ 6538,58 +6719,58 @@ The processing for voiced and unvoiced speech is described in
This LTP residual signal is the input to an LPC analysis where the LPCs are
estimated using Burg's method, such that the residual energy is minimized.
The estimated LPCs are converted to a Line Spectral Frequency (LSF) vector
 and quantized as described in .
After quantization, the quantized LSF vector is converted back to LPC
coefficients using the full procedure in .
By using quantized LTP coefficients and LPC coefficients derived from the
quantized LSF coefficients, the encoder remains fully synchronized with the
decoder.
The quantized LPC and LTP coefficients are also used to filter the input
+ and quantized as described in .
+After quantization, the quantized LSF vector is converted back to LPC
+coefficients using the full procedure in .
+By using quantized LTP coefficients and LPC coefficients derived from the
+quantized LSF coefficients, the encoder remains fully synchronized with the
+decoder.
+The quantized LPC and LTP coefficients are also used to filter the input
signal and measure residual energy for each of the four subframes.
For a speech signal that has been classified as unvoiced, there is no need
for LTP filtering, as it has already been determined that the prewhitened
input signal is not periodic enough within the allowed pitch period range
for LTP analysis to be worth the cost in terms of complexity and bitrate.
The prewhitened input signal is therefore discarded, and instead the input
signal is used for LPC analysis using Burg's method.
The resulting LPC coefficients are converted to an LSF vector and quantized
as described in the following section.
They are then transformed back to obtain quantized LPC coefficients, which
are then used to filter the input signal and measure residual energy for
+For a speech signal that has been classified as unvoiced, there is no need
+for LTP filtering, as it has already been determined that the prewhitened
+input signal is not periodic enough within the allowed pitch period range
+for LTP analysis to be worth the cost in terms of complexity and bitrate.
+The prewhitened input signal is therefore discarded, and instead the input
+signal is used for LPC analysis using Burg's method.
+The resulting LPC coefficients are converted to an LSF vector and quantized
+as described in the following section.
+They are then transformed back to obtain quantized LPC coefficients, which
+are then used to filter the input signal and measure residual energy for
each of the four subframes.
The main purpose of LPC coding in SILK is to reduce the bitrate by
minimizing the residual energy.
At least at high bitrates, perceptual aspects are handled
+At least at high bitrates, perceptual aspects are handled
independently by the noise shaping filter.
Burg's method is used because it provides higher prediction gain
than the autocorrelation method and, unlike the covariance method,
produces stable filters (assuming numerical errors don't spoil
that). SILK's implementation of Burg's method is also computationally
+that). SILK's implementation of Burg's method is also computationally
faster than the autocovariance method.
The implementation of Burg's method differs from traditional
+The implementation of Burg's method differs from traditional
implementations in two aspects.
The first difference is that it
operates on autocorrelations, similar to the Schur algorithm, but
+The first difference is that it
+operates on autocorrelations, similar to the Schur algorithm, but
with a simple update to the autocorrelations after finding each
reflection coefficient to make the result identical to Burg's method.
This brings down the complexity of Burg's method to near that of
+This brings down the complexity of Burg's method to near that of
the autocorrelation method.
The second difference is that the signal in each subframe is scaled
by the inverse of the residual quantization step size. Subframes with
a small quantization step size will on average spend more bits for a
given amount of residual energy than subframes with a large step size.
Without scaling, Burg's method minimizes the total residual energy in
all subframes, which doesn't necessarily minimize the total number of
bits needed for coding the quantized residual. The residual energy
+by the inverse of the residual quantization step size. Subframes with
+a small quantization step size will on average spend more bits for a
+given amount of residual energy than subframes with a large step size.
+Without scaling, Burg's method minimizes the total residual energy in
+all subframes, which doesn't necessarily minimize the total number of
+bits needed for coding the quantized residual. The residual energy
of the scaled subframes is a better measure for that number of
bits.
+bits.
@@ 6597,14 +6778,14 @@ bits.
Unlike many other speech codecs, SILK uses variable bitrate coding
+Unlike many other speech codecs, SILK uses variable bitrate coding
for the LSFs.
This improves the average ratedistortion tradeoff and reduces outliers.
The variable bitrate coding minimizes a linear combination of the weighted
quantization errors and the bitrate.
The weights for the quantization errors are the Inverse
Harmonic Mean Weighting (IHMW) function proposed by Laroia et al.
(see ).
+(see ).
These weights are referred to here as Laroia weights.
@@ 6612,7 +6793,7 @@ The LSF quantizer consists of two stages.
The first stage is an (unweighted) vector quantizer (VQ), with a
codebook size of 32 vectors.
The quantization errors for the codebook vector are sorted, and
for the N best vectors a second stage quantizer is run.
+for the N best vectors a second stage quantizer is run.
By varying the number N a tradeoff is made between R/D performance
and computational efficiency.
For each of the N codebook vectors the Laroia weights corresponding
@@ 6622,7 +6803,7 @@ vector is scaled by the square roots of these Laroia weights.
This scaling partially normalizes error sensitivity for the
residual vector, so that a uniform quantizer with fixed
step sizes can be used in the second stage without too much
performance loss.
+performance loss.
And by scaling with Laroia weights determined from the firststage
codebook vector, the process can be reversed in the decoder.
@@ 6651,38 +6832,37 @@ better in the reverse direction.
The quantization index of the first stage is entropy coded.
The quantization sequence from the second stage is also entropy
coded, where for each element the probability table is chosen
depending on the vector index from the first and the location
+depending on the vector index from the first stage and the location
of that element in the LSF vector.

+
If the input is stable, finding the best candidate usually results in a
quantized vector that is also stable. Because of the twostage approach,
however, it is possible that the best quantization candidate is unstable.
Therefore we apply an LSF stabilization method which ensures that the LSF
parameters are within their valid range, increasingly sorted, and have minimum
distances between each other and the border values that have been
predetermined as the 0.01 percentile distance values from a large
training set.
+If the input is stable, finding the best candidate usually results in a
+quantized vector that is also stable. Because of the twostage approach,
+however, it is possible that the best quantization candidate is unstable.
+The encoder applies the same stabilization procedure applied by the decoder
+ (see ) to ensure the LSF parameters
+ are within their valid range, increasingly sorted, and have minimum
+ distances between each other and the border values.
For voiced frames, the prediction analysis described in
 resulted in four sets
(one set per subframe) of five LTP coefficients, plus four weighting matrices.
The LTP coefficients for each subframe are quantized using entropy constrained
vector quantization.
A total of three vector codebooks are available for quantization, with
different ratedistortion tradeoffs. The three codebooks have 10, 20, and
40 vectors and average rates of about 3, 4, and 5 bits per vector, respectively.
Consequently, the first codebook has larger average quantization distortion at
a lower rate, whereas the last codebook has smaller average quantization
distortion at a higher rate.
Given the weighting matrix W_ltp and LTP vector b, the weighted ratedistortion
+For voiced frames, the prediction analysis described in
+ resulted in four sets
+(one set per subframe) of five LTP coefficients, plus four weighting matrices.
+The LTP coefficients for each subframe are quantized using entropy constrained
+vector quantization.
+A total of three vector codebooks are available for quantization, with
+different ratedistortion tradeoffs. The three codebooks have 10, 20, and
+40 vectors and average rates of about 3, 4, and 5 bits per vector, respectively.
+Consequently, the first codebook has larger average quantization distortion at
+a lower rate, whereas the last codebook has smaller average quantization
+distortion at a higher rate.
+Given the weighting matrix W_ltp and LTP vector b, the weighted ratedistortion
measure for a codebook vector cb_i with rate r_i is given by
@@ 6691,35 +6871,35 @@ measure for a codebook vector cb_i with rate r_i is give by
]]>
where u is a fixed, heuristicallydetermined parameter balancing the distortion
and rate.
Which codebook gives the best performance for a given LTP vector depends on the
weighting matrix for that LTP vector.
For example, for a low valued W_ltp, it is advantageous to use the codebook
with 10 vectors as it has a lower average rate.
For a large W_ltp, on the other hand, it is often better to use the codebook
+where u is a fixed, heuristicallydetermined parameter balancing the distortion
+and rate.
+Which codebook gives the best performance for a given LTP vector depends on the
+weighting matrix for that LTP vector.
+For example, for a low valued W_ltp, it is advantageous to use the codebook
+with 10 vectors as it has a lower average rate.
+For a large W_ltp, on the other hand, it is often better to use the codebook
with 40 vectors, as it is more likely to contain the best codebook vector.
The weighting matrix W_ltp depends mostly on two aspects of the input signal.
The first is the periodicity of the signal; the more periodic, the larger W_ltp.
The second is the change in signal energy in the current subframe, relative to
the signal one pitch lag earlier.
A decaying energy leads to a larger W_ltp than an increasing energy.
Both aspects fluctuate relatively slowly, which causes the W_ltp matrices for
different subframes of one frame often to be similar.
Because of this, one of the three codebooks typically gives good performance
for all subframes, and therefore the codebook search for the subframe LTP
vectors is constrained to only allow codebook vectors to be chosen from the
+The weighting matrix W_ltp depends mostly on two aspects of the input signal.
+The first is the periodicity of the signal; the more periodic, the larger W_ltp.
+The second is the change in signal energy in the current subframe, relative to
+the signal one pitch lag earlier.
+A decaying energy leads to a larger W_ltp than an increasing energy.
+Both aspects fluctuate relatively slowly, which causes the W_ltp matrices for
+different subframes of one frame often to be similar.
+Because of this, one of the three codebooks typically gives good performance
+for all subframes, and therefore the codebook search for the subframe LTP
+vectors is constrained to only allow codebook vectors to be chosen from the
same codebook, resulting in a rate reduction.
To find the best codebook, each of the three vector codebooks is
used to quantize all subframe LTP vectors and produce a combined
weighted ratedistortion measure for each vector codebook.
The vector codebook with the lowest combined ratedistortion
over all subframes is chosen. The quantized LTP vectors are used
in the noise shaping quantizer, and the index of the codebook
plus the four indices for the four subframe codebook vectors
+To find the best codebook, each of the three vector codebooks is
+used to quantize all subframe LTP vectors and produce a combined
+weighted ratedistortion measure for each vector codebook.
+The vector codebook with the lowest combined ratedistortion
+over all subframes is chosen. The quantized LTP vectors are used
+in the noise shaping quantizer, and the index of the codebook
+plus the four indices for the four subframe codebook vectors
are passed on to the range encoder.
@@ 6733,33 +6913,33 @@ By applying only the noise shaping analysis filter to the input signal,
it provides the input to the noise shaping quantizer.

+
The noise shaping quantizer independently shapes the signal and coding noise
+The noise shaping quantizer independently shapes the signal and coding noise
spectra to obtain a perceptually higher quality at the same bitrate.
The prefilter output signal is multiplied with a compensation gain G computed
in the noise shaping analysis. Then the output of a synthesis shaping filter
is added, and the output of a prediction filter is subtracted to create a
residual signal.
The residual signal is multiplied by the inverse quantized quantization gain
from the noise shaping analysis, and input to a scalar quantizer.
The quantization indices of the scalar quantizer represent a signal of pulses
that is input to the pyramid range encoder.
The scalar quantizer also outputs a quantization signal, which is multiplied
by the quantized quantization gain from the noise shaping analysis to create
an excitation signal.
The output of the prediction filter is added to the excitation signal to form
the quantized output signal y(n).
The quantized output signal y(n) is input to the synthesis shaping and
+The prefilter output signal is multiplied with a compensation gain G computed
+in the noise shaping analysis. Then the output of a synthesis shaping filter
+is added, and the output of a prediction filter is subtracted to create a
+residual signal.
+The residual signal is multiplied by the inverse quantized quantization gain
+from the noise shaping analysis, and input to a scalar quantizer.
+The quantization indices of the scalar quantizer represent a signal of pulses
+that is input to the pyramid range encoder.
+The scalar quantizer also outputs a quantization signal, which is multiplied
+by the quantized quantization gain from the noise shaping analysis to create
+an excitation signal.
+The output of the prediction filter is added to the excitation signal to form
+the quantized output signal y(n).
+The quantized output signal y(n) is input to the synthesis shaping and
prediction filters.
Optionally the noise shaping quantizer operates in a delayed decision
mode.
In this mode it uses a Viterbi algorithm to keep track of
+mode.
+In this mode it uses a Viterbi algorithm to keep track of
multiple rounding choices in the quantizer and select the best
one after a delay of 32 samples. This improves the rate/distortion
performance of the quantizer.
@@ 6774,14 +6954,12 @@ performance of the quantizer.
no more than the allowed number of bits. The Opus wrapper code
then pads the bitstream if any unused bits are left in SILK mode, or
encodes the high band with the remaining number of bits in Hybrid mode.
 If SILK is unable to encode the packet with less than the allowed number
 of bits, the Opus encoder temporarily codes the signal in CELT mode instead.
The number of payload bits is adjusted by changing
the quantization gains and the rate/distortion tradeoff in the noise
 shaping quantizer, in an iterateve loop
+ shaping quantizer, in an iterative loop
around the noise shaping quantizer and entropy coding.
 Compared to the SILK VBR mode, the CBR mode has lower
 audio quality at a given average bitrate, and also has higher
+ Compared to the SILK VBR mode, the CBR mode has lower
+ audio quality at a given average bitrate, and also has higher
computational complexity.
@@ 6793,23 +6971,23 @@ performance of the quantizer.
Most of the aspects of the CELT encoder can be directly derived from the description
+Most of the aspects of the CELT encoder can be directly derived from the description
of the decoder. For example, the filters and rotations in the encoder are simply the
inverse of the operation performed by the decoder. Similarly, the quantizers generally
optimize for the mean square error (because noise shaping is part of the bitstream itself),
so no special search is required. For this reason, only the less straightforward aspects of the
+so no special search is required. For this reason, only the less straightforward aspects of the
encoder are described here.
The pitch prefilter is applied after the preemphasis. It is applied
+The pitch prefilter is applied after the preemphasis. It is applied
in such a way as to be the inverse of the decoder's postfilter. The main nonobvious aspect of the
prefilter is the selection of the pitch period. The pitch search should be optimised for the
+prefilter is the selection of the pitch period. The pitch search should be optimized for the
following criteria:
continuity: it is important that the pitch period
does not change abruptly between frames; and
avoidance of pitch multiples: when the period used is a multiple of the real period
+avoidance of pitch multiples: when the period used is a multiple of the real period
(lower frequency fundamental), the postfilter loses most of its ability to reduce noise
@@ 6831,41 +7009,41 @@ and normalise_bands() (bands.c), respectively.
Energy quantization (both coarse and fine) can be easily understood from the decoding process.
For all useful bitrates, the coarse quantizer always chooses the quantized log energy value that
+For all useful bitrates, the coarse quantizer always chooses the quantized log energy value that
minimizes the error for each band. Only at very low rate does the encoder allow larger errors to
minimize the rate and avoid using more bits than are available. When the
available CPU resources allow it, it is best to try encoding the coarse energy both with and without
interframe prediction such that the best prediction mode can be selected. The optimal mode depends on
the coding rate, the available bitrate, and the current rate of packet loss.
+the coding rate, the available bitrate, and the current rate of packet loss.
The fine energy quantizer always chooses the quantized log energy value that
+The fine energy quantizer always chooses the quantized log energy value that
minimizes the error for each band because the rate of the fine quantization depends only
on the bit allocation and not on the values that are coded.
+on the bit allocation and not on the values that are coded.

+The encoder must use exactly the same bit allocation process as used by the decoder
and described in . The three mechanisms that can be used by the
encoder to adjust the bitrate on a framebyframe basis are band boost, allocation trim,
+encoder to adjust the bitrate on a framebyframe basis are band boost, allocation trim,
and band skipping.

+The reference encoder makes a decision to boost a band when the energy of that band is significantly
higher than that of the neighboring bands. Let E_j be the logenergy of band j, we define
D_j = 2*E_j - E_j-1 - E_j+1
The allocation of band j is boosted once if D_j > t1 and twice if D_j > t2. For LM>=1, t1=2 and t2=4,
+The allocation of band j is boosted once if D_j > t1 and twice if D_j > t2. For LM>=1, t1=2 and t2=4,
while for LM<1, t1=3 and t2=5.

+The allocation trim is a value between 0 and 10 (inclusively) that controls the allocation
balance between the low and high frequencies. The encoder starts with a safe "default" of 5
and deviates from that default in two different ways. First the trim can deviate by +/- 2
@@ 6877,7 +7055,7 @@ be decreased by up to 4 when the interchannel correlation at low frequency (fir
is high.

+The encoder uses band skipping to ensure that the shape of the bands is only coded
if there is at least 1/2 bit per sample available for the PVQ. If not, then no bit is allocated
and folding is used instead. To ensure continuity in the allocation, some amount of hysteresis is
@@ 6888,7 +7066,7 @@ previous frames needs at least 9/16 bit/sample to be coded.

+Because CELT applies midside stereo coupling in the normalized domain, it does not suffer from
important stereo image problems even when the two channels are completely uncorrelated. For this reason
it is always safe to use stereo coupling on any audio frame. That being said, there are some frames
@@ 6913,7 +7091,8 @@ taking into account the frame size by subtracting 80 bits per frame for coarse e
band using intensity coding is as follows:

+bitrate (kb/s)start band<358
@@ 6923,7 +7102,6 @@ band using intensity coding is as follows:
841021910213020>130disabled
Thresholds for intensity stereo
@@ 6946,7 +7124,7 @@ See tf_analysis() in celt/celt.c.
The choice of the spreading value in has an
impact on the nature of the coding noise introduced by CELT. The larger the f_r value, the
lower the impact of the rotation, and the more tonal the coding noise. The
more tonal the signal, the more tonal the noise should be, so the CELT encoder determines
+more tonal the signal, the more tonal the noise should be, so the CELT encoder determines
the optimal value for f_r by estimating how tonal the signal is. The tonality estimate
is based on a discrete pdf (4-bin histogram) of each band. Bands that have a large number of small
values are considered more tonal and a decision is made by combining all bands with more than
@@ 6964,7 +7142,7 @@ all integer codevectors y of N dimensions that satisfy sum(abs(y(j))) = K.
In bands where there are sufficient bits allocated the PVQ is used to encode
+In bands where there are sufficient bits allocated, PVQ is used to encode
the unit vector that results from the normalization in
directly. Given a PVQ codevector y,
the unit vector X is obtained as X = y/||y||, where ||.|| denotes the
@@ 7017,11 +7195,11 @@ codebook and the implementers MAY use any other search methods. See alg_quant()

+
It is the intention to allow the greatest possible choice of freedom in
implementing the specification. For this reason, outside of a few exceptions
+It is our intention to allow the greatest possible choice of freedom in
+implementing the specification. For this reason, outside of the exceptions
noted in this section, conformance is defined through the reference
implementation of the decoder provided in .
Although this document includes an English description of the codec, should
@@ 7030,55 +7208,64 @@ the latter shall take precedence.
Compliance with this specification means that a decoder's output MUST be
+Compliance with this specification means that in addition to following the normative keywords in this document,
+ a decoder's output MUST also be
within the thresholds specified by the opus_compare.c tool (included
 with the code) when compared to the reference implementation for each of the
 test vectors provided (see ). Either the floatingpoint
 implementation or the fixedpoint implementation can be used as a reference and being
 within the threshold for one of the two is sufficient. In addition, a compliant
+ with the code) when compared to the reference implementation for each of the
+ test vectors provided (see ) and for each output
+ sampling rate and channel count supported. In addition, a compliant
decoder implementation MUST have the same final range decoder state as that of the
 reference decoder.
+ reference decoder. It is therefore RECOMMENDED that the
+ decoder implement the same functional behavior as the reference.
+
+ A decoder implementation is not required to support all output sampling
+ rates or all output channel counts.
Using the reference code provided in ,
a mono test vector can be decoded with
+a test vector can be decoded with
opus_demo d 48000 1 test_mono.bit test_mono.out
+opus_demo -d <rate> <channels> testvectorX.bit testX.out
+where <rate> is the sampling rate and can be 8000, 12000, 16000, 24000, or 48000, and
+<channels> is 1 for mono or 2 for stereo.
+
+
If the range decoder state is incorrect for one of the frames, the decoder will exit with
"Error: Range coder state mismatch between encoder and decoder". If the decoder succeeds, then
the output can be compared with the "reference" output with
opus_compare test_mono.float test_mono.out
+opus_compare -s -r <rate> testvectorX.dec testX.out
or
+for stereo or
opus_compare test_mono.fixed test_mono.out


For a stereo test vector, the command line for decoding is

opus_demo d 48000 2 test_stereo.bin test_stereo.out
+opus_compare -r <rate> testvectorX.dec testX.out
+for mono.
+
and the output can be compared with the reference output with

opus_compare s test_stereo.float test_stereo.out

or

opus_compare s test_stereo.fixed test_stereo.out

+In addition to indicating whether the test vector comparison passes, the opus_compare tool
+outputs an "Opus quality metric" that indicates how well the tested decoder matches the
+reference implementation. A quality of 0 corresponds to the passing threshold, while
+a quality of 100 means that the output of the tested decoder is identical to the reference
+implementation. The passing threshold was calibrated in such a way that it corresponds to
+additive white noise with a 48 dB SNR (similar to what can be obtained on a cassette deck).
+It is still possible for an implementation to sound very good with such a low quality measure
+(e.g. if the deviation is due to inaudible phase distortion), but unless this is verified by
+listening tests, it is RECOMMENDED that implementations achieve a quality above 90 for 48 kHz
+decoding. For other sampling rates, it is normal for the quality metric to be lower
+(typically as low as 50 even for a good implementation) because of harmless mismatch with
+the delay and phase of the internal sampling rate conversion.
On POSIX environments, the run_vectors.sh script can be used to verify all test
vectors. This can be done with
run_vectors.sh <exec path> <vector path>
+run_vectors.sh <exec path> <vector path> <rate>
where <exec path> is the directory where the opus_demo and opus_compare executables
are built and <vector path> is the directory containing the test vectors.
@@ 7120,8 +7307,8 @@ The reference implementation contains no known buffer overflow or cases where
in CPU load.
However, on certain CPU architectures where denormalized floatingpoint
operations are much slower than normal floatingpoint operations, it is
 possible for some audio content (e.g., silence or nearsilence) to cause a certain
 an increase in CPU load.
+ possible for some audio content (e.g., silence or nearsilence) to cause an
+ increase in CPU load.
Denormals can be introduced by reordering operations in the compiler and depend
on the target architecture, so it is difficult to guarantee that an implementation
avoids them.
@@ 7195,7 +7382,7 @@ name of work, or endorsement information.

+
@@ 7213,7 +7400,7 @@ name of work, or endorsement information.
This document provides specific requirements for an Internet audio
 codec. These requirements address quality, sample rate, bitrate,
+ codec. These requirements address quality, sample rate, bitrate,
and packetloss robustness, as well as other desirable properties.
@@ 7354,7 +7541,7 @@ Robust and Efficient Quantization of Speech LSP Parameters Using Structured Vect
This appendix contains the complete source code for the
reference implementation of the Opus codec written in C. By default,
+reference implementation of the Opus codec written in C. By default,
this implementation relies on floatingpoint arithmetic, but it can be
compiled to use only fixedpoint arithmetic by defining the FIXED_POINT
macro. Information on building and using the reference implementation is
@@ 7369,15 +7556,15 @@ but it is easy to substitute any other FFT library.
While the reference implementation does not rely on any
+While the reference implementation does not rely on any
undefined behavior as defined by C89 or C99,
it relies on common implementationdefined behavior
for two's complement architectures:
Right shifts of negative values are consistent with two's complement arithmetic, so that a>>b is equivalent to floor(a/(2^b))
For conversion to a signed integer of N bits, the value is reduced modulo 2^N to be within range of the type
The result of integer division of a negative values is truncated towards zero
The compiler provides a 64bit integer type (a C99 requirement which is supported by most C89 compilers)
+Right shifts of negative values are consistent with two's complement arithmetic, so that a>>b is equivalent to floor(a/(2**b)),
+For conversion to a signed integer of N bits, the value is reduced modulo 2**N to be within range of the type,
+The result of integer division of a negative value is truncated towards zero, and
+The compiler provides a 64bit integer type (a C99 requirement which is supported by most C89 compilers).
@@ 7385,9 +7572,9 @@ for two's complement architectures:
In its current form, the reference implementation also requires the following
architectural characteristics to obtain acceptable performance:
two's complement arithmetic
at least a 16 bit by 16 bit integer multiplier (32bit result)
at least a 32bit adder/accumulator
+Two's complement arithmetic,
+At least a 16 bit by 16 bit integer multiplier (32bit result), and
+At least a 32bit adder/accumulator.
@@ 7428,60 +7615,31 @@ Development snapshots are provided at

+

+
Because of size constraints, the Opus test vectors are not distributed in this
draft. They are available from the Opus codec website at
+draft. They are available from the Opus codec website at
and will also be made available
in IETF meeting proceedings. These test vectors were created specifically to exercise
all aspects of the decoder and therefore the audio quality of the decoded output is
significantly lower than what Opus can achieve in normal operation.
+significantly lower than what Opus can achieve in normal operation.
The SHA1 hash of the files in the test vector package are



+
+

+
To use the internal framing described in , the decoder
diff git a/silk/dec_API.c b/silk/dec_API.c
index a0b841c..8c9ed24 100644
 a/silk/dec_API.c
+++ b/silk/dec_API.c
@@ 92,6 +92,7 @@ opus_int silk_Decode( /* O Returns error co
silk_decoder *psDec = ( silk_decoder * )decState;
silk_decoder_state *channel_state = psDec>channel_state;
opus_int has_side;
+ opus_int stereo_to_mono;
/**********************************/
/* Test if first frame in payload */
@@ 107,6 +108,9 @@ opus_int silk_Decode( /* O Returns error co
ret += silk_init_decoder( &channel_state[ 1 ] );
}
+ stereo_to_mono = decControl>nChannelsInternal == 1 && psDec>nChannelsInternal == 2 &&
+ ( decControl>internalSampleRate == 1000*channel_state[ 0 ].fs_kHz );
+
if( channel_state[ 0 ].nFramesDecoded == 0 ) {
for( n = 0; n < decControl>nChannelsInternal; n++ ) {
opus_int fs_kHz_dec;
@@ 293,7 +297,7 @@ opus_int silk_Decode( /* O Returns error co
ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec );
/* Interleave if stereo output and stereo stream */
 if( decControl>nChannelsAPI == 2 && decControl>nChannelsInternal == 2 ) {
+ if( decControl>nChannelsAPI == 2 ) {
for( i = 0; i < *nSamplesOut; i++ ) {
samplesOut[ n + 2 * i ] = resample_out_ptr[ i ];
}
@@ 302,8 +306,18 @@ opus_int silk_Decode( /* O Returns error co
/* Create two channel output from mono stream */
if( decControl>nChannelsAPI == 2 && decControl>nChannelsInternal == 1 ) {
 for( i = 0; i < *nSamplesOut; i++ ) {
 samplesOut[ 0 + 2 * i ] = samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ];
+ if ( stereo_to_mono ){
+ /* Resample right channel for newly collapsed stereo just in case
+ we weren't doing collapsing when switching to mono */
+ ret += silk_resampler( &channel_state[ 1 ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ 0 ][ 1 ], nSamplesOutDec );
+
+ for( i = 0; i < *nSamplesOut; i++ ) {
+ samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ];
+ }
+ } else {
+ for( i = 0; i < *nSamplesOut; i++ ) {
+ samplesOut[ 1 + 2 * i ] = samplesOut[ 0 + 2 * i ];
+ }
}
}
diff git a/silk/decoder_set_fs.c b/silk/decoder_set_fs.c
index e0a343f..c0bf352 100644
 a/silk/decoder_set_fs.c
+++ b/silk/decoder_set_fs.c
@@ 49,25 +49,9 @@ opus_int silk_decoder_set_fs(
/* Initialize resampler when switching internal or external sampling frequency */
if( psDec>fs_kHz != fs_kHz  psDec>fs_API_hz != fs_API_Hz ) {
 /* Allocate worst case space for temporary upsampling, 8 to 48 kHz, so a factor 6 */
 opus_int16 temp_buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ ];
 silk_resampler_state_struct temp_resampler_state;

 if( psDec>fs_kHz != fs_kHz && psDec>fs_kHz > 0 ) {
 /* Initialize resampler for temporary resampling of outBuf data to the new internal sampling rate */
 ret += silk_resampler_init( &temp_resampler_state, silk_SMULBB( psDec>fs_kHz, 1000 ), silk_SMULBB( fs_kHz, 1000 ), 0 );

 /* Temporary resampling of outBuf data to the new internal sampling rate */
 silk_memcpy( temp_buf, psDec>outBuf, psDec>frame_length * sizeof( opus_int16 ) );
 ret += silk_resampler( &temp_resampler_state, psDec>outBuf, temp_buf, psDec>frame_length );
 }

/* Initialize the resampler for dec_API.c preparing resampling from fs_kHz to API_fs_Hz */
ret += silk_resampler_init( &psDec>resampler_state, silk_SMULBB( fs_kHz, 1000 ), fs_API_Hz, 0 );
 /* Correct resampler state by resampling buffered data from fs_kHz to API_fs_Hz */
 ret += silk_resampler( &psDec>resampler_state, temp_buf, psDec>outBuf, frame_length );

psDec>fs_API_hz = fs_API_Hz;
}
diff git a/src/opus_compare.c b/src/opus_compare.c
index a74acb0..b8a1620 100644
 a/src/opus_compare.c
+++ b/src/opus_compare.c
@@ 133,7 +133,7 @@ static const int BANDS[NBANDS+1]={
};
#define TEST_WIN_SIZE (480)
#define TEST_WIN_STEP (TEST_WIN_SIZE>>1)
+#define TEST_WIN_STEP (120)
int main(int _argc,const char **_argv){
FILE *fin1;
@@ 143,7 +143,7 @@ int main(int _argc,const char **_argv){
float *xb;
float *X;
float *Y;
 float err;
+ double err;
float Q;
size_t xlength;
size_t ylength;
@@ 246,14 +246,15 @@ int main(int _argc,const char **_argv){
}
}
if(xi>0){
 /*Temporal masking: 5 dB/5ms slope.*/
+ /*Temporal masking: 3 dB/2.5ms slope.*/
for(bi=0;bi=79&&xj<=81)im*=0.1F;
 if(xj==80)im*=0.1F;
 Ef+=im*im;
+ for(bi=0;bi=79&&xj<=81)im*=0.1F;
+ if(xj==80)im*=0.1F;
+ Eb+=im;
+ }
}
+ Eb /= (BANDS[bi+1]BANDS[bi])*nchannels;
+ Ef += Eb*Eb;
}
/*Using a fixed normalization value means we're willing to accept slightly
lower quality for lower sampling rates.*/
 Ef/=200*nchannels;
+ Ef/=NBANDS;
Ef*=Ef;
err+=Ef*Ef;
}
diff git a/src/opus_decoder.c b/src/opus_decoder.c
index ab79f42..889b5a4 100644
 a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ 427,7 +427,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
pcm[i] = 0;
/* For hybrid > SILK transitions, we let the CELT MDCT
do a fadeout by decoding a silence frame */
 if (st>prev_mode == MODE_HYBRID)
+ if (st>prev_mode == MODE_HYBRID && !(redundancy && celt_to_silk && st>prev_redundancy) )
{
celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0));
celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL);
diff git a/src/opus_demo.c b/src/opus_demo.c
index f97648c..34fba5c 100644
 a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ 102,6 +102,103 @@ static void check_encoder_option(int decode_only, const char *opt)
}
}
+int silk8_test[][4] = {
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*2, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*3, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960*2, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 2}
+};
+
+int silk12_test[][4] = {
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*3, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*2, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 480, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*3, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960*2, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 960, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_MEDIUMBAND, 480, 2}
+};
+
+int silk16_test[][4] = {
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*3, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*2, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*3, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960*2, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 2}
+};
+
+int hybrid24_test[][4] = {
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 2}
+};
+
+int hybrid48_test[][4] = {
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 1},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2},
+ {MODE_SILK_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2}
+};
+
+int celt_test[][4] = {
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 1},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 1},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 240, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 240, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 240, 1},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 120, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 120, 1},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 120, 1},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 960, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 960, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 960, 2},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 480, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 480, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 480, 2},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 240, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 240, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 240, 2},
+
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_SUPERWIDEBAND, 120, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_WIDEBAND, 120, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_NARROWBAND, 120, 2},
+
+};
+
+int celt_hq_test[][4] = {
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 960, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 480, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 240, 2},
+ {MODE_CELT_ONLY, OPUS_BANDWIDTH_FULLBAND, 120, 2},
+};
+
int main(int argc, char *argv[])
{
int err;
@@ 143,6 +240,11 @@ int main(int argc, char *argv[])
int random_framesize=0, newsize=0, delayed_celt=0;
int sweep_max=0, sweep_min=0;
int random_fec=0;
+ int (*mode_list)[4]=NULL;
+ int nb_modes_in_list=0;
+ int curr_mode=0;
+ int curr_mode_count=0;
+ int mode_switch_time = 48000;
if (argc < 5 )
{
@@ 302,6 +404,41 @@ int main(int argc, char *argv[])
check_encoder_option(decode_only, "random_fec");
random_fec = 1;
args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "silk8k_test" ) == 0 ) {
+ check_encoder_option(decode_only, "silk8k_test");
+ mode_list = silk8_test;
+ nb_modes_in_list = 8;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "silk12k_test" ) == 0 ) {
+ check_encoder_option(decode_only, "silk12k_test");
+ mode_list = silk12_test;
+ nb_modes_in_list = 8;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "silk16k_test" ) == 0 ) {
+ check_encoder_option(decode_only, "silk16k_test");
+ mode_list = silk16_test;
+ nb_modes_in_list = 8;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "hybrid24k_test" ) == 0 ) {
+ check_encoder_option(decode_only, "hybrid24k_test");
+ mode_list = hybrid24_test;
+ nb_modes_in_list = 4;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "hybrid48k_test" ) == 0 ) {
+ check_encoder_option(decode_only, "hybrid48k_test");
+ mode_list = hybrid48_test;
+ nb_modes_in_list = 4;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "celt_test" ) == 0 ) {
+ check_encoder_option(decode_only, "celt_test");
+ mode_list = celt_test;
+ nb_modes_in_list = 32;
+ args++;
+ } else if( STR_CASEINSENSITIVE_COMPARE( argv[ args ], "celt_hq_test" ) == 0 ) {
+ check_encoder_option(decode_only, "celt_hq_test");
+ mode_list = celt_hq_test;
+ nb_modes_in_list = 4;
+ args++;
} else {
printf( "Error: unrecognized setting: %s\n\n", argv[ args ] );
print_usage( argv );
@@ 326,6 +463,17 @@ int main(int argc, char *argv[])
fprintf (stderr, "Could not open input file %s\n", argv[argc2]);
return EXIT_FAILURE;
}
+ if (mode_list)
+ {
+ int size;
+ fseek(fin, 0, SEEK_END);
+ size = ftell(fin);
+ fprintf(stderr, "File size is %d bytes\n", size);
+ fseek(fin, 0, SEEK_SET);
+ mode_switch_time = size/sizeof(short)/channels/nb_modes_in_list;
+ fprintf(stderr, "Switching mode every %d samples\n", mode_switch_time);
+ }
+
outFile = argv[argc1];
fout = fopen(outFile, "wb+");
if (!fout)
@@ 428,6 +576,8 @@ int main(int argc, char *argv[])
case 4: newsize=sampling_rate/25; break;
case 5: newsize=3*sampling_rate/50; break;
}
+ while (newsize < sampling_rate/25 && bitrate_bpsfabs(sweep_bps) <= 3*12*sampling_rate/newsize)
+ newsize*=2;
if (newsize < sampling_rate/100 && frame_size >= sampling_rate/100)
{
opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY));
@@ 463,6 +613,13 @@ int main(int argc, char *argv[])
break;
}
} else {
+ if (mode_list!=NULL)
+ {
+ opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(mode_list[curr_mode][1]));
+ opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(mode_list[curr_mode][0]));
+ opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3]));
+ frame_size = mode_list[curr_mode][2];
+ }
err = fread(in, sizeof(short)*channels, frame_size, fin);
curr_read = err;
if (curr_read < frame_size)
@@ 472,7 +629,6 @@ int main(int argc, char *argv[])
in[i] = 0;
stop = 1;
}

len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes);
if (sweep_bps!=0)
{
@@ 497,6 +653,12 @@ int main(int argc, char *argv[])
fclose(fout);
return EXIT_FAILURE;
}
+ curr_mode_count += frame_size;
+ if (curr_mode_count > mode_switch_time && curr_mode < nb_modes_in_list1)
+ {
+ curr_mode++;
+ curr_mode_count = 0;
+ }
}
if (encode_only)
diff git a/tests/run_vectors.sh b/tests/run_vectors.sh
index 81b68f3..9b5c29b 100755
 a/tests/run_vectors.sh
+++ b/tests/run_vectors.sh
@@ 1,12 +1,16 @@
#!/bin/sh
if [ "$#" ne "2" ]; then
 echo "usage: run_vectors.sh "
+rm logs_mono.txt
+rm logs_stereo.txt
+
+if [ "$#" ne "3" ]; then
+ echo "usage: run_vectors.sh "
exit 1
fi
CMD_PATH=$1
VECTOR_PATH=$2
+RATE=$3
OPUS_DEMO=$CMD_PATH/opus_demo
OPUS_COMPARE=$CMD_PATH/opus_compare
@@ 32,24 +36,23 @@ echo Testing mono
echo "=============="
echo
for file in test1_mono test2_mono test3_mono test4_mono test5_mono
+for file in `seq w 1 11`
do
 if [ e $VECTOR_PATH/$file.bit ]; then
 echo Testing $file
+ if [ e $VECTOR_PATH/testvector$file.bit ]; then
+ echo Testing testvector$file
else
 echo Bitstream file not found: $file
+ echo Bitstream file not found: testvector$file.bit
fi
 if $OPUS_DEMO d 48000 1 $VECTOR_PATH/$file.bit tmp.out > /dev/null 2>&1; then
+ if $OPUS_DEMO d $RATE 1 $VECTOR_PATH/testvector$file.bit tmp.out >> logs_mono.txt 2>&1; then
echo successfully decoded
else
echo ERROR: decoding failed
exit 1
fi
 $OPUS_COMPARE $VECTOR_PATH/$file.float tmp.out > /dev/null 2>&1
+ $OPUS_COMPARE r $RATE $VECTOR_PATH/testvector$file.dec tmp.out >> logs_mono.txt 2>&1
+ true
float_ret=$?
 $OPUS_COMPARE $VECTOR_PATH/$file.fixed tmp.out > /dev/null 2>&1
 fixed_ret=$?
 if [ "$float_ret" eq "0" o "$fixed_ret" eq "0" ]; then
+ if [ "$float_ret" eq "0" ]; then
echo output matches reference
else
echo ERROR: output does not match reference
@@ 63,24 +66,22 @@ echo Testing stereo
echo "=============="
echo
for file in test1_stereo test2_stereo test3_stereo test4_stereo
+for file in `seq w 1 11`
do
 if [ e $VECTOR_PATH/$file.bit ]; then
 echo Testing $file
+ if [ e $VECTOR_PATH/testvector$file.bit ]; then
+ echo Testing testvector$file
else
 echo Bitstream file not found: $file
+ echo Bitstream file not found: testvector$file
fi
 if $OPUS_DEMO d 48000 2 $VECTOR_PATH/$file.bit tmp.out > /dev/null 2>&1; then
+ if $OPUS_DEMO d $RATE 2 $VECTOR_PATH/testvector$file.bit tmp.out >> logs_stereo.txt 2>&1; then
echo successfully decoded
else
echo ERROR: decoding failed
exit 1
fi
 $OPUS_COMPARE s $VECTOR_PATH/$file.float tmp.out > /dev/null 2>&1
+ $OPUS_COMPARE s r $RATE $VECTOR_PATH/testvector$file.dec tmp.out >> logs_stereo.txt 2>&1
float_ret=$?
 $OPUS_COMPARE s $VECTOR_PATH/$file.fixed tmp.out > /dev/null 2>&1
 fixed_ret=$?
 if [ "$float_ret" eq "0" o "$fixed_ret" eq "0" ]; then
+ if [ "$float_ret" eq "0" ]; then
echo output matches reference
else
echo ERROR: output does not match reference
@@ 92,3 +93,5 @@ done
echo All tests have passed successfully
+grep quality logs_mono.txt  awk '{sum+=$4}END{print "Average mono quality is", sum/NR, "%"}'
+grep quality logs_stereo.txt  awk '{sum+=$4}END{print "Average stereo quality is", sum/NR, "%"}'

1.7.2.5