From 4eb399d7f4d97f5f20081db6196dba04d22113a4 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin
Date: Mon, 1 Jul 2013 20:19:24 -0400
Subject: [PATCH] More speech/music detection comments (delayed decision)
---
src/analysis.c | 18 ++++++++++++++++--
src/analysis.h | 4 ++++
2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/src/analysis.c b/src/analysis.c
index a890a06b..a9d2073f 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -177,6 +177,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
curr_lookahead = IMAX(curr_lookahead-10, 0);
psum=0;
+ /* Summing the probability of transition patterns that involve music at
+ time (DETECT_SIZE-curr_lookahead-1) */
for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
psum += tonal->pmusic[i];
for (;icount==1)
@@ -529,18 +536,25 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
tonal->pspeech[0]=.5;
tonal->pmusic [0]=.5;
}
+ /* Updated probability of having only speech (s0) or only music (m0),
+ before considering the new observation. */
s0 = tonal->pspeech[0] + tonal->pspeech[1];
m0 = tonal->pmusic [0] + tonal->pmusic [1];
+ /* Update the all-speech (pspeech[0]) and all-music (pmusic[0]) probabilities from s0 and m0 using the instantaneous probabilities. */
tonal->pspeech[0] = s0*(1-tau)*speech0;
tonal->pmusic [0] = m0*(1-tau)*music0;
+ /* Propagate the transition probabilities */
for (i=1;i<DETECT_SIZE-1;i++)
{
tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
}
+ /* Probability that the latest frame is speech, when all the previous ones were music. */
tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+ /* Probability that the latest frame is music, when all the previous ones were speech. */
tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+ /* Renormalise the probabilities so that they sum to 1 */
for (i=0;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i] + tonal->pmusic[i];
psum = 1.f/psum;
diff --git a/src/analysis.h b/src/analysis.h
index bce94a51..8cd78883 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -62,7 +62,11 @@ typedef struct {
int count;
opus_val32 subframe_mem[3];
int analysis_offset;
+ /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
+ pspeech[0] is the probability that all frames in the window are speech. */
float pspeech[DETECT_SIZE];
+ /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
+ pmusic[0] is the probability that all frames in the window are music. */
float pmusic[DETECT_SIZE];
float speech_confidence;
float music_confidence;
--
2.11.0