begin work disabling vad

author: yum <yum.food.vr@gmail.com> 2023-03-17 04:11:18 -0700
committer: yum <yum.food.vr@gmail.com> 2023-03-17 04:11:18 -0700
commit: aaa0188da81056748ef8ffcd5ad86d6f4bffa6bd (patch)
tree: 7f324c5031b6100d1158a4d7f0550ff5b0bd2e29
parent: 5e30b2366a4a320f59ed7e0bfcfe72f5f8c9d108 (diff)
5 files changed, 48 insertions, 29 deletions
diff --git a/Evaluate/evaluate.py b/Evaluate/evaluate.py
index 81b3edf..9d4d2c4 100644
--- a/Evaluate/evaluate.py
+++ b/Evaluate/evaluate.py
@@ -44,5 +44,6 @@ if __name__ == "__main__":
 
     print(f"Duration: {t1 - t0}")
     print(f"Levenshtein distance: {dist}")
-    print(f"Transcript: {test_transcript}")
+    print(f"Control: {ref_transcript}")
+    print(f"Experiment: {test_transcript}")
 
diff --git a/Evaluate/setup.ps1 b/Evaluate/setup.ps1
index 95e2487..0741aeb 100644
--- a/Evaluate/setup.ps1
+++ b/Evaluate/setup.ps1
@@ -9,8 +9,8 @@ if (-Not (Test-Path $MODEL_FILE)) {
   Invoke-WebRequest $MODEL_URL -OutFile $MODEL_FILE
 }
 
-#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3"
-$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3"
+$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3"
+#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3"
 $AUDIO_FILE = $(Split-Path -Path $AUDIO_URL -Leaf)
 if (-Not (Test-Path $AUDIO_FILE)) {
   echo "Fetch audio"
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
index c23d633..d2fee6b 100644
--- a/Whisper/API/MfStructs.h
+++ b/Whisper/API/MfStructs.h
@@ -19,6 +19,8 @@ namespace Whisper
 	{
 		// When the capture device supports stereo, keep stereo PCM samples in addition to mono
 		Stereo = 1,
+		// Don't use voice activity detection (VAD).
+		DisableVAD = 2,
 	};
 
 	// Parameters for audio capture
diff --git a/Whisper/Whisper/ContextImpl.capture.cpp b/Whisper/Whisper/ContextImpl.capture.cpp
index 642eef1..ce51393 100644
--- a/Whisper/Whisper/ContextImpl.capture.cpp
+++ b/Whisper/Whisper/ContextImpl.capture.cpp
@@ -238,35 +238,48 @@ namespace
 		CHECK( readSample( false ) );
 		const size_t newSamples = pcm.mono.size();
 
-		const size_t lastVoiceFrame = detectVoice();
-		if( lastVoiceFrame == 0 )
-		{
-			// No voice is detected in the entire buffered audio
-			clearStateFlag( eCaptureStatus::Voice );
-			if( newSamples < captureParams.dropStartSilence )
+		const bool wantVAD = !(captureParams.flags & (uint32_t)eCaptureFlags::DisableVAD);
+		if (wantVAD) {
+			const size_t lastVoiceFrame = detectVoice();
+			if (lastVoiceFrame == 0)
+			{
+				// No voice is detected in the entire buffered audio
+				clearStateFlag(eCaptureStatus::Voice);
+				if (newSamples < captureParams.dropStartSilence)
+					return S_OK;
+
+				pcm.dropFirst(1024);
+				vad.clear();
+				pcmStartTime = nextSampleTime;
 				return S_OK;
+			}
 
-			pcm.dropFirst(1024);
-			vad.clear();
-			pcmStartTime = nextSampleTime;
-			return S_OK;
-		}
-
-		const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
+			const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
 
-		if( newFrameVoice )
-		{
-			// A voice is detected in the buffer, and it was fairly recently
-			setStateFlag( eCaptureStatus::Voice );
-			if( newSamples < captureParams.maxDuration )
-				return S_OK;	// While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time
+			if (newFrameVoice)
+			{
+				// A voice is detected in the buffer, and it was fairly recently
+				setStateFlag(eCaptureStatus::Voice);
+				if (newSamples < captureParams.maxDuration)
+					return S_OK;	// While voice is continuously detected, we allow the buffer to grow up to `maxDuration`
+			}
+			else
+			{
+				// A voice is detected in the buffer, but it was a while ago
+				clearStateFlag(eCaptureStatus::Voice);
+				if (newSamples < captureParams.minDuration)
+					return S_OK;	// When a pause in the voice is detected, we fire the transcribe task right away.
+			}
 		}
-		else
-		{
-			// A voice is detected in the buffer, but it was a while ago
-			clearStateFlag( eCaptureStatus::Voice );
-			if( newSamples < captureParams.minDuration )
-				return S_OK;	// When detected pause in the voice, we fire the transcribe task right away.
+		else {
+			// VAD is disabled. Pause until minimum duration is reached.
+			if (pcm.mono.size() < captureParams.minDuration) {
+				return S_OK;
+			}
+			// Ensure buffer is not too long.
+			if (pcm.mono.size() >= captureParams.maxDuration) {
+				pcm.retainLast(captureParams.maxDuration);
+			}
 		}
 
 		// Hopefully, we have enough captured PCM data to run the ASR model.
diff --git a/Whisper/Whisper/voiceActivityDetection.cpp b/Whisper/Whisper/voiceActivityDetection.cpp
index c0eb7ef..a0472a3 100644
--- a/Whisper/Whisper/voiceActivityDetection.cpp
+++ b/Whisper/Whisper/voiceActivityDetection.cpp
@@ -9,9 +9,12 @@ using namespace Whisper;
 inline VAD::Feature VAD::defaultPrimaryThresholds()
 {
 	Feature f;
+	// Energy primary threshold
 	f.energy = 40;
+	// Frequency primary threshold
 	f.F = 185;
-	f.SFM = 5;
+	// Spectral flatness measure (SFM) primary threshold
+	f.SFM = 1;
 	return f;
 }
author	yum <yum.food.vr@gmail.com>	2023-03-17 04:11:18 -0700
committer	yum <yum.food.vr@gmail.com>	2023-03-17 04:11:18 -0700
commit	aaa0188da81056748ef8ffcd5ad86d6f4bffa6bd (patch)
tree	7f324c5031b6100d1158a4d7f0550ff5b0bd2e29
parent	5e30b2366a4a320f59ed7e0bfcfe72f5f8c9d108 (diff)