summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Evaluate/evaluate.py 3
-rw-r--r-- Evaluate/setup.ps1 4
-rw-r--r-- Whisper/API/MfStructs.h 2
-rw-r--r-- Whisper/Whisper/ContextImpl.capture.cpp 63
-rw-r--r-- Whisper/Whisper/voiceActivityDetection.cpp 5
5 files changed, 48 insertions, 29 deletions
diff --git a/Evaluate/evaluate.py b/Evaluate/evaluate.py
index 81b3edf..9d4d2c4 100644
--- a/Evaluate/evaluate.py
+++ b/Evaluate/evaluate.py
@@ -44,5 +44,6 @@ if __name__ == "__main__":
print(f"Duration: {t1 - t0}")
print(f"Levenshtein distance: {dist}")
- print(f"Transcript: {test_transcript}")
+ print(f"Control: {ref_transcript}")
+ print(f"Experiment: {test_transcript}")
diff --git a/Evaluate/setup.ps1 b/Evaluate/setup.ps1
index 95e2487..0741aeb 100644
--- a/Evaluate/setup.ps1
+++ b/Evaluate/setup.ps1
@@ -9,8 +9,8 @@ if (-Not (Test-Path $MODEL_FILE)) {
Invoke-WebRequest $MODEL_URL -OutFile $MODEL_FILE
}
-#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3"
-$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3"
+$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3"
+#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3"
$AUDIO_FILE = $(Split-Path -Path $AUDIO_URL -Leaf)
if (-Not (Test-Path $AUDIO_FILE)) {
echo "Fetch audio"
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
index c23d633..d2fee6b 100644
--- a/Whisper/API/MfStructs.h
+++ b/Whisper/API/MfStructs.h
@@ -19,6 +19,8 @@ namespace Whisper
{
// When the capture device supports stereo, keep stereo PCM samples in addition to mono
Stereo = 1,
+ // Don't use voice activity detection (VAD).
+ DisableVAD = 2,
};
// Parameters for audio capture
diff --git a/Whisper/Whisper/ContextImpl.capture.cpp b/Whisper/Whisper/ContextImpl.capture.cpp
index 642eef1..ce51393 100644
--- a/Whisper/Whisper/ContextImpl.capture.cpp
+++ b/Whisper/Whisper/ContextImpl.capture.cpp
@@ -238,35 +238,48 @@ namespace
CHECK( readSample( false ) );
const size_t newSamples = pcm.mono.size();
- const size_t lastVoiceFrame = detectVoice();
- if( lastVoiceFrame == 0 )
- {
- // No voice is detected in the entire buffered audio
- clearStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.dropStartSilence )
+ const bool wantVAD = !(captureParams.flags & (uint32_t)eCaptureFlags::DisableVAD);
+ if (wantVAD) {
+ const size_t lastVoiceFrame = detectVoice();
+ if (lastVoiceFrame == 0)
+ {
+ // No voice is detected in the entire buffered audio
+ clearStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.dropStartSilence)
+ return S_OK;
+
+ pcm.dropFirst(1024);
+ vad.clear();
+ pcmStartTime = nextSampleTime;
return S_OK;
+ }
- pcm.dropFirst(1024);
- vad.clear();
- pcmStartTime = nextSampleTime;
- return S_OK;
- }
-
- const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
+ const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
- if( newFrameVoice )
- {
- // A voice is detected in the buffer, and it was fairly recently
- setStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.maxDuration )
- return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time
+ if (newFrameVoice)
+ {
+ // A voice is detected in the buffer, and it was fairly recently
+ setStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.maxDuration)
+ return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time
+ }
+ else
+ {
+ // A voice is detected in the buffer, but it was a while ago
+ clearStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.minDuration)
+ return S_OK; // When detected pause in the voice, we fire the transcribe task right away.
+ }
}
- else
- {
- // A voice is detected in the buffer, but it was a while ago
- clearStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.minDuration )
- return S_OK; // When detected pause in the voice, we fire the transcribe task right away.
+ else {
+ // VAD is disabled. Pause until minimum duration is reached.
+ if (pcm.mono.size() < captureParams.minDuration) {
+ return S_OK;
+ }
+ // Ensure buffer is not too long.
+ if (pcm.mono.size() >= captureParams.maxDuration) {
+ pcm.retainLast(captureParams.maxDuration);
+ }
}
// Hopefully, we have enough captured PCM data to run the ASR model.
diff --git a/Whisper/Whisper/voiceActivityDetection.cpp b/Whisper/Whisper/voiceActivityDetection.cpp
index c0eb7ef..a0472a3 100644
--- a/Whisper/Whisper/voiceActivityDetection.cpp
+++ b/Whisper/Whisper/voiceActivityDetection.cpp
@@ -9,9 +9,12 @@ using namespace Whisper;
inline VAD::Feature VAD::defaultPrimaryThresholds()
{
Feature f;
+ // Energy primary threshold
f.energy = 40;
+ // Frequency primary threshold
f.F = 185;
- f.SFM = 5;
+ // Spectral flatness measure (SFM) primary threshold
+ f.SFM = 1;
return f;
}