summary | refs | log | tree | commit | diff | stats
path: root/Whisper
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-03-17 04:11:18 -0700
committer yum <yum.food.vr@gmail.com> 2023-03-17 04:11:18 -0700
commit aaa0188da81056748ef8ffcd5ad86d6f4bffa6bd (patch)
tree 7f324c5031b6100d1158a4d7f0550ff5b0bd2e29 /Whisper
parent 5e30b2366a4a320f59ed7e0bfcfe72f5f8c9d108 (diff)
begin work disabling vad
Diffstat (limited to 'Whisper')
-rw-r--r-- Whisper/API/MfStructs.h 2
-rw-r--r-- Whisper/Whisper/ContextImpl.capture.cpp 63
-rw-r--r-- Whisper/Whisper/voiceActivityDetection.cpp 5
3 files changed, 44 insertions, 26 deletions
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
index c23d633..d2fee6b 100644
--- a/Whisper/API/MfStructs.h
+++ b/Whisper/API/MfStructs.h
@@ -19,6 +19,8 @@ namespace Whisper
{
// When the capture device supports stereo, keep stereo PCM samples in addition to mono
Stereo = 1,
+ // Don't use voice activity detection (VAD).
+ DisableVAD = 2,
};
// Parameters for audio capture
diff --git a/Whisper/Whisper/ContextImpl.capture.cpp b/Whisper/Whisper/ContextImpl.capture.cpp
index 642eef1..ce51393 100644
--- a/Whisper/Whisper/ContextImpl.capture.cpp
+++ b/Whisper/Whisper/ContextImpl.capture.cpp
@@ -238,35 +238,48 @@ namespace
CHECK( readSample( false ) );
const size_t newSamples = pcm.mono.size();
- const size_t lastVoiceFrame = detectVoice();
- if( lastVoiceFrame == 0 )
- {
- // No voice is detected in the entire buffered audio
- clearStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.dropStartSilence )
+ const bool wantVAD = !(captureParams.flags & (uint32_t)eCaptureFlags::DisableVAD);
+ if (wantVAD) {
+ const size_t lastVoiceFrame = detectVoice();
+ if (lastVoiceFrame == 0)
+ {
+ // No voice is detected in the entire buffered audio
+ clearStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.dropStartSilence)
+ return S_OK;
+
+ pcm.dropFirst(1024);
+ vad.clear();
+ pcmStartTime = nextSampleTime;
return S_OK;
+ }
- pcm.dropFirst(1024);
- vad.clear();
- pcmStartTime = nextSampleTime;
- return S_OK;
- }
-
- const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
+ const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples;
- if( newFrameVoice )
- {
- // A voice is detected in the buffer, and it was fairly recently
- setStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.maxDuration )
- return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time
+ if (newFrameVoice)
+ {
+ // A voice is detected in the buffer, and it was fairly recently
+ setStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.maxDuration)
+ return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time
+ }
+ else
+ {
+ // A voice is detected in the buffer, but it was a while ago
+ clearStateFlag(eCaptureStatus::Voice);
+ if (newSamples < captureParams.minDuration)
+ return S_OK; // When detected pause in the voice, we fire the transcribe task right away.
+ }
}
- else
- {
- // A voice is detected in the buffer, but it was a while ago
- clearStateFlag( eCaptureStatus::Voice );
- if( newSamples < captureParams.minDuration )
- return S_OK; // When detected pause in the voice, we fire the transcribe task right away.
+ else {
+ // VAD is disabled. Pause until minimum duration is reached.
+ if (pcm.mono.size() < captureParams.minDuration) {
+ return S_OK;
+ }
+ // Ensure buffer is not too long.
+ if (pcm.mono.size() >= captureParams.maxDuration) {
+ pcm.retainLast(captureParams.maxDuration);
+ }
}
// Hopefully, we have enough captured PCM data to run the ASR model.
diff --git a/Whisper/Whisper/voiceActivityDetection.cpp b/Whisper/Whisper/voiceActivityDetection.cpp
index c0eb7ef..a0472a3 100644
--- a/Whisper/Whisper/voiceActivityDetection.cpp
+++ b/Whisper/Whisper/voiceActivityDetection.cpp
@@ -9,9 +9,12 @@ using namespace Whisper;
inline VAD::Feature VAD::defaultPrimaryThresholds()
{
Feature f;
+ // Energy primary threshold
f.energy = 40;
+ // Frequency primary threshold
f.F = 185;
- f.SFM = 5;
+ // Spectral flatness measure (SFM) primary threshold
+ f.SFM = 1;
return f;
}