diff options
| author | yum <yum.food.vr@gmail.com> | 2023-03-17 04:11:18 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-03-17 04:11:18 -0700 |
| commit | aaa0188da81056748ef8ffcd5ad86d6f4bffa6bd (patch) | |
| tree | 7f324c5031b6100d1158a4d7f0550ff5b0bd2e29 | |
| parent | 5e30b2366a4a320f59ed7e0bfcfe72f5f8c9d108 (diff) | |
Begin work on disabling VAD (voice activity detection)
| -rw-r--r-- | Evaluate/evaluate.py | 3 | ||||
| -rw-r--r-- | Evaluate/setup.ps1 | 4 | ||||
| -rw-r--r-- | Whisper/API/MfStructs.h | 2 | ||||
| -rw-r--r-- | Whisper/Whisper/ContextImpl.capture.cpp | 63 | ||||
| -rw-r--r-- | Whisper/Whisper/voiceActivityDetection.cpp | 5 |
5 files changed, 48 insertions, 29 deletions
diff --git a/Evaluate/evaluate.py b/Evaluate/evaluate.py index 81b3edf..9d4d2c4 100644 --- a/Evaluate/evaluate.py +++ b/Evaluate/evaluate.py @@ -44,5 +44,6 @@ if __name__ == "__main__": print(f"Duration: {t1 - t0}") print(f"Levenshtein distance: {dist}") - print(f"Transcript: {test_transcript}") + print(f"Control: {ref_transcript}") + print(f"Experiment: {test_transcript}") diff --git a/Evaluate/setup.ps1 b/Evaluate/setup.ps1 index 95e2487..0741aeb 100644 --- a/Evaluate/setup.ps1 +++ b/Evaluate/setup.ps1 @@ -9,8 +9,8 @@ if (-Not (Test-Path $MODEL_FILE)) { Invoke-WebRequest $MODEL_URL -OutFile $MODEL_FILE } -#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3" -$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3" +$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_01_unitedstates_64kb.mp3" +#$AUDIO_URL = "https://www.archive.org/download/usconstitution_1610_librivox/constitution_02_unitedstates_128kb.mp3" $AUDIO_FILE = $(Split-Path -Path $AUDIO_URL -Leaf) if (-Not (Test-Path $AUDIO_FILE)) { echo "Fetch audio" diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h index c23d633..d2fee6b 100644 --- a/Whisper/API/MfStructs.h +++ b/Whisper/API/MfStructs.h @@ -19,6 +19,8 @@ namespace Whisper { // When the capture device supports stereo, keep stereo PCM samples in addition to mono Stereo = 1, + // Don't use voice activity detection (VAD). 
+ DisableVAD = 2, }; // Parameters for audio capture diff --git a/Whisper/Whisper/ContextImpl.capture.cpp b/Whisper/Whisper/ContextImpl.capture.cpp index 642eef1..ce51393 100644 --- a/Whisper/Whisper/ContextImpl.capture.cpp +++ b/Whisper/Whisper/ContextImpl.capture.cpp @@ -238,35 +238,48 @@ namespace CHECK( readSample( false ) ); const size_t newSamples = pcm.mono.size(); - const size_t lastVoiceFrame = detectVoice(); - if( lastVoiceFrame == 0 ) - { - // No voice is detected in the entire buffered audio - clearStateFlag( eCaptureStatus::Voice ); - if( newSamples < captureParams.dropStartSilence ) + const bool wantVAD = !(captureParams.flags & (uint32_t)eCaptureFlags::DisableVAD); + if (wantVAD) { + const size_t lastVoiceFrame = detectVoice(); + if (lastVoiceFrame == 0) + { + // No voice is detected in the entire buffered audio + clearStateFlag(eCaptureStatus::Voice); + if (newSamples < captureParams.dropStartSilence) + return S_OK; + + pcm.dropFirst(1024); + vad.clear(); + pcmStartTime = nextSampleTime; return S_OK; + } - pcm.dropFirst(1024); - vad.clear(); - pcmStartTime = nextSampleTime; - return S_OK; - } - - const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples; + const bool newFrameVoice = lastVoiceFrame + captureParams.pauseDuration >= oldSamples; - if( newFrameVoice ) - { - // A voice is detected in the buffer, and it was fairly recently - setStateFlag( eCaptureStatus::Voice ); - if( newSamples < captureParams.maxDuration ) - return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time + if (newFrameVoice) + { + // A voice is detected in the buffer, and it was fairly recently + setStateFlag(eCaptureStatus::Voice); + if (newSamples < captureParams.maxDuration) + return S_OK; // While voice is continuously detected, we allow to grow the buffer up to `maxDuration` time + } + else + { + // A voice is detected in the buffer, but it was a while ago + 
clearStateFlag(eCaptureStatus::Voice); + if (newSamples < captureParams.minDuration) + return S_OK; // When detected pause in the voice, we fire the transcribe task right away. + } } - else - { - // A voice is detected in the buffer, but it was a while ago - clearStateFlag( eCaptureStatus::Voice ); - if( newSamples < captureParams.minDuration ) - return S_OK; // When detected pause in the voice, we fire the transcribe task right away. + else { + // VAD is disabled. Pause until minimum duration is reached. + if (pcm.mono.size() < captureParams.minDuration) { + return S_OK; + } + // Ensure buffer is not too long. + if (pcm.mono.size() >= captureParams.maxDuration) { + pcm.retainLast(captureParams.maxDuration); + } } // Hopefully, we have enough captured PCM data to run the ASR model. diff --git a/Whisper/Whisper/voiceActivityDetection.cpp b/Whisper/Whisper/voiceActivityDetection.cpp index c0eb7ef..a0472a3 100644 --- a/Whisper/Whisper/voiceActivityDetection.cpp +++ b/Whisper/Whisper/voiceActivityDetection.cpp @@ -9,9 +9,12 @@ using namespace Whisper; inline VAD::Feature VAD::defaultPrimaryThresholds() { Feature f; + // Energy primary threshold f.energy = 40; + // Frequency primary threshold f.F = 185; - f.SFM = 5; + // Spectral flatness measure (SFM) primary threshold + f.SFM = 1; return f; } |
