diff options
| author | Konstantin <const@const.me> | 2023-01-28 20:39:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-28 20:39:43 +0100 |
| commit | 1b005e967d51f53271725869197541e931f9ceac (patch) | |
| tree | 21f41debe86b87e55556347850d60fec01144235 | |
| parent | c75bb96b531414a12fb1bfe599383348e56dbdb5 (diff) | |
Diarize feature for buffered audio
| -rw-r--r-- | Whisper/Whisper/MelStreamer.h | 3 | ||||
| -rw-r--r-- | Whisper/Whisper/Spectrogram.cpp | 41 | ||||
| -rw-r--r-- | Whisper/Whisper/Spectrogram.h | 3 |
3 files changed, 46 insertions, 1 deletion
diff --git a/Whisper/Whisper/MelStreamer.h b/Whisper/Whisper/MelStreamer.h index 387622c..50ed0bf 100644 --- a/Whisper/Whisper/MelStreamer.h +++ b/Whisper/Whisper/MelStreamer.h @@ -11,6 +11,7 @@ namespace Whisper { // Base class for both single- and multi-threaded MEL streamers + // Used by iContext.runStreamed method class MelStreamer : public iSpectrogram { protected: @@ -51,6 +52,7 @@ namespace Whisper }; // Single-threaded MEL streamer: runs these FFTs on-demand, from within makeBuffer() method + // Used by iContext.runStreamed method when cpuThreads parameter is less than 2 class MelStreamerSimple : public MelStreamer { HRESULT makeBuffer( size_t offset, size_t length, const float** buffer, size_t& stride ) noexcept override final; @@ -63,6 +65,7 @@ namespace Whisper // Multi threaded MEL streamers: runs FFT on a background thread ahead of time // The background thread tries to keep the queueMel full, this way the makeBuffer() method has very little to do // makeBuffer() only transposes the data, and does clamping + normalization, both steps are pretty fast + // Used by iContext.runStreamed method when cpuThreads parameter is 2 or more class MelStreamerThread : public MelStreamer, ThreadPoolWork { diff --git a/Whisper/Whisper/Spectrogram.cpp b/Whisper/Whisper/Spectrogram.cpp index 76130bd..4d84d7c 100644 --- a/Whisper/Whisper/Spectrogram.cpp +++ b/Whisper/Whisper/Spectrogram.cpp @@ -102,6 +102,22 @@ HRESULT Spectrogram::pcmToMel( const iAudioBuffer* buffer, const Filters& filter f = (float)( ( f + 4.0 ) / 4.0 ); } // DirectCompute::dbgWriteBinaryFile( LR"(C:\Temp\2remove\ML\mel-my.bin)", data.data(), data.size() * 4 ); + const float* const pcmStereo = buffer->getPcmStereo(); + if( nullptr != pcmStereo ) + { + try + { + stereo.resize( countSamples ); + } + catch( const std::bad_alloc& ) + { + return E_OUTOFMEMORY; + } + memcpy( stereo.data(), pcmStereo, countSamples * 8 ); + } + else + stereo.clear(); + return S_OK; } @@ -125,5 +141,28 @@ void 
Whisper::computeSignalEnergy( std::vector<float>& result, const iAudioBuffe HRESULT Spectrogram::copyStereoPcm( size_t offset, size_t length, std::vector<StereoSample>& buffer ) const { - return E_NOTIMPL; + if( stereo.empty() ) + return OLE_E_BLANK; + + length *= FFT_STEP; + offset *= FFT_STEP; + if( offset >= stereo.size() ) + return E_BOUNDS; + + try + { + buffer.resize( length ); + } + catch( const std::bad_alloc& ) + { + return E_OUTOFMEMORY; + } + + const size_t lengthToCopy = std::min( length, stereo.size() - offset ); + memcpy( buffer.data(), &stereo[ offset ], lengthToCopy * 8 ); + if( lengthToCopy == length ) + return S_OK; + + memset( &buffer[ lengthToCopy ], 0, ( buffer.size() - lengthToCopy ) * 8 ); + return S_OK; }
\ No newline at end of file diff --git a/Whisper/Whisper/Spectrogram.h b/Whisper/Whisper/Spectrogram.h index 28019ee..2107d6e 100644 --- a/Whisper/Whisper/Spectrogram.h +++ b/Whisper/Whisper/Spectrogram.h @@ -7,11 +7,14 @@ namespace Whisper { struct iAudioBuffer; + // This implementation of iSpectrogram interface converts complete audio into MEL spectrogram + // Used for unbuffered audio, and capture: iContext.runFull and runCapture methods. class Spectrogram: public iSpectrogram { uint32_t length = 0; static constexpr uint32_t mel = N_MEL; std::vector<float> data; + std::vector<StereoSample> stereo; HRESULT makeBuffer( size_t off, size_t len, const float** buffer, size_t& stride ) noexcept override final { |
