summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Konstantin <const@const.me> 2023-01-28 20:39:43 +0100
committer Konstantin <const@const.me> 2023-01-28 20:39:43 +0100
commit 1b005e967d51f53271725869197541e931f9ceac (patch)
tree 21f41debe86b87e55556347850d60fec01144235
parent c75bb96b531414a12fb1bfe599383348e56dbdb5 (diff)
Diarize feature for buffered audio
-rw-r--r-- Whisper/Whisper/MelStreamer.h 3
-rw-r--r-- Whisper/Whisper/Spectrogram.cpp 41
-rw-r--r-- Whisper/Whisper/Spectrogram.h 3
3 files changed, 46 insertions, 1 deletions
diff --git a/Whisper/Whisper/MelStreamer.h b/Whisper/Whisper/MelStreamer.h
index 387622c..50ed0bf 100644
--- a/Whisper/Whisper/MelStreamer.h
+++ b/Whisper/Whisper/MelStreamer.h
@@ -11,6 +11,7 @@
namespace Whisper
{
// Base class for both single- and multi-threaded MEL streamers
+ // Used by iContext.runStreamed method
class MelStreamer : public iSpectrogram
{
protected:
@@ -51,6 +52,7 @@ namespace Whisper
};
// Single-threaded MEL streamer: runs these FFTs on-demand, from within makeBuffer() method
+ // Used by iContext.runStreamed method when cpuThreads parameter is less than 2
class MelStreamerSimple : public MelStreamer
{
HRESULT makeBuffer( size_t offset, size_t length, const float** buffer, size_t& stride ) noexcept override final;
@@ -63,6 +65,7 @@ namespace Whisper
// Multi threaded MEL streamers: runs FFT on a background thread ahead of time
// The background thread tries to keep the queueMel full, this way the makeBuffer() method has very little to do
// makeBuffer() only transposes the data, and does clamping + normalization, both steps are pretty fast
+ // Used by iContext.runStreamed method when cpuThreads parameter is 2 or more
class MelStreamerThread : public MelStreamer,
ThreadPoolWork
{
diff --git a/Whisper/Whisper/Spectrogram.cpp b/Whisper/Whisper/Spectrogram.cpp
index 76130bd..4d84d7c 100644
--- a/Whisper/Whisper/Spectrogram.cpp
+++ b/Whisper/Whisper/Spectrogram.cpp
@@ -102,6 +102,22 @@ HRESULT Spectrogram::pcmToMel( const iAudioBuffer* buffer, const Filters& filter
f = (float)( ( f + 4.0 ) / 4.0 );
}
// DirectCompute::dbgWriteBinaryFile( LR"(C:\Temp\2remove\ML\mel-my.bin)", data.data(), data.size() * 4 );
+ const float* const pcmStereo = buffer->getPcmStereo();
+ if( nullptr != pcmStereo )
+ {
+ try
+ {
+ stereo.resize( countSamples );
+ }
+ catch( const std::bad_alloc& )
+ {
+ return E_OUTOFMEMORY;
+ }
+ memcpy( stereo.data(), pcmStereo, countSamples * 8 );
+ }
+ else
+ stereo.clear();
+
return S_OK;
}
@@ -125,5 +141,28 @@ void Whisper::computeSignalEnergy( std::vector<float>& result, const iAudioBuffe
HRESULT Spectrogram::copyStereoPcm( size_t offset, size_t length, std::vector<StereoSample>& buffer ) const
{
- return E_NOTIMPL;
+ if( stereo.empty() )
+ return OLE_E_BLANK;
+
+ length *= FFT_STEP;
+ offset *= FFT_STEP;
+ if( offset >= stereo.size() )
+ return E_BOUNDS;
+
+ try
+ {
+ buffer.resize( length );
+ }
+ catch( const std::bad_alloc& )
+ {
+ return E_OUTOFMEMORY;
+ }
+
+ const size_t lengthToCopy = std::min( length, stereo.size() - offset );
+ memcpy( buffer.data(), &stereo[ offset ], lengthToCopy * 8 );
+ if( lengthToCopy == length )
+ return S_OK;
+
+ memset( &buffer[ lengthToCopy ], 0, ( buffer.size() - lengthToCopy ) * 8 );
+ return S_OK;
} \ No newline at end of file
diff --git a/Whisper/Whisper/Spectrogram.h b/Whisper/Whisper/Spectrogram.h
index 28019ee..2107d6e 100644
--- a/Whisper/Whisper/Spectrogram.h
+++ b/Whisper/Whisper/Spectrogram.h
@@ -7,11 +7,14 @@ namespace Whisper
{
struct iAudioBuffer;
+ // This implementation of iSpectrogram interface converts complete audio into MEL spectrogram
+ // Used for unbuffered audio, and capture: iContext.runFull and runCapture methods.
class Spectrogram: public iSpectrogram
{
uint32_t length = 0;
static constexpr uint32_t mel = N_MEL;
std::vector<float> data;
+ std::vector<StereoSample> stereo;
HRESULT makeBuffer( size_t off, size_t len, const float** buffer, size_t& stride ) noexcept override final
{