summaryrefslogtreecommitdiffstats
path: root/Whisper/MF
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-16 14:52:43 +0100
committerKonstantin <const@const.me>2023-01-16 14:52:43 +0100
commit8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/MF
parent990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Whisper/MF')
-rw-r--r--Whisper/MF/AudioBuffer.cpp93
-rw-r--r--Whisper/MF/AudioBuffer.h41
-rw-r--r--Whisper/MF/AudioCapture.cpp167
-rw-r--r--Whisper/MF/AudioCapture.h12
-rw-r--r--Whisper/MF/MediaFoundation.cpp109
-rw-r--r--Whisper/MF/PcmReader.cpp274
-rw-r--r--Whisper/MF/PcmReader.h63
-rw-r--r--Whisper/MF/loadAudioFile.cpp151
-rw-r--r--Whisper/MF/loadAudioFile.h7
-rw-r--r--Whisper/MF/mfStartup.cpp128
-rw-r--r--Whisper/MF/mfStartup.h15
-rw-r--r--Whisper/MF/mfUtils.cpp69
-rw-r--r--Whisper/MF/mfUtils.h15
13 files changed, 1144 insertions, 0 deletions
diff --git a/Whisper/MF/AudioBuffer.cpp b/Whisper/MF/AudioBuffer.cpp
new file mode 100644
index 0000000..ba4752d
--- /dev/null
+++ b/Whisper/MF/AudioBuffer.cpp
@@ -0,0 +1,93 @@
+#include "stdafx.h"
+#include "AudioBuffer.h"
+using namespace Whisper;
+
+// Append single-channel PCM to the end of the `mono` vector.
+// rsi points to countFloats consecutive samples; the `stereo` vector is left alone.
+void AudioBuffer::appendMono( const float* rsi, size_t countFloats )
+{
+    const float* const rsiEnd = rsi + countFloats;
+    mono.insert( mono.end(), rsi, rsiEnd );
+}
+
+// Append interleaved stereo PCM to this buffer.
+// The input is copied verbatim to the end of `stereo`, and the average of left and right channels is appended to `mono`.
+// countFloats is the count of floats at rsi, i.e. twice the count of stereo samples; it is expected to be even.
+void AudioBuffer::appendStereo( const float* rsi, size_t countFloats )
+{
+    assert( 0 == ( countFloats % 2 ) );
+    const size_t countSamples = countFloats / 2;
+    // Nothing to append. Returning early also guarantees the destination pointers below are never computed for an empty range.
+    if( 0 == countSamples )
+        return;
+
+    const size_t oldLength = mono.size();
+    assert( oldLength * 2 == stereo.size() );
+    mono.resize( oldLength + countSamples );
+    stereo.resize( ( oldLength + countSamples ) * 2 );
+
+    const float* const rsiEnd = rsi + countSamples * 2;
+    // End of the portion which can be processed 8 floats = 4 stereo samples at a time
+    const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8;
+
+    // data() + offset instead of operator[], so no element is indexed while forming the pointers
+    float* rdiStereo = stereo.data() + oldLength * 2;
+    float* rdiMono = mono.data() + oldLength;
+
+    const __m128 half = _mm_set1_ps( 0.5f );
+    for( ; rsi < rsiEndAligned; rsi += 8, rdiStereo += 8, rdiMono += 4 )
+    {
+        // Load 4 samples = 8 floats
+        __m128 v0 = _mm_loadu_ps( rsi );     // L0, R0, L1, R1
+        __m128 v1 = _mm_loadu_ps( rsi + 4 ); // L2, R2, L3, R3
+
+        // Store into the stereo PCM vector
+        _mm_storeu_ps( rdiStereo, v0 );
+        _mm_storeu_ps( rdiStereo + 4, v1 );
+
+        // De-interleave: even lanes are the left channel, odd lanes are the right channel
+        __m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) );
+        __m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) );
+        // Compute and store the average of these channels
+        __m128 sum = _mm_add_ps( left, right );
+        sum = _mm_mul_ps( sum, half );
+        _mm_storeu_ps( rdiMono, sum );
+    }
+
+    // Remainder, one stereo sample per iteration
+#pragma loop (no_vector)
+    for( ; rsi < rsiEnd; rsi += 2, rdiStereo += 2, rdiMono++ )
+    {
+        // A single 64-bit load/store moves both floats of the sample
+        __m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+        _mm_store_sd( (double*)rdiStereo, _mm_castps_pd( vec ) );
+
+        // Lane 0 = left + right, then halve
+        vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) );
+        vec = _mm_mul_ss( vec, half );
+        _mm_store_ss( rdiMono, vec );
+    }
+}
+
+// Downmix interleaved stereo PCM to mono, and append the result to the `mono` vector.
+// Each output sample is the average of the left and right channels. The `stereo` vector is not touched.
+// countFloats is the count of floats at rsi, i.e. twice the count of stereo samples; it is expected to be even.
+void AudioBuffer::appendDownmixedStereo( const float* rsi, size_t countFloats )
+{
+    assert( 0 == ( countFloats % 2 ) );
+    const size_t countSamples = countFloats / 2;
+    // Nothing to append. Returning early also guarantees the destination pointer below is never computed for an empty range.
+    if( 0 == countSamples )
+        return;
+
+    const size_t oldLength = mono.size();
+    mono.resize( oldLength + countSamples );
+
+    const float* const rsiEnd = rsi + countSamples * 2;
+    // End of the portion which can be processed 8 floats = 4 stereo samples at a time
+    const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8;
+
+    // data() + offset instead of operator[], so no element is indexed while forming the pointer
+    float* rdiMono = mono.data() + oldLength;
+
+    const __m128 half = _mm_set1_ps( 0.5f );
+    for( ; rsi < rsiEndAligned; rsi += 8, rdiMono += 4 )
+    {
+        // Load 4 samples = 8 floats
+        __m128 v0 = _mm_loadu_ps( rsi );     // L0, R0, L1, R1
+        __m128 v1 = _mm_loadu_ps( rsi + 4 ); // L2, R2, L3, R3
+
+        // De-interleave, then compute and store the average of these channels
+        __m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) );
+        __m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) );
+        __m128 sum = _mm_add_ps( left, right );
+        sum = _mm_mul_ps( sum, half );
+        _mm_storeu_ps( rdiMono, sum );
+    }
+
+    // Remainder, one stereo sample per iteration
+#pragma loop (no_vector)
+    for( ; rsi < rsiEnd; rsi += 2, rdiMono++ )
+    {
+        // A single 64-bit load brings both floats of the sample
+        __m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+        // Lane 0 = left + right, then halve
+        vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) );
+        vec = _mm_mul_ss( vec, half );
+        _mm_store_ss( rdiMono, vec );
+    }
+} \ No newline at end of file
diff --git a/Whisper/MF/AudioBuffer.h b/Whisper/MF/AudioBuffer.h
new file mode 100644
index 0000000..87319dd
--- /dev/null
+++ b/Whisper/MF/AudioBuffer.h
@@ -0,0 +1,41 @@
+#pragma once
+#include <vector>
+
+namespace Whisper
+{
+    // Growable PCM storage: a mono track, and optionally the interleaved stereo version of the same audio
+    struct AudioBuffer
+    {
+        // One float per sample
+        std::vector<float> mono;
+        // Two floats per sample when populated, otherwise empty
+        std::vector<float> stereo;
+
+        // The three ways to append incoming samples; see the implementations for details
+        void appendMono( const float* rsi, size_t countFloats );
+        void appendDownmixedStereo( const float* rsi, size_t countFloats );
+        void appendStereo( const float* rsi, size_t countFloats );
+
+        // Pointer to one of the append methods above
+        using pfnAppendSamples = void( AudioBuffer::* )( const float* rsi, size_t countFloats );
+
+        // Select the append method for the combination of the source channel layout and the requested output
+        inline static pfnAppendSamples appendSamplesFunc( bool sourceMono, bool wantStereo )
+        {
+            if( sourceMono )
+                return &AudioBuffer::appendMono;
+            return wantStereo ? &AudioBuffer::appendStereo : &AudioBuffer::appendDownmixedStereo;
+        }
+
+        // Drop all samples from both vectors
+        void clear()
+        {
+            stereo.clear();
+            mono.clear();
+        }
+
+        // Truncate the buffer to `len` samples; the stereo vector is only touched when it has data
+        void resize( size_t len )
+        {
+            assert( len <= mono.size() );
+            mono.resize( len );
+            if( stereo.empty() )
+                return;
+            stereo.resize( len * 2 );
+        }
+    };
+} \ No newline at end of file
diff --git a/Whisper/MF/AudioCapture.cpp b/Whisper/MF/AudioCapture.cpp
new file mode 100644
index 0000000..17f34dc
--- /dev/null
+++ b/Whisper/MF/AudioCapture.cpp
@@ -0,0 +1,167 @@
+#include "stdafx.h"
+#include <atlstr.h>
+#include <mfapi.h>
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "AudioCapture.h"
+#include "../API/iMediaFoundation.cl.h"
+#include "../ComLightLib/comLightServer.h"
+#pragma comment(lib, "Mf.lib")
+
+namespace
+{
+    // The two strings collected for every audio capture device
+    struct Strings
+    {
+        CString displayName;
+        CString endpoint;
+    };
+
+    // Read a string attribute of the activation object into a CString.
+    // The memory returned by GetAllocatedString is released with CoTaskMemFree regardless of the outcome.
+    HRESULT getAllocString( IMFActivate* activate, const GUID& id, CString& rdi )
+    {
+        wchar_t* allocated = nullptr;
+        UINT32 length;
+        const HRESULT status = activate->GetAllocatedString( id, &allocated, &length );
+        if( SUCCEEDED( status ) )
+            rdi.SetString( allocated, length );
+        CoTaskMemFree( allocated );
+        return status;
+    }
+
+    // Fill the structure with the friendly name and the audio capture endpoint ID of the device
+    HRESULT getInfo( IMFActivate* activate, Strings& rdi )
+    {
+        CString& name = rdi.displayName;
+        CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME, name ) );
+
+        CString& endpointId = rdi.endpoint;
+        CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, endpointId ) );
+
+        return S_OK;
+    }
+
+    // Convert the array of activation objects into an array of sCaptureDevice structures, and pass it to the callback.
+    // Null entries and devices whose strings can't be retrieved are skipped.
+    // The callback is called exactly once, with zero count and null pointer when there's nothing to report.
+    HRESULT __stdcall supplyDevices( Whisper::pfnFoundCaptureDevices pfn, void* pv, IMFActivate** ppDevices, UINT32 count )
+    {
+        if( nullptr == ppDevices || 0 == count )
+            return pfn( 0, nullptr, pv );
+
+        // First pass: gather the strings, they own the memory for the duration of the callback
+        std::vector<Strings> collected;
+        collected.reserve( count );
+        IMFActivate** const ppEnd = ppDevices + count;
+        for( IMFActivate** pp = ppDevices; pp < ppEnd; pp++ )
+        {
+            IMFActivate* const device = *pp;
+            if( nullptr == device )
+                continue;
+
+            Strings item;
+            if( FAILED( getInfo( device, item ) ) )
+                continue;
+
+            collected.emplace_back( std::move( item ) );
+        }
+
+        if( collected.empty() )
+            return pfn( 0, nullptr, pv );
+
+        // Second pass: build the array of structures referencing these strings
+        std::vector<Whisper::sCaptureDevice> devices;
+        devices.resize( collected.size() );
+        auto dest = devices.begin();
+        for( const Strings& src : collected )
+        {
+            dest->displayName = src.displayName;
+            dest->endpoint = src.endpoint;
+            ++dest;
+        }
+        return pfn( (int)devices.size(), devices.data(), pv );
+    }
+}
+
+// Enumerate audio capture devices with Media Foundation, and report them to the callback.
+// pfn is called once with the list of devices, pv is passed through to it unchanged.
+// Returns the value returned by the callback, or the error code of the failed MF call.
+HRESULT __stdcall Whisper::captureDeviceList( pfnFoundCaptureDevices pfn, void* pv )
+{
+    // Create an attribute store to hold the search criteria.
+    CComPtr<IMFAttributes> attrs;
+    CHECK( MFCreateAttributes( &attrs, 1 ) );
+    // Request audio capture devices
+    CHECK( attrs->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) );
+
+    // Enumerate the devices
+    IMFActivate** ppDevices = nullptr;
+    UINT32 count = 0;
+    CHECK( MFEnumDeviceSources( attrs, &ppDevices, &count ) );
+
+    // Feed the data to the caller
+    HRESULT hr = supplyDevices( pfn, pv, ppDevices, count );
+
+    // Free the memory: release every activation object, then the array itself
+    for( UINT32 i = 0; i < count; i++ )
+    {
+        // supplyDevices() tolerates null entries in the array, the cleanup does the same
+        IMFActivate* const activate = ppDevices[ i ];
+        if( nullptr != activate )
+            activate->Release();
+    }
+    CoTaskMemFree( ppDevices );
+
+    return hr;
+}
+
+namespace
+{
+ using namespace Whisper;
+
+    // Implementation of iAudioCapture: a source reader for an audio capture device, plus the capture parameters
+    class Capture : public ComLight::ObjectRoot<iAudioCapture>
+    {
+        CComPtr<IMFSourceReader> reader;
+        // Reference to the owner object, retained for as long as this object is alive
+        CComPtr<iMediaFoundation> mediaFoundation;
+        sCaptureParams captureParams;
+
+        // Return the source reader with an extra reference for the caller
+        HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final
+        {
+            if( nullptr == pp )
+                return E_POINTER;
+            IMFSourceReader* const raw = reader;
+            if( nullptr != raw )
+                raw->AddRef();
+            *pp = raw;
+            return S_OK;
+        }
+        // Parameters supplied to open()
+        const sCaptureParams& COMLIGHTCALL getParams() const noexcept override final
+        {
+            return captureParams;
+        }
+    public:
+        HRESULT open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp );
+    };
+
+    // Create a media source for the capture endpoint, wrap it into a source reader,
+    // then remember the capture parameters and retain the owner.
+    HRESULT Capture::open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp )
+    {
+        // Attribute store describing the device: an audio capture source with the specified endpoint ID
+        CComPtr<IMFAttributes> attrs;
+        CHECK( MFCreateAttributes( &attrs, 2 ) );
+        CHECK( attrs->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) );
+        CHECK( attrs->SetString( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, endpoint ) );
+
+        CComPtr<IMFMediaSource> source;
+        const HRESULT hrSource = MFCreateDeviceSource( attrs, &source );
+        if( FAILED( hrSource ) )
+        {
+            logErrorHr( hrSource, u8"MFCreateDeviceSource" );
+            return hrSource;
+        }
+
+        // TODO: implement IMFSourceReaderCallback, pass into MF_SOURCE_READER_ASYNC_CALLBACK attribute
+        // This is to support cancellation
+        const HRESULT hrReader = MFCreateSourceReaderFromMediaSource( source, nullptr, &reader );
+        if( FAILED( hrReader ) )
+        {
+            logErrorHr( hrReader, u8"MFCreateSourceReaderFromMediaSource" );
+            return hrReader;
+        }
+
+        mediaFoundation = owner;
+        captureParams = cp;
+        return S_OK;
+    }
+}
+
+// Create a Capture object for the specified endpoint, and return it in *pp.
+// Fails with E_POINTER when either the endpoint or the output pointer is null.
+HRESULT __stdcall Whisper::captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept
+{
+    if( !endpoint || !pp )
+        return E_POINTER;
+
+    using CaptureObject = ComLight::Object<Capture>;
+    ComLight::CComPtr<CaptureObject> capture;
+    CHECK( CaptureObject::create( capture ) );
+    CHECK( capture->open( owner, endpoint, captureParams ) );
+
+    // Transfer the ownership to the caller
+    capture.detach( pp );
+    return S_OK;
+} \ No newline at end of file
diff --git a/Whisper/MF/AudioCapture.h b/Whisper/MF/AudioCapture.h
new file mode 100644
index 0000000..276ee4b
--- /dev/null
+++ b/Whisper/MF/AudioCapture.h
@@ -0,0 +1,12 @@
+#pragma once
+#include "../API/MfStructs.h"
+
+namespace Whisper
+{
+ struct iAudioCapture;
+ struct iMediaFoundation;
+
+ HRESULT __stdcall captureDeviceList( pfnFoundCaptureDevices pfn, void* pv ); // Enumerate audio capture devices, report them to pfn; pv is passed through to the callback
+
+ HRESULT __stdcall captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept; // Open the capture device with the endpoint ID; E_POINTER when endpoint or pp is null
+} \ No newline at end of file
diff --git a/Whisper/MF/MediaFoundation.cpp b/Whisper/MF/MediaFoundation.cpp
new file mode 100644
index 0000000..4a4f6a2
--- /dev/null
+++ b/Whisper/MF/MediaFoundation.cpp
@@ -0,0 +1,109 @@
+#include "stdafx.h"
+#include "../API/iMediaFoundation.cl.h"
+#include "mfStartup.h"
+#include "../ComLightLib/comLightServer.h"
+#include "loadAudioFile.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "mfUtils.h"
+#include "AudioCapture.h"
+
+namespace Whisper
+{
+    // Implementation of iAudioReader: a source reader created from a file or URL, plus the stereo flag requested by the user
+    class AudioReader : public ComLight::ObjectRoot<iAudioReader>
+    {
+        CComPtr<IMFSourceReader> reader;
+        // Initialized so requestedStereo() never reads an indeterminate value, even if called before open()
+        bool wantStereo = false;
+        // Reference to the owner object, retained for as long as this object is alive
+        CComPtr<iMediaFoundation> mediaFoundation;
+
+        // Return the source reader with an extra reference for the caller
+        HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final
+        {
+            if( pp == nullptr )
+                return E_POINTER;
+            CComPtr<IMFSourceReader> res = reader;
+            *pp = res.Detach();
+            return S_OK;
+        }
+        // S_OK when stereo output was requested in open(), S_FALSE otherwise
+        HRESULT COMLIGHTCALL requestedStereo() const noexcept override final
+        {
+            return wantStereo ? S_OK : S_FALSE;
+        }
+        // Duration of the media as reported by getStreamDuration(); OLE_E_BLANK when there's no reader
+        HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const noexcept override final
+        {
+            if( reader )
+                return getStreamDuration( reader, rdi );
+            return OLE_E_BLANK;
+        }
+    public:
+        // Create the source reader for the path, remember the stereo flag, and retain the owner
+        HRESULT open( iMediaFoundation* owner, LPCTSTR path, bool stereo )
+        {
+            HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader );
+            if( FAILED( hr ) )
+            {
+                logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" );
+                return hr;
+            }
+            wantStereo = stereo;
+            mediaFoundation = owner;
+            logDebug16( L"Created source reader from the file \"%s\"", path );
+            return S_OK;
+        }
+    };
+
+    // Implementation of iMediaFoundation.
+    // The object starts Media Foundation in FinalConstruct, and the RAII member shuts it down when the object is destroyed.
+    class MediaFoundation : public ComLight::ObjectRoot<iMediaFoundation>
+    {
+        MfStartupRaii raii;
+        // ID of the thread which called FinalConstruct
+        DWORD tid = ~(DWORD)0;
+
+        // Decode the complete audio file into a buffer in memory
+        virtual HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const noexcept override final
+        {
+            return Whisper::loadAudioFile( path, stereo, pp );
+        }
+        // Create a reader object for streaming the audio file
+        virtual HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) noexcept override final
+        {
+            if( !path || !pp )
+                return E_POINTER;
+
+            using ReaderObject = ComLight::Object<AudioReader>;
+            ComLight::CComPtr<ReaderObject> audioReader;
+            CHECK( ReaderObject::create( audioReader ) );
+            CHECK( audioReader->open( this, path, stereo ) );
+
+            audioReader.detach( pp );
+            return S_OK;
+        }
+        // Enumerate audio capture devices
+        HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) noexcept override final
+        {
+            return captureDeviceList( pfn, pv );
+        }
+        // Open an audio capture device
+        HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept override final
+        {
+            return captureOpen( this, endpoint, captureParams, pp );
+        }
+    protected:
+
+        // Start Media Foundation, and remember the current thread
+        HRESULT FinalConstruct()
+        {
+            CHECK( raii.startup() );
+            tid = GetCurrentThreadId();
+            return S_OK;
+        }
+
+    public:
+
+        // Debug builds verify the object is destroyed on the thread which constructed it
+        ~MediaFoundation() override
+        {
+            assert( GetCurrentThreadId() == tid );
+        }
+    };
+}
+
+// Create the MediaFoundation object, and return it in *pp. Fails with E_POINTER when pp is null.
+HRESULT COMLIGHTCALL Whisper::initMediaFoundation( iMediaFoundation** pp )
+{
+    if( !pp )
+        return E_POINTER;
+
+    using MfObject = ComLight::Object<MediaFoundation>;
+    ComLight::CComPtr<MfObject> result;
+    CHECK( MfObject::create( result ) );
+
+    // Transfer the ownership to the caller
+    result.detach( pp );
+    return S_OK;
+} \ No newline at end of file
diff --git a/Whisper/MF/PcmReader.cpp b/Whisper/MF/PcmReader.cpp
new file mode 100644
index 0000000..ab92fc3
--- /dev/null
+++ b/Whisper/MF/PcmReader.cpp
@@ -0,0 +1,274 @@
+#include "stdafx.h"
+#include "PcmReader.h"
+#include <mfapi.h>
+#include "mfUtils.h"
+
+namespace Whisper
+{
+ __interface iSampleHandler
+ {
+ void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const; // Copy a complete chunk of FFT_STEP samples starting at sourceOffset
+ void moveBufferData( AudioBuffer& rdi, size_t amount ) const; // Drop the first `amount` samples, moving the remaining ones to the start of the buffer
+ void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const; // Append countFloats floats delivered by the reader to the buffer
+ void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const; // Copy an incomplete chunk of `samples` samples, the rest of the output is filled with zeros
+ uint32_t readerChannelsCount() const; // Count of channels in the PCM data the handler expects from the reader
+ };
+}
+
+namespace
+{
+ using namespace Whisper;
+
+    // Copy `samples` mono samples starting at sourceOffset into the chunk; when the count is less than FFT_STEP, zero the remainder
+    __forceinline void copyMono( PcmMonoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples )
+    {
+        assert( sourceOffset + samples <= rsi.mono.size() );
+
+        float* const dest = rdi->mono.data();
+        memcpy( dest, &rsi.mono[ sourceOffset ], samples * sizeof( float ) );
+        if( samples >= FFT_STEP )
+            return;
+
+        const size_t padding = FFT_STEP - samples;
+        memset( dest + samples, 0, padding * sizeof( float ) );
+    }
+
+    // Copy `samples` interleaved stereo samples starting at sourceOffset into the chunk; when the count is less than FFT_STEP, zero the remainder
+    __forceinline void copyStereo( PcmStereoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples )
+    {
+        // Two floats per stereo sample
+        constexpr size_t cbSample = 2 * sizeof( float );
+
+        float* const dest = rdi->stereo.data();
+        memcpy( dest, &rsi.stereo[ sourceOffset * 2 ], samples * cbSample );
+        if( samples >= FFT_STEP )
+            return;
+
+        const size_t padding = FFT_STEP - samples;
+        memset( dest + samples * 2, 0, padding * cbSample );
+    }
+
+    // Handler for mono sources: only the mono vector of the buffer is used, the stereo output pointer is ignored
+    struct HandlerMono : iSampleHandler
+    {
+        void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override
+        {
+            rdi.appendMono( rsi, countFloats );
+        }
+        // Complete chunk
+        void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final
+        {
+            copyMono( pMono, rsi, sourceOffset, FFT_STEP );
+        }
+        // Incomplete chunk, padded with zeros
+        void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final
+        {
+            copyMono( pMono, rsi, sourceOffset, samples );
+        }
+        // Discard the first `amount` samples, keep the tail at the start of the vector
+        void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final
+        {
+            std::vector<float>& vec = rdi.mono;
+            const size_t len = vec.size();
+            assert( amount <= len );
+            if( amount >= len )
+            {
+                vec.clear();
+                return;
+            }
+
+            const size_t remaining = len - amount;
+            memmove( vec.data(), vec.data() + amount, remaining * sizeof( float ) );
+            vec.resize( remaining );
+        }
+        uint32_t readerChannelsCount() const override { return 1; }
+    };
+    // Handler for stereo sources when the output is mono: the reader delivers 2 channels, they are downmixed while appending.
+    // Everything else is inherited from the mono handler.
+    struct HandlerDownmixedStereo : HandlerMono
+    {
+        void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final
+        {
+            rdi.appendDownmixedStereo( rsi, countFloats );
+        }
+        uint32_t readerChannelsCount() const override final
+        {
+            return 2;
+        }
+    };
+    // Handler for stereo sources when the output is stereo: both mono and stereo vectors of the buffer are maintained
+    struct HandlerStereo : iSampleHandler
+    {
+        void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final
+        {
+            rdi.appendStereo( rsi, countFloats );
+        }
+        // Complete chunk, FFT_STEP samples in both outputs
+        void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final
+        {
+            copyMono( pMono, rsi, sourceOffset, FFT_STEP );
+            copyStereo( pStereo, rsi, sourceOffset, FFT_STEP );
+        }
+        // Incomplete chunk, both outputs are padded with zeros
+        void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final
+        {
+            copyMono( pMono, rsi, sourceOffset, samples );
+            copyStereo( pStereo, rsi, sourceOffset, samples );
+        }
+        // Discard the first `amount` samples from both vectors, keep the tails at the start
+        void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final
+        {
+            const size_t len = rdi.mono.size();
+            assert( amount <= len );
+            if( amount < len )
+            {
+                const size_t block = len - amount;
+                memmove( rdi.mono.data(), rdi.mono.data() + amount, block * 4 );
+                rdi.mono.resize( block );
+                // The stereo vector has 2 floats per sample, and it's the one which needs the 2x length.
+                // Resizing the mono vector here would break the "stereo is twice as long as mono" invariant.
+                memmove( rdi.stereo.data(), rdi.stereo.data() + amount * 2, block * 8 );
+                rdi.stereo.resize( block * 2 );
+            }
+            else
+            {
+                rdi.mono.clear();
+                rdi.stereo.clear();
+            }
+        }
+        uint32_t readerChannelsCount() const override final { return 2; }
+    };
+ static const HandlerMono s_mono; // Selected by the PcmReader constructor when the source has less than 2 channels
+ static const HandlerDownmixedStereo s_downmix; // Selected for multi-channel sources when stereo output was not requested
+ static const HandlerStereo s_stereo; // Selected for multi-channel sources when stereo output was requested
+}
+
+// Configure the source reader to decode the first audio stream into FP32 PCM, select the sample handler,
+// and compute the expected count of chunks from the duration of the media.
+// Throws E_POINTER when the reader is null; the remaining failures are reported through check().
+PcmReader::PcmReader( IMFSourceReader* reader, bool stereo )
+{
+    if( nullptr == reader )
+        throw E_POINTER;
+    this->reader = reader;
+
+    // Only the first audio stream is of interest
+    check( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) );
+    check( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) );
+
+    // Count of channels in the source decides which handler to use
+    CComPtr<IMFMediaType> nativeType;
+    check( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &nativeType ) );
+    UINT32 numChannels;
+    check( nativeType->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &numChannels ) );
+
+    const bool sourceMono = numChannels < 2;
+    if( !sourceMono && stereo )
+    {
+        sampleHandler = &s_stereo;
+        m_stereoOutput = true;
+    }
+    else if( sourceMono )
+        sampleHandler = &s_mono;
+    else
+        sampleHandler = &s_downmix;
+
+    // Ask the reader for mono PCM for mono sources, stereo PCM otherwise
+    CComPtr<IMFMediaType> outputType;
+    check( createMediaType( !sourceMono, &outputType ) );
+    check( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, outputType ) );
+
+    // Find out the length
+    int64_t durationTicks;
+    check( getStreamDuration( reader, durationTicks ) );
+
+    // Convert the length to chunks, rounding down:
+    // Seconds = Ticks / 10^7
+    // Samples = Seconds * SAMPLE_RATE = Ticks * SAMPLE_RATE / 10^7
+    // Chunks = Samples / FFT_STEP = Ticks * SAMPLE_RATE / ( FFT_STEP * 10^7 )
+    constexpr __int64 numerator = SAMPLE_RATE;
+    constexpr __int64 denominator = (__int64)FFT_STEP * 10'000'000;
+    const __int64 chunks = MFllMulDiv( durationTicks, numerator, denominator, 0 );
+    m_length = (size_t)chunks;
+}
+
+// Read the next media sample from the reader, and append the PCM data to the buffer.
+// Before reading, the unconsumed samples are moved to the start of the buffer, and the read offset is reset to 0.
+// Returns E_EOF when the reader reports the end of the stream.
+HRESULT PcmReader::readNextSample()
+{
+    // Compact the buffer: drop the consumed portion, keep the tail
+    const size_t consumed = bufferReadOffset;
+    if( pcm.mono.size() == consumed )
+        pcm.clear();
+    else if( 0 != consumed )
+        sampleHandler->moveBufferData( pcm, consumed );
+    bufferReadOffset = 0;
+
+    for( ;; )
+    {
+        DWORD flags = 0;
+        CComPtr<IMFSample> sample;
+
+        // Read the next sample
+        HRESULT hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &flags, nullptr, &sample );
+        if( FAILED( hr ) )
+        {
+            logErrorHr( hr, u8"IMFSourceReader.ReadSample" );
+            return hr;
+        }
+
+        if( 0 != ( flags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED ) )
+        {
+            // This happens for some video files at the very start of the reading, with Dolby AC3 audio track.
+            // Instead of failing the transcribe process, verify the important attributes (FP32 samples, sample rate, count of channels) haven’t changed.
+            CHECK( validateCurrentMediaType( reader, sampleHandler->readerChannelsCount() ) );
+        }
+
+        if( 0 != ( flags & MF_SOURCE_READERF_ENDOFSTREAM ) )
+            return E_EOF;
+
+        // The reader succeeded but delivered nothing, try again
+        if( !sample )
+            continue;
+
+        // Get a pointer to the audio data in the sample.
+        CComPtr<IMFMediaBuffer> buffer;
+        hr = sample->ConvertToContiguousBuffer( &buffer );
+        if( FAILED( hr ) )
+            return hr;
+
+        const float* pcmData = nullptr;
+        DWORD cbData;
+        hr = buffer->Lock( (BYTE**)&pcmData, nullptr, &cbData );
+        if( FAILED( hr ) )
+            return hr;
+
+        try
+        {
+            assert( 0 == ( cbData % sizeof( float ) ) );
+            sampleHandler->appendPcm( pcm, pcmData, cbData / sizeof( float ) );
+        }
+        catch( const std::bad_alloc& )
+        {
+            // Don't leave the buffer locked when running out of memory
+            buffer->Unlock();
+            return E_OUTOFMEMORY;
+        }
+
+        // Unlock the buffer
+        hr = buffer->Unlock();
+        return FAILED( hr ) ? hr : S_OK;
+    }
+}
+
+// Load the next chunk of FFT_STEP samples from the stream.
+// mono receives the single-channel PCM; stereo is passed to the sample handler, which only writes it when this object outputs stereo.
+// The last chunk of the stream may be incomplete, in which case the output is padded with zeros.
+// Returns S_OK when a chunk was produced, E_EOF when the stream has ended and the buffer is empty, or the error code from the reader.
+HRESULT PcmReader::readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo )
+{
+    while( true )
+    {
+        const size_t off = bufferReadOffset;
+        const size_t availableSamples = pcm.mono.size() - off;
+        if( availableSamples >= FFT_STEP )
+        {
+            // We have enough data in the buffer
+            sampleHandler->copyChunk( &mono, pcm, off, stereo );
+            bufferReadOffset = off + FFT_STEP;
+            return S_OK;
+        }
+
+        if( !m_readerEndOfFile )
+        {
+            // We don't have enough data, but the stream has not ended yet, can load more samples from the reader
+            HRESULT hr = readNextSample();
+            if( FAILED( hr ) )
+            {
+                if( hr != E_EOF )
+                    return hr;
+                m_readerEndOfFile = true;
+            }
+            // readNextSample() moves the unconsumed samples to the start of the buffer and resets bufferReadOffset,
+            // even when it returns E_EOF. The `off` captured above is stale now, so restart the iteration
+            // to reload the offset instead of using the old one for the final chunk.
+            continue;
+        }
+
+        if( availableSamples > 0 )
+        {
+            // We have reached the end of stream of the reader, but the buffer still has a few samples.
+            // Return the final incomplete chunk padded with zeros
+            sampleHandler->copyChunk( &mono, pcm, off, availableSamples, stereo );
+            bufferReadOffset = off + availableSamples;
+            return S_OK;
+        }
+
+        return E_EOF;
+    }
+} \ No newline at end of file
diff --git a/Whisper/MF/PcmReader.h b/Whisper/MF/PcmReader.h
new file mode 100644
index 0000000..9e3757e
--- /dev/null
+++ b/Whisper/MF/PcmReader.h
@@ -0,0 +1,63 @@
+#pragma once
+#include "../Whisper/audioConstants.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "AudioBuffer.h"
+
+namespace Whisper
+{
+ // Fixed-size PCM buffer with FFT_STEP single-channel float samples; the original note says this is 10 milliseconds of audio
+ struct PcmMonoChunk
+ {
+ std::array<float, FFT_STEP> mono;
+ };
+ // Fixed-size PCM buffer with FFT_STEP interleaved stereo samples, i.e. FFT_STEP * 2 floats; same duration as the mono chunk
+ struct PcmStereoChunk
+ {
+ std::array<float, FFT_STEP * 2> stereo;
+ };
+
+ __interface iSampleHandler; // Forward declaration only; PcmReader holds a pointer to it
+
+ constexpr HRESULT E_EOF = HRESULT_FROM_WIN32( ERROR_HANDLE_EOF ); // Status code returned by PcmReader::readChunk once the stream has ended
+
+ // Utility class which reads chunks of FFT_STEP FP32 PCM samples from the Media Foundation source reader.
+ // The class always delivers mono chunks, and can optionally deliver stereo in a separate buffer.
+ class PcmReader
+ {
+ // A small intermediate buffer with the PCM data of the media samples received from the reader
+ AudioBuffer pcm;
+ // Index of the first unconsumed sample in the pcm buffer
+ size_t bufferReadOffset = 0;
+ // Utility object which abstracts away the differences between mono, downmixed and stereo processing
+ const iSampleHandler* sampleHandler; // No default initializer; declared here, set elsewhere
+ // The underlying MF source reader which delivers audio data
+ CComPtr<IMFSourceReader> reader;
+ // True after we consumed all available media samples from the reader
+ bool m_readerEndOfFile = false;
+ // True if this object delivers stereo samples
+ bool m_stereoOutput = false;
+ // The count of chunks we expect to get from the reader
+ size_t m_length = 0;
+ // Read the next sample from the reader, and append its PCM data to the buffer in this class
+ HRESULT readNextSample();
+
+ public:
+
+ PcmReader( IMFSourceReader* source, bool stereo ); // stereo requests stereo output in addition to the mono one
+
+ // Count of chunks in the MEL spectrogram.
+ // The PCM audio is generally slightly longer than that, due to the incomplete last chunk.
+ size_t getLength() const noexcept
+ {
+ return m_length;
+ }
+
+ // True when the stereo flag was passed to the constructor, and the audio stream actually has 2 or more audio channels
+ bool outputsStereo() const { return m_stereoOutput; }
+
+ // Load another chunk of FFT_STEP samples from the stream.
+ // For the last chunk in the stream, the output buffers are padded with zeros
+ HRESULT readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo );
+ };
+} \ No newline at end of file
diff --git a/Whisper/MF/loadAudioFile.cpp b/Whisper/MF/loadAudioFile.cpp
new file mode 100644
index 0000000..d1a439a
--- /dev/null
+++ b/Whisper/MF/loadAudioFile.cpp
@@ -0,0 +1,151 @@
+#include "stdafx.h"
+#include "../ComLightLib/comLightServer.h"
+#include "loadAudioFile.h"
+#include "mfUtils.h"
+#include "AudioBuffer.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include <mfapi.h>
+#pragma comment(lib, "Mfreadwrite.lib")
+#pragma comment(lib, "mfuuid.lib")
+
+namespace Whisper
+{
+    // Implementation of iAudioBuffer which keeps the decoded PCM of a complete audio file in memory
+    class MediaFileBuffer : public ComLight::ObjectRoot<iAudioBuffer>
+    {
+        AudioBuffer pcm;
+        // Count of output channels, set by load()
+        uint32_t channels = 0;
+
+        // Length of the mono track, in samples
+        uint32_t COMLIGHTCALL countSamples() const noexcept override final
+        {
+            const size_t length = pcm.mono.size();
+            return (uint32_t)length;
+        }
+        // Pointer to the mono samples, nullptr when there are none
+        const float* COMLIGHTCALL getPcmMono() const noexcept override final
+        {
+            return pcm.mono.empty() ? nullptr : pcm.mono.data();
+        }
+        // Pointer to the interleaved stereo samples, nullptr when there are none
+        const float* COMLIGHTCALL getPcmStereo() const noexcept override final
+        {
+            return pcm.stereo.empty() ? nullptr : pcm.stereo.data();
+        }
+        // This implementation always reports 0
+        HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const noexcept override final
+        {
+            rdi = 0;
+            return S_OK;
+        }
+    public:
+        HRESULT load( LPCTSTR path, bool stereo );
+    };
+
+    // Decode the complete first audio stream of the file into the PCM buffer of this object.
+    // The reader is asked for FP32 PCM; when stereo is true and the source is not mono, the stereo data is kept in addition to the mono track.
+    // Returns E_UNEXPECTED when the media type changes while reading, E_INVALIDARG when the file has no samples,
+    // E_OUTOFMEMORY when the buffer can't grow, or the error code of the failed Media Foundation call.
+    HRESULT MediaFileBuffer::load( LPCTSTR path, bool stereo )
+    {
+        CComPtr<IMFSourceReader> reader;
+        HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader );
+        if( FAILED( hr ) )
+        {
+            logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" );
+            return hr;
+        }
+
+        // Only the first audio stream is of interest
+        CHECK( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) );
+        CHECK( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) );
+
+        // Count of channels in the source decides how the incoming samples are appended
+        CComPtr<IMFMediaType> mtNative;
+        CHECK( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &mtNative ) );
+        UINT32 numChannels;
+        CHECK( mtNative->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &numChannels ) );
+        const bool sourceMono = numChannels == 1;
+        const AudioBuffer::pfnAppendSamples pfn = AudioBuffer::appendSamplesFunc( sourceMono, stereo );
+        channels = ( stereo && !sourceMono ) ? 2 : 1;
+
+        // Ask the reader for mono PCM for mono sources, stereo PCM otherwise
+        CComPtr<IMFMediaType> mt;
+        CHECK( createMediaType( !sourceMono, &mt ) );
+
+        CHECK( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, mt ) );
+
+        while( true )
+        {
+            DWORD dwFlags = 0;
+            CComPtr<IMFSample> sample;
+
+            // Read the next sample.
+            hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &dwFlags, nullptr, &sample );
+            if( FAILED( hr ) )
+            {
+                logErrorHr( hr, u8"IMFSourceReader.ReadSample" );
+                return hr;
+            }
+
+            if( dwFlags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED )
+            {
+                logError( u8"Media type changes ain’t supported by the library." );
+                return E_UNEXPECTED;
+            }
+
+            if( dwFlags & MF_SOURCE_READERF_ENDOFSTREAM )
+                break;
+
+            // The reader succeeded but delivered nothing, try again
+            if( !sample )
+                continue;
+
+            // Get a pointer to the audio data in the sample.
+            CComPtr<IMFMediaBuffer> buffer;
+            hr = sample->ConvertToContiguousBuffer( &buffer );
+            if( FAILED( hr ) )
+                return hr;
+
+            const float* pAudioData = nullptr;
+            DWORD cbBuffer;
+            hr = buffer->Lock( (BYTE**)&pAudioData, nullptr, &cbBuffer );
+            if( FAILED( hr ) )
+                return hr;
+
+            try
+            {
+                const size_t countFloats = cbBuffer / sizeof( float );
+                ( pcm.*pfn )( pAudioData, countFloats );
+            }
+            catch( const std::bad_alloc& )
+            {
+                // Every successful Lock() needs a matching Unlock(), including on this error path
+                buffer->Unlock();
+                return E_OUTOFMEMORY;
+            }
+
+            // Unlock the buffer
+            hr = buffer->Unlock();
+            if( FAILED( hr ) )
+                return hr;
+        }
+
+        const size_t len = pcm.mono.size();
+        if( len == 0 )
+        {
+            logError16( L"The audio file \"%s\" has no samples", path );
+            return E_INVALIDARG;
+        }
+        if( len < SAMPLE_RATE / 2 )
+            logError16( L"The file \"%s\" only has %zu samples, less than 0.5 seconds of audio", path, len );
+        else
+            logDebug16( L"Loaded audio file from \"%s\": %zu samples, %g seconds", path, len, (int)len * ( 1.0 / SAMPLE_RATE ) );
+        return S_OK;
+    }
+}
+
+// Decode the complete audio file into a new buffer object, and return it in *pp.
+// Fails with E_POINTER when either the path or the output pointer is null.
+HRESULT COMLIGHTCALL Whisper::loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp )
+{
+    if( !path || !pp )
+        return E_POINTER;
+
+    using BufferObject = ComLight::Object<MediaFileBuffer>;
+    ComLight::CComPtr<BufferObject> buffer;
+    CHECK( BufferObject::create( buffer ) );
+    CHECK( buffer->load( path, stereo ) );
+
+    // Transfer the ownership to the caller
+    buffer.detach( pp );
+    return S_OK;
+} \ No newline at end of file
diff --git a/Whisper/MF/loadAudioFile.h b/Whisper/MF/loadAudioFile.h
new file mode 100644
index 0000000..9736ccd
--- /dev/null
+++ b/Whisper/MF/loadAudioFile.h
@@ -0,0 +1,7 @@
+#pragma once
+#include "../API/iMediaFoundation.cl.h"
+
+namespace Whisper
+{
+ HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ); // Decode the complete audio file into memory; E_POINTER when path or pp is null
+} \ No newline at end of file
diff --git a/Whisper/MF/mfStartup.cpp b/Whisper/MF/mfStartup.cpp
new file mode 100644
index 0000000..b7ab829
--- /dev/null
+++ b/Whisper/MF/mfStartup.cpp
@@ -0,0 +1,128 @@
+#include "stdafx.h"
+#include "mfStartup.h"
+#include <atlbase.h>
+#include <mfapi.h>
+#pragma comment(lib, "Mfplat.lib")
+
+namespace
+{
+    // Per-thread status of the COM initialization performed by this module
+    struct sCoInitStatus
+    {
+        // Possible state:
+        // -1 is the initial state, CoInitializeEx is not currently held by this module on the thread
+        // S_OK - CoInitializeEx succeeded, in this state the counter tracks the count of coInitialize() references for the current thread
+        // S_FALSE - CoInitializeEx failed with RPC_E_CHANGED_MODE, or did nothing because already initialized for the current thread
+        // Error status - CoInitializeEx failed for some other reason
+        HRESULT code = -1;
+        uint32_t counter = 0;
+    };
+    thread_local sCoInitStatus coInitStatus;
+
+    // Initialize COM for the current thread.
+    // Returns S_OK when the caller has acquired a reference which must be released with coUninitialize(),
+    // S_FALSE when COM is usable but this module doesn't own the initialization, or the error code from CoInitializeEx.
+    static HRESULT coInitialize()
+    {
+        sCoInitStatus& cis = coInitStatus;
+        HRESULT hr = cis.code;
+        if( S_OK == hr )
+        {
+            // Already initialized by this module on this thread: take another reference.
+            // S_OK tells the caller to release it, otherwise the counter would never get back to 0.
+            cis.counter++;
+            return S_OK;
+        }
+        if( SUCCEEDED( hr ) )
+            return S_FALSE;
+
+        if( hr == HRESULT( -1 ) )
+        {
+            hr = CoInitializeEx( nullptr, COINIT_MULTITHREADED );
+            if( S_OK == hr )
+            {
+                cis.counter = 1;
+                return cis.code = S_OK;
+            }
+            if( S_FALSE == hr || RPC_E_CHANGED_MODE == hr )
+            {
+                return cis.code = S_FALSE;
+            }
+            cis.code = hr;
+            return hr;
+        }
+
+        // Sticky error from a previous attempt
+        return hr;
+    }
+
+    // Release one reference acquired by coInitialize() returning S_OK; the last one uninitializes COM for the current thread
+    static void coUninitialize()
+    {
+        sCoInitStatus& cis = coInitStatus;
+        if( cis.code == S_OK )
+        {
+            assert( cis.counter > 0 );
+            cis.counter--;
+            if( 0 == cis.counter )
+            {
+                CoUninitialize();
+                // Back to the initial state, so the next coInitialize() on this thread calls CoInitializeEx again
+                cis.code = HRESULT( -1 );
+            }
+        }
+    }
+
+    // Guards the process-wide MFStartup / MFShutdown counter below
+    static CComAutoCriticalSection s_lock;
+#define LOCK() CComCritSecLock<CComAutoCriticalSection> lock{ s_lock }
+    static uint32_t mfStartupCounter = 0;
+
+    // Bits of MfStartupRaii::successFlags
+    constexpr uint8_t FlagCOM = 1;
+    constexpr uint8_t FlagMF = 0x10;
+}
+
+using namespace Whisper;
+
+// Undo whatever startup() has done for this object:
+// release the reference on the process-wide Media Foundation startup counter, shutting MF down when it was the last one,
+// then release the COM initialization of the current thread if this object owns it.
+MfStartupRaii::~MfStartupRaii()
+{
+    if( 0 != ( successFlags & FlagMF ) )
+    {
+        // The lock is scoped to this block, it's released before touching COM
+        LOCK();
+        assert( mfStartupCounter > 0 );
+        mfStartupCounter--;
+        if( 0 == mfStartupCounter )
+            MFShutdown();
+        successFlags &= ~FlagMF;
+    }
+
+    // This must run even when other objects still keep Media Foundation alive:
+    // the COM reference belongs to this object, returning early would leak it.
+    if( 0 != ( successFlags & FlagCOM ) )
+    {
+        coUninitialize();
+        successFlags &= ~FlagCOM;
+    }
+}
+
+// Initialize COM for the current thread, and start Media Foundation unless it's already running.
+// Returns S_OK when this call has started MF, S_FALSE when MF was already started by another object,
+// ERROR_ALREADY_INITIALIZED converted to HRESULT when called twice for the same object, or an error code.
+HRESULT MfStartupRaii::startup()
+{
+    if( 0 != ( successFlags & FlagMF ) )
+        return HRESULT_FROM_WIN32( ERROR_ALREADY_INITIALIZED );
+
+    // COM first; remember whether this object owns the initialization
+    const HRESULT hrCom = coInitialize();
+    CHECK( hrCom );
+    if( S_OK == hrCom )
+        successFlags |= FlagCOM;
+
+    LOCK();
+
+    if( 0 != mfStartupCounter )
+    {
+        // Media Foundation is already running, only add a reference
+        mfStartupCounter++;
+        successFlags |= FlagMF;
+        return S_FALSE;
+    }
+
+    const HRESULT hrMf = MFStartup( MF_VERSION, MFSTARTUP_LITE );
+    if( FAILED( hrMf ) )
+    {
+        // Roll back the COM initialization made above
+        if( 0 != ( successFlags & FlagCOM ) )
+        {
+            coUninitialize();
+            successFlags &= ~FlagCOM;
+        }
+        return hrMf;
+    }
+
+    mfStartupCounter = 1;
+    successFlags |= FlagMF;
+    return S_OK;
+} \ No newline at end of file
diff --git a/Whisper/MF/mfStartup.h b/Whisper/MF/mfStartup.h
new file mode 100644
index 0000000..1434ffc
--- /dev/null
+++ b/Whisper/MF/mfStartup.h
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace Whisper
+{
+ class MfStartupRaii
+ {
+ uint8_t successFlags = 0; // Bitmask of the initialization steps this object needs to undo in the destructor
+ public:
+ MfStartupRaii() = default; // Does nothing, the actual initialization happens in startup()
+ ~MfStartupRaii(); // Undoes the initialization performed by startup()
+ MfStartupRaii( const MfStartupRaii& ) = delete; // NOTE(review): copy assignment is not deleted here, consider deleting it as well
+
+ HRESULT startup(); // Initializes COM and Media Foundation; defined in mfStartup.cpp
+ };
+} \ No newline at end of file
diff --git a/Whisper/MF/mfUtils.cpp b/Whisper/MF/mfUtils.cpp
new file mode 100644
index 0000000..e739079
--- /dev/null
+++ b/Whisper/MF/mfUtils.cpp
@@ -0,0 +1,69 @@
+#include "stdafx.h"
+#include "mfUtils.h"
+#include <mfapi.h>
+
+// Create a media type describing uncompressed FP32 PCM audio at SAMPLE_RATE, with 1 or 2 channels.
+// The new object is returned in *pp; fails with E_POINTER when pp is null.
+HRESULT Whisper::createMediaType( bool stereo, IMFMediaType** pp )
+{
+    if( !pp )
+        return E_POINTER;
+
+    CComPtr<IMFMediaType> mediaType;
+    CHECK( MFCreateMediaType( &mediaType ) );
+    CHECK( mediaType->SetGUID( MF_MT_MAJOR_TYPE, MFMediaType_Audio ) );
+    CHECK( mediaType->SetGUID( MF_MT_SUBTYPE, MFAudioFormat_Float ) );
+    CHECK( mediaType->SetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, SAMPLE_RATE ) );
+
+    // 4 bytes per float, one float per channel in every block
+    const uint32_t channels = stereo ? 2 : 1;
+    const uint32_t cbBlock = channels * 4;
+    CHECK( mediaType->SetUINT32( MF_MT_AUDIO_NUM_CHANNELS, channels ) );
+    CHECK( mediaType->SetUINT32( MF_MT_AUDIO_BLOCK_ALIGNMENT, cbBlock ) );
+    CHECK( mediaType->SetUINT32( MF_MT_AUDIO_AVG_BYTES_PER_SECOND, cbBlock * SAMPLE_RATE ) );
+    CHECK( mediaType->SetUINT32( MF_MT_AUDIO_BITS_PER_SAMPLE, 32 ) );
+    CHECK( mediaType->SetUINT32( MF_MT_ALL_SAMPLES_INDEPENDENT, TRUE ) );
+
+    // Transfer the ownership to the caller
+    *pp = mediaType.Detach();
+    return S_OK;
+}
+
+// Query the duration of the media source behind the reader, from the MF_PD_DURATION presentation attribute.
+// On success the value of the attribute is written to `duration`.
+// Returns E_INVALIDARG when the attribute has an unexpected type, or the error code from the reader.
+HRESULT Whisper::getStreamDuration( IMFSourceReader* reader, int64_t& duration )
+{
+    PROPVARIANT var;
+    PropVariantInit( &var );
+    CHECK( reader->GetPresentationAttribute( MF_SOURCE_READER_MEDIASOURCE, MF_PD_DURATION, &var ) );
+
+    if( var.vt == VT_UI8 )
+    {
+        // The documentation says the type of that attribute is UINT64
+        // https://learn.microsoft.com/en-us/windows/win32/medfound/mf-pd-duration-attribute
+        // A VT_UI8 value owns no memory, there's nothing to clear
+        duration = var.uhVal.QuadPart;
+        return S_OK;
+    }
+
+    // An unexpected type may own memory, e.g. a string or a blob; release it before failing
+    PropVariantClear( &var );
+    logError( u8"Unexpected type of MF_PD_DURATION attribute" );
+    return E_INVALIDARG;
+}
+
+// Verify the current media type of the first audio stream is still FP32 PCM audio at SAMPLE_RATE with the expected count of channels.
+// Returns S_OK when everything matches, E_FAIL on the first mismatch, or the error code of the failed query.
+HRESULT Whisper::validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels )
+{
+    CComPtr<IMFMediaType> current;
+    CHECK( reader->GetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, &current ) );
+
+    // Major type and subtype
+    GUID majorType;
+    CHECK( current->GetGUID( MF_MT_MAJOR_TYPE, &majorType ) );
+    if( majorType != MFMediaType_Audio )
+        return E_FAIL;
+
+    GUID subType;
+    CHECK( current->GetGUID( MF_MT_SUBTYPE, &subType ) );
+    if( subType != MFAudioFormat_Float )
+        return E_FAIL;
+
+    // Sample rate and count of channels
+    UINT32 sampleRate;
+    CHECK( current->GetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, &sampleRate ) );
+    if( sampleRate != SAMPLE_RATE )
+        return E_FAIL;
+
+    UINT32 channels;
+    CHECK( current->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &channels ) );
+    return ( channels == expectedChannels ) ? S_OK : E_FAIL;
+} \ No newline at end of file
diff --git a/Whisper/MF/mfUtils.h b/Whisper/MF/mfUtils.h
new file mode 100644
index 0000000..c889a92
--- /dev/null
+++ b/Whisper/MF/mfUtils.h
@@ -0,0 +1,15 @@
+#pragma once
+#include <stdint.h>
+#include <mfidl.h>
+#include <mfobjects.h>
+#include <mfreadwrite.h>
+#include "../Whisper/audioConstants.h"
+
+namespace Whisper
+{
+ HRESULT createMediaType( bool stereo, IMFMediaType** pp ); // Create a media type for FP32 PCM audio at SAMPLE_RATE, with 2 channels when stereo is true, otherwise 1
+
+ HRESULT getStreamDuration( IMFSourceReader* reader, int64_t& duration ); // Read the MF_PD_DURATION attribute of the media source
+
+ HRESULT validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels ); // E_FAIL unless the current type is FP32 audio at SAMPLE_RATE with the expected count of channels
+} \ No newline at end of file