diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/MF | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'Whisper/MF')
| -rw-r--r-- | Whisper/MF/AudioBuffer.cpp | 93 | ||||
| -rw-r--r-- | Whisper/MF/AudioBuffer.h | 41 | ||||
| -rw-r--r-- | Whisper/MF/AudioCapture.cpp | 167 | ||||
| -rw-r--r-- | Whisper/MF/AudioCapture.h | 12 | ||||
| -rw-r--r-- | Whisper/MF/MediaFoundation.cpp | 109 | ||||
| -rw-r--r-- | Whisper/MF/PcmReader.cpp | 274 | ||||
| -rw-r--r-- | Whisper/MF/PcmReader.h | 63 | ||||
| -rw-r--r-- | Whisper/MF/loadAudioFile.cpp | 151 | ||||
| -rw-r--r-- | Whisper/MF/loadAudioFile.h | 7 | ||||
| -rw-r--r-- | Whisper/MF/mfStartup.cpp | 128 | ||||
| -rw-r--r-- | Whisper/MF/mfStartup.h | 15 | ||||
| -rw-r--r-- | Whisper/MF/mfUtils.cpp | 69 | ||||
| -rw-r--r-- | Whisper/MF/mfUtils.h | 15 |
13 files changed, 1144 insertions, 0 deletions
diff --git a/Whisper/MF/AudioBuffer.cpp b/Whisper/MF/AudioBuffer.cpp new file mode 100644 index 0000000..ba4752d --- /dev/null +++ b/Whisper/MF/AudioBuffer.cpp @@ -0,0 +1,93 @@ +#include "stdafx.h" +#include "AudioBuffer.h" +using namespace Whisper; + +void AudioBuffer::appendMono( const float* rsi, size_t countFloats ) +{ + mono.insert( mono.end(), rsi, rsi + countFloats ); +} + +void AudioBuffer::appendStereo( const float* rsi, size_t countFloats ) +{ + assert( 0 == ( countFloats % 2 ) ); + const size_t countSamples = countFloats / 2; + + const size_t oldLength = mono.size(); + assert( oldLength * 2 == stereo.size() ); + mono.resize( oldLength + countSamples ); + stereo.resize( ( oldLength + countSamples ) * 2 ); + + const float* const rsiEnd = rsi + countSamples * 2; + const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8; + + float* rdiStereo = &stereo[ oldLength * 2 ]; + float* rdiMono = &mono[ oldLength ]; + + const __m128 half = _mm_set1_ps( 0.5f ); + for( ; rsi < rsiEndAligned; rsi += 8, rdiStereo += 8, rdiMono += 4 ) + { + // Load 4 samples = 8 floats + __m128 v0 = _mm_loadu_ps( rsi ); // L0, R0, L1, R1 + __m128 v1 = _mm_loadu_ps( rsi + 4 );// L2, R2, L3, R3 + + // Store into the stereo PCM vector + _mm_storeu_ps( rdiStereo, v0 ); + _mm_storeu_ps( rdiStereo + 4, v1 ); + + // Compute and store the average of these channels + __m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); + __m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); + __m128 sum = _mm_add_ps( left, right ); + sum = _mm_mul_ps( sum, half ); + _mm_storeu_ps( rdiMono, sum ); + } + +#pragma loop (no_vector) + for( ; rsi < rsiEnd; rsi += 2, rdiStereo += 2, rdiMono++ ) + { + __m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) ); + _mm_store_sd( (double*)rdiStereo, _mm_castps_pd( vec ) ); + + vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) ); + vec = _mm_mul_ss( vec, half ); + _mm_store_ss( rdiMono, vec ); + } +} + +void 
AudioBuffer::appendDownmixedStereo( const float* rsi, size_t countFloats ) +{ + assert( 0 == ( countFloats % 2 ) ); + const size_t countSamples = countFloats / 2; + + const size_t oldLength = mono.size(); + mono.resize( oldLength + countSamples ); + + const float* const rsiEnd = rsi + countSamples * 2; + const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8; + + float* rdiMono = &mono[ oldLength ]; + + const __m128 half = _mm_set1_ps( 0.5f ); + for( ; rsi < rsiEndAligned; rsi += 8, rdiMono += 4 ) + { + // Load 4 samples = 8 floats + __m128 v0 = _mm_loadu_ps( rsi ); // L0, R0, L1, R1 + __m128 v1 = _mm_loadu_ps( rsi + 4 );// L2, R2, L3, R3 + + // Compute and store the average of these channels + __m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); + __m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); + __m128 sum = _mm_add_ps( left, right ); + sum = _mm_mul_ps( sum, half ); + _mm_storeu_ps( rdiMono, sum ); + } + +#pragma loop (no_vector) + for( ; rsi < rsiEnd; rsi += 2, rdiMono++ ) + { + __m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) ); + vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) ); + vec = _mm_mul_ss( vec, half ); + _mm_store_ss( rdiMono, vec ); + } +}
// Whisper/MF/AudioBuffer.h
#pragma once
#include <cassert>
#include <cstddef>
#include <vector>

namespace Whisper
{
	// A pair of growable FP32 PCM vectors.
	// `mono` holds one float per sample. `stereo` is either empty, or holds 2 interleaved floats per sample.
	struct AudioBuffer
	{
		std::vector<float> mono;
		std::vector<float> stereo;

		// Append single-channel samples to the mono vector
		void appendMono( const float* rsi, size_t countFloats );
		// Append the average of the two interleaved channels to the mono vector; the stereo vector is not touched
		void appendDownmixedStereo( const float* rsi, size_t countFloats );
		// Append interleaved stereo samples to the stereo vector, and their average to the mono vector
		void appendStereo( const float* rsi, size_t countFloats );

		// Pointer to one of the append* methods above
		using pfnAppendSamples = void( AudioBuffer::* )( const float* rsi, size_t countFloats );

		// Select the append method for the source channel layout and the requested output.
		// sourceMono: the decoder delivers 1 channel; wantStereo: the caller asked to keep stereo data.
		inline static pfnAppendSamples appendSamplesFunc( bool sourceMono, bool wantStereo )
		{
			if( sourceMono )
				return &AudioBuffer::appendMono;
			if( wantStereo )
				return &AudioBuffer::appendStereo;
			return &AudioBuffer::appendDownmixedStereo;
		}

		// Drop all samples from both vectors
		void clear()
		{
			mono.clear();
			stereo.clear();
		}

		// Truncate the buffer to `len` samples; the method is not designed to grow the buffer.
		// When the stereo vector is in use, it is truncated to the matching `len * 2` floats.
		void resize( size_t len )
		{
			assert( len <= mono.size() );
			mono.resize( len );
			if( !stereo.empty() )
				stereo.resize( len * 2 );
		}
	};
}
\ No newline at end of file diff --git a/Whisper/MF/AudioCapture.cpp b/Whisper/MF/AudioCapture.cpp new file mode 100644 index 0000000..17f34dc --- /dev/null +++ b/Whisper/MF/AudioCapture.cpp @@ -0,0 +1,167 @@ +#include "stdafx.h" +#include <atlstr.h> +#include <mfapi.h> +#include <mfidl.h> +#include <mfreadwrite.h> +#include "AudioCapture.h" +#include "../API/iMediaFoundation.cl.h" +#include "../ComLightLib/comLightServer.h" +#pragma comment(lib, "Mf.lib") + +namespace +{ + struct Strings + { + CString displayName, endpoint; + }; + + HRESULT getAllocString( IMFActivate* activate, const GUID& id, CString& rdi ) + { + wchar_t* pointer = nullptr; + UINT32 cchName; + HRESULT hr = activate->GetAllocatedString( id, &pointer, &cchName ); + if( SUCCEEDED( hr ) ) + rdi.SetString( pointer, cchName ); + CoTaskMemFree( pointer ); + return hr; + } + + HRESULT getInfo( IMFActivate* activate, Strings& rdi ) + { + CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME, rdi.displayName ) ); + CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, rdi.endpoint ) ); + return S_OK; + } + + HRESULT __stdcall supplyDevices( Whisper::pfnFoundCaptureDevices pfn, void* pv, IMFActivate** ppDevices, UINT32 count ) + { + if( ppDevices == nullptr || count == 0 ) + return pfn( 0, nullptr, pv ); + + std::vector<Strings> strings; + strings.reserve( count ); + + for( UINT i = 0; i < count; i++ ) + { + IMFActivate* const activate = ppDevices[ i ]; + if( nullptr == activate ) + continue; + Strings info; + HRESULT hr = getInfo( activate, info ); + if( FAILED( hr ) ) + continue; + + strings.emplace_back( std::move( info ) ); + } + + const size_t len = strings.size(); + if( 0 == len ) + return pfn( 0, nullptr, pv ); + + std::vector<Whisper::sCaptureDevice> pointers; + pointers.resize( len ); + for( size_t i = 0; i < len; i++ ) + { + const auto& src = strings[ i ]; + auto& dest = pointers[ i ]; + dest.displayName = src.displayName; + dest.endpoint = 
src.endpoint; + } + return pfn( (int)len, pointers.data(), pv ); + } +} + +HRESULT __stdcall Whisper::captureDeviceList( pfnFoundCaptureDevices pfn, void* pv ) +{ + // Create an attribute store to hold the search criteria. + CComPtr<IMFAttributes> attrs; + CHECK( MFCreateAttributes( &attrs, 1 ) ); + // Request audio capture devices + CHECK( attrs->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) ); + + // Enumerate the devices + IMFActivate** ppDevices = nullptr; + UINT32 count = 0; + CHECK( MFEnumDeviceSources( attrs, &ppDevices, &count ) ); + + // Feed the data to the caller + HRESULT hr = supplyDevices( pfn, pv, ppDevices, count ); + + // Free the memory + for( DWORD i = 0; i < count; i++ ) + ppDevices[ i ]->Release(); + CoTaskMemFree( ppDevices ); + + return hr; +} + +namespace +{ + using namespace Whisper; + + class Capture : public ComLight::ObjectRoot<iAudioCapture> + { + CComPtr<IMFSourceReader> reader; + CComPtr<iMediaFoundation> mediaFoundation; + sCaptureParams captureParams; + + HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final + { + if( pp == nullptr ) + return E_POINTER; + CComPtr<IMFSourceReader> res = reader; + *pp = res.Detach();; + return S_OK; + } + const sCaptureParams& COMLIGHTCALL getParams() const noexcept override final + { + return captureParams; + } + public: + HRESULT open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp ); + }; + + HRESULT Capture::open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp ) + { + // Create an attribute store to hold the search criteria. 
+ CComPtr<IMFAttributes> attrs; + CHECK( MFCreateAttributes( &attrs, 2 ) ); + // Request audio capture devices + CHECK( attrs->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) ); + CHECK( attrs->SetString( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, endpoint ) ); + + CComPtr<IMFMediaSource> source; + HRESULT hr = MFCreateDeviceSource( attrs, &source ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"MFCreateDeviceSource" ); + return hr; + } + + // TODO: implement IMFSourceReaderCallback, pass into MF_SOURCE_READER_ASYNC_CALLBACK attribute + // This is to support cancellation + hr = MFCreateSourceReaderFromMediaSource( source, nullptr, &reader ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"MFCreateSourceReaderFromMediaSource" ); + return hr; + } + + captureParams = cp; + mediaFoundation = owner; + return S_OK; + } +} + +HRESULT __stdcall Whisper::captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept +{ + if( nullptr == endpoint || nullptr == pp ) + return E_POINTER; + + ComLight::CComPtr<ComLight::Object<Capture>> res; + CHECK( ComLight::Object<Capture>::create( res ) ); + CHECK( res->open( owner, endpoint, captureParams ) ); + + res.detach( pp ); + return S_OK; +}
\ No newline at end of file diff --git a/Whisper/MF/AudioCapture.h b/Whisper/MF/AudioCapture.h new file mode 100644 index 0000000..276ee4b --- /dev/null +++ b/Whisper/MF/AudioCapture.h @@ -0,0 +1,12 @@ +#pragma once +#include "../API/MfStructs.h" + +namespace Whisper +{ + struct iAudioCapture; + struct iMediaFoundation; + + HRESULT __stdcall captureDeviceList( pfnFoundCaptureDevices pfn, void* pv ); + + HRESULT __stdcall captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept; +}
\ No newline at end of file diff --git a/Whisper/MF/MediaFoundation.cpp b/Whisper/MF/MediaFoundation.cpp new file mode 100644 index 0000000..4a4f6a2 --- /dev/null +++ b/Whisper/MF/MediaFoundation.cpp @@ -0,0 +1,109 @@ +#include "stdafx.h" +#include "../API/iMediaFoundation.cl.h" +#include "mfStartup.h" +#include "../ComLightLib/comLightServer.h" +#include "loadAudioFile.h" +#include <mfidl.h> +#include <mfreadwrite.h> +#include "mfUtils.h" +#include "AudioCapture.h" + +namespace Whisper +{ + class AudioReader : public ComLight::ObjectRoot<iAudioReader> + { + CComPtr<IMFSourceReader> reader; + bool wantStereo; + CComPtr<iMediaFoundation> mediaFoundation; + + HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final + { + if( pp == nullptr ) + return E_POINTER; + CComPtr<IMFSourceReader> res = reader; + *pp = res.Detach();; + return S_OK; + } + HRESULT COMLIGHTCALL requestedStereo() const noexcept override final + { + return wantStereo ? S_OK : S_FALSE; + } + HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const noexcept override final + { + if( reader ) + return getStreamDuration( reader, rdi ); + return OLE_E_BLANK; + } + public: + HRESULT open( iMediaFoundation* owner, LPCTSTR path, bool stereo ) + { + HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" ); + return hr; + } + wantStereo = stereo; + mediaFoundation = owner; + logDebug16( L"Created source reader from the file \"%s\"", path ); + return S_OK; + } + }; + + class MediaFoundation : public ComLight::ObjectRoot<iMediaFoundation> + { + MfStartupRaii raii; + DWORD tid = ~(DWORD)0; + + virtual HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const noexcept override final + { + return Whisper::loadAudioFile( path, stereo, pp ); + } + virtual HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) noexcept override final + { + 
if( nullptr == path || nullptr == pp ) + return E_POINTER; + + ComLight::CComPtr<ComLight::Object<AudioReader>> res; + CHECK( ComLight::Object<AudioReader>::create( res ) ); + CHECK( res->open( this, path, stereo ) ); + + res.detach( pp ); + return S_OK; + } + HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) noexcept override final + { + return captureDeviceList( pfn, pv ); + } + HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept override final + { + return captureOpen( this, endpoint, captureParams, pp ); + } + protected: + + HRESULT FinalConstruct() + { + CHECK( raii.startup() ); + tid = GetCurrentThreadId(); + return S_OK; + } + + public: + + ~MediaFoundation() override + { + assert( tid == GetCurrentThreadId() ); + } + }; +} + +HRESULT COMLIGHTCALL Whisper::initMediaFoundation( iMediaFoundation** pp ) +{ + if( nullptr == pp ) + return E_POINTER; + + ComLight::CComPtr<ComLight::Object<MediaFoundation>> obj; + CHECK( ComLight::Object<MediaFoundation>::create( obj ) ); + obj.detach( pp ); + return S_OK; +}
\ No newline at end of file diff --git a/Whisper/MF/PcmReader.cpp b/Whisper/MF/PcmReader.cpp new file mode 100644 index 0000000..ab92fc3 --- /dev/null +++ b/Whisper/MF/PcmReader.cpp @@ -0,0 +1,274 @@ +#include "stdafx.h" +#include "PcmReader.h" +#include <mfapi.h> +#include "mfUtils.h" + +namespace Whisper +{ + __interface iSampleHandler + { + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const; + void moveBufferData( AudioBuffer& rdi, size_t amount ) const; + void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const; + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const; + uint32_t readerChannelsCount() const; + }; +} + +namespace +{ + using namespace Whisper; + + __forceinline void copyMono( PcmMonoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples ) + { + assert( sourceOffset + samples <= rsi.mono.size() ); + memcpy( rdi->mono.data(), &rsi.mono[ sourceOffset ], samples * 4 ); + if( samples < FFT_STEP ) + memset( rdi->mono.data() + samples, 0, ( FFT_STEP - samples ) * 4 ); + } + + __forceinline void copyStereo( PcmStereoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples ) + { + memcpy( rdi->stereo.data(), &rsi.stereo[ sourceOffset * 2 ], samples * 8 ); + if( samples < FFT_STEP ) + memset( rdi->stereo.data() + samples * 2, 0, ( FFT_STEP - samples ) * 8 ); + } + + struct HandlerMono : iSampleHandler + { + void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override + { + rdi.appendMono( rsi, countFloats ); + } + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final + { + copyMono( pMono, rsi, sourceOffset, FFT_STEP ); + } + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final 
+ { + copyMono( pMono, rsi, sourceOffset, samples ); + } + void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final + { + const size_t len = rdi.mono.size(); + assert( amount <= len ); + if( amount < len ) + { + const size_t block = len - amount; + memmove( rdi.mono.data(), rdi.mono.data() + amount, block * 4 ); + rdi.mono.resize( block ); + } + else + rdi.mono.clear(); + } + uint32_t readerChannelsCount() const override { return 1; } + }; + struct HandlerDownmixedStereo : HandlerMono + { + void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final + { + rdi.appendDownmixedStereo( rsi, countFloats ); + } + uint32_t readerChannelsCount() const override final { return 2; } + }; + struct HandlerStereo : iSampleHandler + { + void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final + { + rdi.appendStereo( rsi, countFloats ); + } + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final + { + copyMono( pMono, rsi, sourceOffset, FFT_STEP ); + copyStereo( pStereo, rsi, sourceOffset, FFT_STEP ); + } + void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final + { + copyMono( pMono, rsi, sourceOffset, samples ); + copyStereo( pStereo, rsi, sourceOffset, samples ); + } + void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final + { + const size_t len = rdi.mono.size(); + assert( amount <= len ); + if( amount < len ) + { + const size_t block = len - amount; + memmove( rdi.mono.data(), rdi.mono.data() + amount, block * 4 ); + rdi.mono.resize( block ); + memmove( rdi.stereo.data(), rdi.stereo.data() + amount * 2, block * 8 ); + rdi.mono.resize( block * 2 ); + } + else + { + rdi.mono.clear(); + rdi.stereo.clear(); + } + } + uint32_t readerChannelsCount() const override final { return 2; } + }; + static const 
HandlerMono s_mono; + static const HandlerDownmixedStereo s_downmix; + static const HandlerStereo s_stereo; +} + +PcmReader::PcmReader( IMFSourceReader* reader, bool stereo ) +{ + if( nullptr == reader ) + throw E_POINTER; + this->reader = reader; + + // Set up media type, and figure out sample handler + check( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) ); + check( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) ); + + CComPtr<IMFMediaType> mtNative; + check( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &mtNative ) ); + UINT32 numChannels; + check( mtNative->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &numChannels ) ); + + const bool sourceMono = numChannels < 2; + if( sourceMono ) + sampleHandler = &s_mono; + else if( !stereo ) + sampleHandler = &s_downmix; + else + { + sampleHandler = &s_stereo; + m_stereoOutput = true; + } + + CComPtr<IMFMediaType> mt; + check( createMediaType( !sourceMono, &mt ) ); + check( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, mt ) ); + + // Find out the length + int64_t durationTicks; + check( getStreamDuration( reader, durationTicks ) ); + + // Convert length to chunks + // Seconds = Ticks / 10^7 + // Samples = Seconds * SAMPLE_RATE = Ticks * SAMPLE_RATE / 10^7 + // Chunks = Samples / FFT_STEP = Ticks * SAMPLE_RATE / ( FFT_STEP * 10^7 ), and we want that integer rounded down + constexpr __int64 mul = SAMPLE_RATE; + constexpr __int64 div = (__int64)FFT_STEP * 10'000'000; + m_length = (size_t)MFllMulDiv( durationTicks, mul, div, 0 ); +} + +HRESULT PcmReader::readNextSample() +{ + const size_t off = bufferReadOffset; + const size_t availableSamples = pcm.mono.size() - off; + + // If needed, move the remaining PCM data to the start of these vectors + if( availableSamples > 0 ) + { + if( 0 != off ) + sampleHandler->moveBufferData( pcm, off ); + } + else + pcm.clear(); + bufferReadOffset = 0; + + while( true ) + { + 
DWORD dwFlags = 0; + CComPtr<IMFSample> sample; + + // Read the next sample + HRESULT hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &dwFlags, nullptr, &sample ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"IMFSourceReader.ReadSample" ); + return hr; + } + + if( dwFlags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED ) + { + // logError( u8"Media type changes ain’t supported by the library." ); + // return E_UNEXPECTED; + + // This happens for some video files at the very start of the reading, with Dolby AC3 audio track. + // Instead of failing the transcribe process, verify the important attributes (FP32 samples, sample rate, count of channels) haven’t changed. + CHECK( validateCurrentMediaType( reader, sampleHandler->readerChannelsCount() ) ); + } + + if( dwFlags & MF_SOURCE_READERF_ENDOFSTREAM ) + return E_EOF; + + if( !sample ) + { + // printf( "No sample\n" ); + continue; + } + + // Get a pointer to the audio data in the sample. + CComPtr<IMFMediaBuffer> buffer; + hr = sample->ConvertToContiguousBuffer( &buffer ); + if( FAILED( hr ) ) + return hr; + + const float* pAudioData = nullptr; + DWORD cbBuffer; + hr = buffer->Lock( (BYTE**)&pAudioData, nullptr, &cbBuffer ); + if( FAILED( hr ) ) + return hr; + + try + { + assert( 0 == ( cbBuffer % sizeof( float ) ) ); + const size_t countFloats = cbBuffer / sizeof( float ); + sampleHandler->appendPcm( pcm, pAudioData, countFloats ); + } + catch( const std::bad_alloc& ) + { + buffer->Unlock(); + return E_OUTOFMEMORY; + } + + // Unlock the buffer + hr = buffer->Unlock(); + if( FAILED( hr ) ) + return hr; + + return S_OK; + } +} + +HRESULT PcmReader::readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo ) +{ + while( true ) + { + const size_t off = bufferReadOffset; + const size_t availableSamples = pcm.mono.size() - off; + if( availableSamples >= FFT_STEP ) + { + // We have enough data in the buffer + sampleHandler->copyChunk( &mono, pcm, off, stereo ); + bufferReadOffset = off + FFT_STEP; 
+ return S_OK; + } + + if( !m_readerEndOfFile ) + { + // We don't have enough data, but the stream has not ended yet, can load moar samples from the reader + HRESULT hr = readNextSample(); + if( SUCCEEDED( hr ) ) + continue; + if( hr != E_EOF ) + return hr; + m_readerEndOfFile = true; + } + + if( availableSamples > 0 ) + { + // We have reached the end of stream of the reader, but the buffer still has a few samples. + // Return the final incomplete chunk padded with zeros + sampleHandler->copyChunk( &mono, pcm, off, availableSamples, stereo ); + bufferReadOffset = off + availableSamples; + return S_OK; + } + + return E_EOF; + } +}
\ No newline at end of file diff --git a/Whisper/MF/PcmReader.h b/Whisper/MF/PcmReader.h new file mode 100644 index 0000000..9e3757e --- /dev/null +++ b/Whisper/MF/PcmReader.h @@ -0,0 +1,63 @@ +#pragma once +#include "../Whisper/audioConstants.h" +#include <mfidl.h> +#include <mfreadwrite.h> +#include "AudioBuffer.h" + +namespace Whisper +{ + // PCM buffer with 10 milliseconds of single-channel audio + struct PcmMonoChunk + { + std::array<float, FFT_STEP> mono; + }; + // PCM buffer with 10 milliseconds of interleaved stereo + struct PcmStereoChunk + { + std::array<float, FFT_STEP * 2> stereo; + }; + + __interface iSampleHandler; + + constexpr HRESULT E_EOF = HRESULT_FROM_WIN32( ERROR_HANDLE_EOF ); + + // Utility class which reads chunks of FFT_STEP FP32 PCM samples from the MF source reader + // The class always delivers mono chunks, and can optionally deliver stereo in a separate buffer. + class PcmReader + { + // A small intermediate buffer with PCM data for complete media foundation samples + AudioBuffer pcm; + // Index of the first unconsumed sample in the pcm buffer + size_t bufferReadOffset = 0; + // Utility object to abstract away mono versus stereo shenanigans + const iSampleHandler* sampleHandler; + // The underlying MF source reader which delivers audio data + CComPtr<IMFSourceReader> reader; + // True after we consumed all available media samples from the reader + bool m_readerEndOfFile = false; + // True if this object delivers stereo samples + bool m_stereoOutput = false; + // The count of chunks we expect to get from the reader + size_t m_length = 0; + // Read next sample from the reader, store in the PCM buffer in this class + HRESULT readNextSample(); + + public: + + PcmReader( IMFSourceReader* source, bool stereo ); + + // Count of chunks in the MEL spectrogram. + // The PCM audio is generally slightly longer than that, due to the incomplete last chunk. 
+ size_t getLength() const noexcept + { + return m_length; + } + + // True when the stereo flag passed to constructor, and the audio stream actually has 2 or more audio channels + bool outputsStereo() const { return m_stereoOutput; } + + // Load another 10ms chunk from the stream + // For the last chunk in the stream, the output buffers are padded with zeros + HRESULT readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo ); + }; +}
\ No newline at end of file diff --git a/Whisper/MF/loadAudioFile.cpp b/Whisper/MF/loadAudioFile.cpp new file mode 100644 index 0000000..d1a439a --- /dev/null +++ b/Whisper/MF/loadAudioFile.cpp @@ -0,0 +1,151 @@ +#include "stdafx.h" +#include "../ComLightLib/comLightServer.h" +#include "loadAudioFile.h" +#include "mfUtils.h" +#include "AudioBuffer.h" +#include <mfidl.h> +#include <mfreadwrite.h> +#include <mfapi.h> +#pragma comment(lib, "Mfreadwrite.lib") +#pragma comment(lib, "mfuuid.lib") + +namespace Whisper +{ + class MediaFileBuffer : public ComLight::ObjectRoot<iAudioBuffer> + { + AudioBuffer pcm; + uint32_t channels = 0; + + uint32_t COMLIGHTCALL countSamples() const noexcept override final + { + return (uint32_t)( pcm.mono.size() ); + } + const float* COMLIGHTCALL getPcmMono() const noexcept override final + { + if( !pcm.mono.empty() ) + return pcm.mono.data(); + return nullptr; + } + const float* COMLIGHTCALL getPcmStereo() const noexcept override final + { + if( !pcm.stereo.empty() ) + return pcm.stereo.data(); + return nullptr; + } + HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const noexcept override final + { + rdi = 0; + return S_OK; + } + public: + HRESULT load( LPCTSTR path, bool stereo ); + }; + + HRESULT MediaFileBuffer::load( LPCTSTR path, bool stereo ) + { + CComPtr<IMFSourceReader> reader; + HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" ); + return hr; + } + + CHECK( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) ); + CHECK( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) ); + + CComPtr<IMFMediaType> mtNative; + CHECK( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &mtNative ) ); + UINT32 numChannels; + CHECK( mtNative->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &numChannels ) ); + const bool sourceMono = numChannels == 1; + const 
AudioBuffer::pfnAppendSamples pfn = AudioBuffer::appendSamplesFunc( sourceMono, stereo ); + channels = ( stereo && !sourceMono ) ? 2 : 1; + + CComPtr<IMFMediaType> mt; + CHECK( createMediaType( !sourceMono, &mt ) ); + + CHECK( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, mt ) ); + + while( true ) + { + DWORD dwFlags = 0; + CComPtr<IMFSample> sample; + + // Read the next sample. + hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &dwFlags, nullptr, &sample ); + if( FAILED( hr ) ) + { + logErrorHr( hr, u8"IMFSourceReader.ReadSample" ); + return hr; + } + + if( dwFlags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED ) + { + logError( u8"Media type changes ain’t supported by the library." ); + return E_UNEXPECTED; + } + + if( dwFlags & MF_SOURCE_READERF_ENDOFSTREAM ) + break; + + if( !sample ) + { + // printf( "No sample\n" ); + continue; + } + + // Get a pointer to the audio data in the sample. + CComPtr<IMFMediaBuffer> buffer; + hr = sample->ConvertToContiguousBuffer( &buffer ); + if( FAILED( hr ) ) + return hr; + + const float* pAudioData = nullptr; + DWORD cbBuffer; + hr = buffer->Lock( (BYTE**)&pAudioData, nullptr, &cbBuffer ); + if( FAILED( hr ) ) + return hr; + + try + { + const size_t countFloats = cbBuffer / sizeof( float ); + ( pcm.*pfn )( pAudioData, countFloats ); + } + catch( const std::bad_alloc& ) + { + return E_OUTOFMEMORY; + } + + // Unlock the buffer + hr = buffer->Unlock(); + if( FAILED( hr ) ) + return hr; + } + + const size_t len = pcm.mono.size(); + if( len == 0 ) + { + logError16( L"The audio file \"%s\" has no samples", path ); + return E_INVALIDARG; + } + if( len < SAMPLE_RATE / 2 ) + logError16( L"The file \"%s\" only has %zu samples, less than 0.5 seconds of audio", path, len ); + else + logDebug16( L"Loaded audio file from \"%s\": %zu samples, %g seconds", path, len, (int)len * ( 1.0 / SAMPLE_RATE ) ); + return S_OK; + + } +} + +HRESULT COMLIGHTCALL Whisper::loadAudioFile( LPCTSTR 
path, bool stereo, iAudioBuffer** pp ) +{ + if( nullptr == path || nullptr == pp ) + return E_POINTER; + + ComLight::CComPtr<ComLight::Object<MediaFileBuffer>> obj; + CHECK( ComLight::Object<MediaFileBuffer>::create( obj ) ); + CHECK( obj->load( path, stereo ) ); + obj.detach( pp ); + return S_OK; +}
\ No newline at end of file diff --git a/Whisper/MF/loadAudioFile.h b/Whisper/MF/loadAudioFile.h new file mode 100644 index 0000000..9736ccd --- /dev/null +++ b/Whisper/MF/loadAudioFile.h @@ -0,0 +1,7 @@ +#pragma once +#include "../API/iMediaFoundation.cl.h" + +namespace Whisper +{ + HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ); +}
\ No newline at end of file diff --git a/Whisper/MF/mfStartup.cpp b/Whisper/MF/mfStartup.cpp new file mode 100644 index 0000000..b7ab829 --- /dev/null +++ b/Whisper/MF/mfStartup.cpp @@ -0,0 +1,128 @@ +#include "stdafx.h" +#include "mfStartup.h" +#include <atlbase.h> +#include <mfapi.h> +#pragma comment(lib, "Mfplat.lib") + +namespace +{ + struct sCoInitStatus + { + // Possible state: + // -1 is the initial state, coInitialize never called + // S_OK - CoInitializeEx succeeded, in this state the counter tracks the count of coInitialize() for the current thread + // S_FALSE - CoInitializeEx failed with RPC_E_CHANGED_MODE, or did nothing because already initialized for the current thread + // Error status - CoInitializeEx failed for some other reason + HRESULT code = -1; + uint32_t counter = 0; + }; + thread_local sCoInitStatus coInitStatus; + + static HRESULT coInitialize() + { + sCoInitStatus& cis = coInitStatus; + HRESULT hr = cis.code; + if( SUCCEEDED( hr ) ) + { + if( S_OK == hr ) + cis.counter++; + return S_FALSE; + } + + if( hr == HRESULT( -1 ) ) + { + hr = CoInitializeEx( nullptr, COINIT_MULTITHREADED ); + if( S_OK == hr ) + { + cis.counter = 1; + return cis.code = S_OK; + } + if( S_FALSE == hr || RPC_E_CHANGED_MODE == hr ) + { + return cis.code = S_FALSE; + } + cis.code = hr; + return hr; + } + + return hr; + } + + static void coUninitialize() + { + sCoInitStatus& cis = coInitStatus; + if( cis.code == S_OK ) + { + assert( cis.counter > 0 ); + cis.counter--; + if( 0 == cis.counter ) + CoUninitialize(); + } + } + + static CComAutoCriticalSection s_lock; +#define LOCK() CComCritSecLock<CComAutoCriticalSection> lock{ s_lock } + static uint32_t mfStartupCounter = 0; + + constexpr uint8_t FlagCOM = 1; + constexpr uint8_t FlagMF = 0x10; +} + +using namespace Whisper; + +MfStartupRaii::~MfStartupRaii() +{ + if( 0 != ( successFlags & FlagMF ) ) + { + LOCK(); + assert( mfStartupCounter > 0 ); + mfStartupCounter--; + if( mfStartupCounter > 0 ) + return; + MFShutdown(); + 
successFlags &= ~FlagMF; + } + + if( 0 != ( successFlags & FlagCOM ) ) + { + coUninitialize(); + successFlags &= ~FlagCOM; + } +} + +HRESULT MfStartupRaii::startup() +{ + if( 0 != ( successFlags & FlagMF ) ) + return HRESULT_FROM_WIN32( ERROR_ALREADY_INITIALIZED ); + + HRESULT hr = coInitialize(); + CHECK( hr ); + if( hr == S_OK ) + successFlags |= FlagCOM; + + LOCK(); + + if( 0 == mfStartupCounter ) + { + HRESULT hr = MFStartup( MF_VERSION, MFSTARTUP_LITE ); + if( SUCCEEDED( hr ) ) + { + mfStartupCounter = 1; + successFlags |= FlagMF; + return S_OK; + } + + if( 0 != ( successFlags & FlagCOM ) ) + { + coUninitialize(); + successFlags &= ~FlagCOM; + } + return hr; + } + else + { + mfStartupCounter++; + successFlags |= FlagMF; + return S_FALSE; + } +}
\ No newline at end of file diff --git a/Whisper/MF/mfStartup.h b/Whisper/MF/mfStartup.h new file mode 100644 index 0000000..1434ffc --- /dev/null +++ b/Whisper/MF/mfStartup.h @@ -0,0 +1,15 @@ +#pragma once + +namespace Whisper +{ + class MfStartupRaii + { + uint8_t successFlags = 0; + public: + MfStartupRaii() = default; + ~MfStartupRaii(); + MfStartupRaii( const MfStartupRaii& ) = delete; + + HRESULT startup(); + }; +}
\ No newline at end of file diff --git a/Whisper/MF/mfUtils.cpp b/Whisper/MF/mfUtils.cpp new file mode 100644 index 0000000..e739079 --- /dev/null +++ b/Whisper/MF/mfUtils.cpp @@ -0,0 +1,69 @@ +#include "stdafx.h" +#include "mfUtils.h" +#include <mfapi.h> + +HRESULT Whisper::createMediaType( bool stereo, IMFMediaType** pp ) +{ + if( nullptr == pp ) + return E_POINTER; + + CComPtr<IMFMediaType> mt; + CHECK( MFCreateMediaType( &mt ) ); + CHECK( mt->SetGUID( MF_MT_MAJOR_TYPE, MFMediaType_Audio ) ); + CHECK( mt->SetGUID( MF_MT_SUBTYPE, MFAudioFormat_Float ) ); + CHECK( mt->SetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, SAMPLE_RATE ) ); + + const uint32_t channels = stereo ? 2 : 1; + CHECK( mt->SetUINT32( MF_MT_AUDIO_NUM_CHANNELS, channels ) ); + CHECK( mt->SetUINT32( MF_MT_AUDIO_BLOCK_ALIGNMENT, channels * 4 ) ); + CHECK( mt->SetUINT32( MF_MT_AUDIO_AVG_BYTES_PER_SECOND, channels * 4 * SAMPLE_RATE ) ); + CHECK( mt->SetUINT32( MF_MT_AUDIO_BITS_PER_SAMPLE, 32 ) ); + CHECK( mt->SetUINT32( MF_MT_ALL_SAMPLES_INDEPENDENT, TRUE ) ); + + *pp = mt.Detach(); + + return S_OK; +} + +HRESULT Whisper::getStreamDuration( IMFSourceReader* reader, int64_t& duration ) +{ + PROPVARIANT var; + PropVariantInit( &var ); + CHECK( reader->GetPresentationAttribute( MF_SOURCE_READER_MEDIASOURCE, MF_PD_DURATION, &var ) ); + + if( var.vt == VT_UI8 ) + { + // The documentation says the type of that attribute is UINT64 + // https://learn.microsoft.com/en-us/windows/win32/medfound/mf-pd-duration-attribute + duration = var.uhVal.QuadPart; + return S_OK; + } + logError( u8"Unexpected type of MF_PD_DURATION attribute" ); + return E_INVALIDARG; +} + +HRESULT Whisper::validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels ) +{ + CComPtr<IMFMediaType> mt; + CHECK( reader->GetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, &mt ) ); + + GUID guid; + CHECK( mt->GetGUID( MF_MT_MAJOR_TYPE, &guid ) ); + if( guid != MFMediaType_Audio ) + return E_FAIL; + + CHECK( mt->GetGUID( 
MF_MT_SUBTYPE, &guid ) ); + if( guid != MFAudioFormat_Float ) + return E_FAIL; + + UINT32 u32; + CHECK( mt->GetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, &u32 ) ); + if( u32 != SAMPLE_RATE ) + return E_FAIL; + + CHECK( mt->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &u32 ) ); + if( u32 != expectedChannels ) + return E_FAIL; + + return S_OK; +}
\ No newline at end of file diff --git a/Whisper/MF/mfUtils.h b/Whisper/MF/mfUtils.h new file mode 100644 index 0000000..c889a92 --- /dev/null +++ b/Whisper/MF/mfUtils.h @@ -0,0 +1,15 @@ +#pragma once +#include <stdint.h> +#include <mfidl.h> +#include <mfobjects.h> +#include <mfreadwrite.h> +#include "../Whisper/audioConstants.h" + +namespace Whisper +{ + HRESULT createMediaType( bool stereo, IMFMediaType** pp ); + + HRESULT getStreamDuration( IMFSourceReader* reader, int64_t& duration ); + + HRESULT validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels ); +}
\ No newline at end of file |
