Source codes

author: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
committer: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
commit: 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree: 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/MF
parent: 990a8d0dbaefc996244097397259e92758b15cce (diff)
13 files changed, 1144 insertions, 0 deletions
diff --git a/Whisper/MF/AudioBuffer.cpp b/Whisper/MF/AudioBuffer.cpp
new file mode 100644
index 0000000..ba4752d
--- /dev/null
+++ b/Whisper/MF/AudioBuffer.cpp
@@ -0,0 +1,93 @@
+#include "stdafx.h"
+#include "AudioBuffer.h"
+using namespace Whisper;
+
+// Append single-channel samples to the end of the mono vector; the stereo vector is not touched.
+void AudioBuffer::appendMono( const float* rsi, size_t countFloats )
+{
+	const float* const sourceEnd = rsi + countFloats;
+	mono.insert( mono.end(), rsi, sourceEnd );
+}
+
+// Append interleaved stereo samples to this buffer.
+// The source floats are copied verbatim into the stereo vector, and the average of the two channels of every sample is appended to the mono vector.
+// countFloats is the count of floats at rsi, 2 per sample, and is expected to be even.
+void AudioBuffer::appendStereo( const float* rsi, size_t countFloats )
+{
+	assert( 0 == ( countFloats % 2 ) );
+	const size_t countSamples = countFloats / 2;
+
+	const size_t oldLength = mono.size();
+	assert( oldLength * 2 == stereo.size() );
+	mono.resize( oldLength + countSamples );
+	stereo.resize( ( oldLength + countSamples ) * 2 );
+
+	const float* const rsiEnd = rsi + countSamples * 2;
+	// End of the portion of the input which consists of complete groups of 8 floats
+	const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8;
+
+	// Using data() + offset instead of &vector[ offset ]:
+	// when no samples are appended the offset is equal to size(), and operator[] with that index is undefined behavior
+	float* rdiStereo = stereo.data() + oldLength * 2;
+	float* rdiMono = mono.data() + oldLength;
+
+	const __m128 half = _mm_set1_ps( 0.5f );
+	for( ; rsi < rsiEndAligned; rsi += 8, rdiStereo += 8, rdiMono += 4 )
+	{
+		// Load 4 samples = 8 floats 
+		__m128 v0 = _mm_loadu_ps( rsi );	// L0, R0, L1, R1
+		__m128 v1 = _mm_loadu_ps( rsi + 4 );// L2, R2, L3, R3
+
+		// Store into the stereo PCM vector
+		_mm_storeu_ps( rdiStereo, v0 );
+		_mm_storeu_ps( rdiStereo + 4, v1 );
+
+		// Compute and store the average of these channels
+		__m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) );	// L0, L1, L2, L3
+		__m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) );	// R0, R1, R2, R3
+		__m128 sum = _mm_add_ps( left, right );
+		sum = _mm_mul_ps( sum, half );
+		_mm_storeu_ps( rdiMono, sum );
+	}
+
+	// The remainder, one sample = 2 floats per iteration
+#pragma loop (no_vector)
+	for( ; rsi < rsiEnd; rsi += 2, rdiStereo += 2, rdiMono++ )
+	{
+		// A single 8-byte load and store moves both channels of the sample
+		__m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+		_mm_store_sd( (double*)rdiStereo, _mm_castps_pd( vec ) );
+
+		// _mm_movehdup_ps copies the right channel into lane 0, the sum of the two channels ends up in lane 0
+		vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) );
+		vec = _mm_mul_ss( vec, half );
+		_mm_store_ss( rdiMono, vec );
+	}
+}
+
+// Downmix interleaved stereo samples, and append the result to the mono vector.
+// Every output sample is the average of the left and right channels; the stereo vector is not touched.
+// countFloats is the count of floats at rsi, 2 per sample, and is expected to be even.
+void AudioBuffer::appendDownmixedStereo( const float* rsi, size_t countFloats )
+{
+	assert( 0 == ( countFloats % 2 ) );
+	const size_t countSamples = countFloats / 2;
+
+	const size_t oldLength = mono.size();
+	mono.resize( oldLength + countSamples );
+
+	const float* const rsiEnd = rsi + countSamples * 2;
+	// End of the portion of the input which consists of complete groups of 8 floats
+	const float* const rsiEndAligned = rsiEnd - ( countSamples * 2 ) % 8;
+
+	// Using data() + offset instead of &mono[ oldLength ]:
+	// when no samples are appended the offset is equal to size(), and operator[] with that index is undefined behavior
+	float* rdiMono = mono.data() + oldLength;
+
+	const __m128 half = _mm_set1_ps( 0.5f );
+	for( ; rsi < rsiEndAligned; rsi += 8, rdiMono += 4 )
+	{
+		// Load 4 samples = 8 floats 
+		__m128 v0 = _mm_loadu_ps( rsi );	// L0, R0, L1, R1
+		__m128 v1 = _mm_loadu_ps( rsi + 4 );// L2, R2, L3, R3
+
+		// Compute and store the average of these channels
+		__m128 left = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 2, 0, 2, 0 ) );	// L0, L1, L2, L3
+		__m128 right = _mm_shuffle_ps( v0, v1, _MM_SHUFFLE( 3, 1, 3, 1 ) );	// R0, R1, R2, R3
+		__m128 sum = _mm_add_ps( left, right );
+		sum = _mm_mul_ps( sum, half );
+		_mm_storeu_ps( rdiMono, sum );
+	}
+
+	// The remainder, one sample = 2 floats per iteration
+#pragma loop (no_vector)
+	for( ; rsi < rsiEnd; rsi += 2, rdiMono++ )
+	{
+		// A single 8-byte load brings both channels of the sample
+		__m128 vec = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+		// _mm_movehdup_ps copies the right channel into lane 0, the sum of the two channels ends up in lane 0
+		vec = _mm_add_ss( vec, _mm_movehdup_ps( vec ) );
+		vec = _mm_mul_ss( vec, half );
+		_mm_store_ss( rdiMono, vec );
+	}
+}
+\ No newline at end of file
diff --git a/Whisper/MF/AudioBuffer.h b/Whisper/MF/AudioBuffer.h
new file mode 100644
index 0000000..87319dd
--- /dev/null
+++ b/Whisper/MF/AudioBuffer.h
@@ -0,0 +1,41 @@
+#pragma once
+#include <vector>
+
+namespace Whisper
+{
+	// Growable storage for FP32 PCM audio: a single-channel vector, and an interleaved stereo vector which may stay empty
+	struct AudioBuffer
+	{
+		std::vector<float> mono;
+		std::vector<float> stereo;
+
+		void appendMono( const float* rsi, size_t countFloats );
+		void appendDownmixedStereo( const float* rsi, size_t countFloats );
+		void appendStereo( const float* rsi, size_t countFloats );
+
+		// Pointer to one of the append* methods above
+		using pfnAppendSamples = void( AudioBuffer::* )( const float* rsi, size_t countFloats );
+
+		// Select the append method for the source layout, and for the kind of output the consumer wants
+		inline static pfnAppendSamples appendSamplesFunc( bool sourceMono, bool wantStereo )
+		{
+			// Mono sources are appended as is, regardless of what was requested
+			if( sourceMono )
+				return &AudioBuffer::appendMono;
+			return wantStereo ? &AudioBuffer::appendStereo : &AudioBuffer::appendDownmixedStereo;
+		}
+
+		// Drop the content of both vectors
+		void clear()
+		{
+			stereo.clear();
+			mono.clear();
+		}
+
+		// Set the length of the buffer to len samples; the new length is expected to not exceed the current one.
+		// The stereo vector is only resized when it has data.
+		void resize( size_t len )
+		{
+			assert( len <= mono.size() );
+			mono.resize( len );
+			if( stereo.empty() )
+				return;
+			stereo.resize( len * 2 );
+		}
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/MF/AudioCapture.cpp b/Whisper/MF/AudioCapture.cpp
new file mode 100644
index 0000000..17f34dc
--- /dev/null
+++ b/Whisper/MF/AudioCapture.cpp
@@ -0,0 +1,167 @@
+#include "stdafx.h"
+#include <atlstr.h>
+#include <mfapi.h>
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "AudioCapture.h"
+#include "../API/iMediaFoundation.cl.h"
+#include "../ComLightLib/comLightServer.h"
+#pragma comment(lib, "Mf.lib")
+
+namespace
+{
+	struct Strings	// The two strings getInfo() queries for a capture device
+	{
+		CString displayName, endpoint;	// Friendly name of the device, and the ID of the audio capture endpoint
+	};
+
+	// Query a string attribute of the activation object, and copy the value into rdi.
+	// The memory allocated by GetAllocatedString is released before returning; rdi is only modified when the query succeeds.
+	HRESULT getAllocString( IMFActivate* activate, const GUID& id, CString& rdi )
+	{
+		UINT32 length;
+		wchar_t* allocated = nullptr;
+		const HRESULT status = activate->GetAllocatedString( id, &allocated, &length );
+		if( SUCCEEDED( status ) )
+			rdi.SetString( allocated, length );
+		CoTaskMemFree( allocated );
+		return status;
+	}
+
+	// Query both the friendly name and the endpoint ID of a capture device
+	HRESULT getInfo( IMFActivate* activate, Strings& rdi )
+	{
+		CString& friendlyName = rdi.displayName;
+		CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME, friendlyName ) );
+
+		CString& endpointId = rdi.endpoint;
+		CHECK( getAllocString( activate, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, endpointId ) );
+
+		return S_OK;
+	}
+
+	// Collect the names of the enumerated devices, and pass them to the callback in a single call.
+	// Null entries, and devices for which the strings could not be queried, are skipped.
+	// When nothing is left, the callback is called with zero count and a null pointer.
+	HRESULT __stdcall supplyDevices( Whisper::pfnFoundCaptureDevices pfn, void* pv, IMFActivate** ppDevices, UINT32 count )
+	{
+		if( nullptr != ppDevices && 0 != count )
+		{
+			// This vector owns the strings
+			std::vector<Strings> names;
+			names.reserve( count );
+
+			IMFActivate** const devicesEnd = ppDevices + count;
+			for( IMFActivate** it = ppDevices; it != devicesEnd; it++ )
+			{
+				IMFActivate* const device = *it;
+				if( nullptr == device )
+					continue;
+
+				Strings info;
+				if( FAILED( getInfo( device, info ) ) )
+					continue;
+				names.emplace_back( std::move( info ) );
+			}
+
+			if( !names.empty() )
+			{
+				// The structures for the callback, they refer to the strings owned by the names vector
+				const size_t length = names.size();
+				std::vector<Whisper::sCaptureDevice> devices( length );
+				for( size_t i = 0; i < length; i++ )
+				{
+					devices[ i ].displayName = names[ i ].displayName;
+					devices[ i ].endpoint = names[ i ].endpoint;
+				}
+				return pfn( (int)length, devices.data(), pv );
+			}
+		}
+
+		return pfn( 0, nullptr, pv );
+	}
+}
+
+// Enumerate the audio capture devices, and report them to the callback.
+// pfn is called once with the complete list, pv is passed to the callback as is.
+// Returns the status of the enumeration when it fails, otherwise whatever the callback has returned.
+HRESULT __stdcall Whisper::captureDeviceList( pfnFoundCaptureDevices pfn, void* pv )
+{
+	// Create an attribute store to hold the search criteria.
+	CComPtr<IMFAttributes> attrs;
+	CHECK( MFCreateAttributes( &attrs, 1 ) );
+	// Request audio capture devices
+	CHECK( attrs->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) );
+
+	// Enumerate the devices
+	IMFActivate** ppDevices = nullptr;
+	UINT32 count = 0;
+	CHECK( MFEnumDeviceSources( attrs, &ppDevices, &count ) );
+
+	// Feed the data to the caller
+	HRESULT hr = supplyDevices( pfn, pv, ppDevices, count );
+
+	// Free the memory
+	// supplyDevices() tolerates a null array and null entries, the cleanup does the same instead of dereferencing them
+	if( nullptr != ppDevices )
+	{
+		for( UINT32 i = 0; i < count; i++ )
+		{
+			IMFActivate* const activate = ppDevices[ i ];
+			if( nullptr != activate )
+				activate->Release();
+		}
+	}
+	CoTaskMemFree( ppDevices );
+
+	return hr;
+}
+
+namespace
+{
+	using namespace Whisper;
+
+	// Implementation of iAudioCapture: keeps the MF source reader created for a capture endpoint, and the capture parameters
+	class Capture : public ComLight::ObjectRoot<iAudioCapture>
+	{
+		CComPtr<IMFSourceReader> reader;
+		CComPtr<iMediaFoundation> mediaFoundation;
+		sCaptureParams captureParams;
+
+		// Return the source reader with a reference added for the caller; the output is nullptr when the reader was not created
+		HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final
+		{
+			if( nullptr == pp )
+				return E_POINTER;
+
+			IMFSourceReader* const raw = reader;
+			if( nullptr != raw )
+				raw->AddRef();
+			*pp = raw;
+			return S_OK;
+		}
+
+		// The parameters passed to open()
+		const sCaptureParams& COMLIGHTCALL getParams() const noexcept override final
+		{
+			return captureParams;
+		}
+
+	public:
+		// Create the source reader for the capture endpoint
+		HRESULT open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp );
+	};
+
+	// Create a media source for the audio capture endpoint, and a source reader on top of it.
+	// On success, stores the capture parameters and a reference to the owner object.
+	HRESULT Capture::open( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& cp )
+	{
+		// The attribute store describes the device to open: an audio capture source with the specified endpoint ID
+		CComPtr<IMFAttributes> deviceAttributes;
+		CHECK( MFCreateAttributes( &deviceAttributes, 2 ) );
+		CHECK( deviceAttributes->SetGUID( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID ) );
+		CHECK( deviceAttributes->SetString( MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_ENDPOINT_ID, endpoint ) );
+
+		CComPtr<IMFMediaSource> mediaSource;
+		const HRESULT hrSource = MFCreateDeviceSource( deviceAttributes, &mediaSource );
+		if( FAILED( hrSource ) )
+		{
+			logErrorHr( hrSource, u8"MFCreateDeviceSource" );
+			return hrSource;
+		}
+
+		// TODO: implement IMFSourceReaderCallback, pass into MF_SOURCE_READER_ASYNC_CALLBACK attribute
+		// This is to support cancellation
+		const HRESULT hrReader = MFCreateSourceReaderFromMediaSource( mediaSource, nullptr, &reader );
+		if( FAILED( hrReader ) )
+		{
+			logErrorHr( hrReader, u8"MFCreateSourceReaderFromMediaSource" );
+			return hrReader;
+		}
+
+		mediaFoundation = owner;
+		captureParams = cp;
+		return S_OK;
+	}
+}
+
+// Create a capture object for the specified endpoint, and return it in pp.
+// Returns E_POINTER when either endpoint or pp is null.
+HRESULT __stdcall Whisper::captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept
+{
+	const bool validPointers = ( nullptr != endpoint ) && ( nullptr != pp );
+	if( !validPointers )
+		return E_POINTER;
+
+	using CaptureObject = ComLight::Object<Capture>;
+	ComLight::CComPtr<CaptureObject> capture;
+	CHECK( CaptureObject::create( capture ) );
+	CHECK( capture->open( owner, endpoint, captureParams ) );
+
+	capture.detach( pp );
+	return S_OK;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/AudioCapture.h b/Whisper/MF/AudioCapture.h
new file mode 100644
index 0000000..276ee4b
--- /dev/null
+++ b/Whisper/MF/AudioCapture.h
@@ -0,0 +1,12 @@
+#pragma once
+#include "../API/MfStructs.h"
+
+namespace Whisper
+{
+	struct iAudioCapture;
+	struct iMediaFoundation;
+
+	HRESULT __stdcall captureDeviceList( pfnFoundCaptureDevices pfn, void* pv );
+
+	HRESULT __stdcall captureOpen( iMediaFoundation* owner, const wchar_t* endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/MediaFoundation.cpp b/Whisper/MF/MediaFoundation.cpp
new file mode 100644
index 0000000..4a4f6a2
--- /dev/null
+++ b/Whisper/MF/MediaFoundation.cpp
@@ -0,0 +1,109 @@
+#include "stdafx.h"
+#include "../API/iMediaFoundation.cl.h"
+#include "mfStartup.h"
+#include "../ComLightLib/comLightServer.h"
+#include "loadAudioFile.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "mfUtils.h"
+#include "AudioCapture.h"
+
+namespace Whisper
+{
+	// Implementation of iAudioReader: keeps the MF source reader created for a media file, and the stereo flag requested by the user
+	class AudioReader : public ComLight::ObjectRoot<iAudioReader>
+	{
+		CComPtr<IMFSourceReader> reader;
+		// Initialized, so requestedStereo() never reads an indeterminate value before open() has succeeded
+		bool wantStereo = false;
+		CComPtr<iMediaFoundation> mediaFoundation;
+
+		// Return the source reader with a reference added for the caller; the output is nullptr when no file was opened
+		HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const noexcept override final
+		{
+			if( pp == nullptr )
+				return E_POINTER;
+			CComPtr<IMFSourceReader> res = reader;
+			*pp = res.Detach();
+			return S_OK;
+		}
+		// S_OK when stereo was requested in open(), S_FALSE otherwise
+		HRESULT COMLIGHTCALL requestedStereo() const noexcept override final
+		{
+			return wantStereo ? S_OK : S_FALSE;
+		}
+		// Query the duration of the stream; fails with OLE_E_BLANK when there is no reader
+		HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const noexcept override final
+		{
+			if( reader )
+				return getStreamDuration( reader, rdi );
+			return OLE_E_BLANK;
+		}
+	public:
+		// Create the source reader for the file; on success, stores the stereo flag and a reference to the owner object
+		HRESULT open( iMediaFoundation* owner, LPCTSTR path, bool stereo )
+		{
+			HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader );
+			if( FAILED( hr ) )
+			{
+				logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" );
+				return hr;
+			}
+			wantStereo = stereo;
+			mediaFoundation = owner;
+			logDebug16( L"Created source reader from the file \"%s\"", path );
+			return S_OK;
+		}
+	};
+
+	// Implementation of iMediaFoundation: starts up Media Foundation in FinalConstruct, and forwards the API calls to the functions of this library
+	class MediaFoundation : public ComLight::ObjectRoot<iMediaFoundation>
+	{
+		MfStartupRaii raii;
+		// ID of the thread which has run FinalConstruct; all bits set until then
+		DWORD tid = ~(DWORD)0;
+
+		// Decode the complete file into a memory buffer
+		HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const noexcept override final
+		{
+			return Whisper::loadAudioFile( path, stereo, pp );
+		}
+
+		// Create a reader object for the file, for streaming use
+		HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) noexcept override final
+		{
+			const bool validPointers = ( nullptr != path ) && ( nullptr != pp );
+			if( !validPointers )
+				return E_POINTER;
+
+			using ReaderObject = ComLight::Object<AudioReader>;
+			ComLight::CComPtr<ReaderObject> readerObject;
+			CHECK( ReaderObject::create( readerObject ) );
+			CHECK( readerObject->open( this, path, stereo ) );
+
+			readerObject.detach( pp );
+			return S_OK;
+		}
+
+		// Enumerate audio capture devices
+		HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) noexcept override final
+		{
+			return captureDeviceList( pfn, pv );
+		}
+
+		// Open an audio capture device
+		HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) noexcept override final
+		{
+			return captureOpen( this, endpoint, captureParams, pp );
+		}
+
+	protected:
+
+		// Start up Media Foundation, and remember the calling thread
+		HRESULT FinalConstruct()
+		{
+			CHECK( raii.startup() );
+			tid = GetCurrentThreadId();
+			return S_OK;
+		}
+
+	public:
+
+		// Debug builds verify the object is destroyed on the thread which has run FinalConstruct
+		~MediaFoundation() override
+		{
+			assert( tid == GetCurrentThreadId() );
+		}
+	};
+}
+
+// Create the MediaFoundation object, and return it in pp
+HRESULT COMLIGHTCALL Whisper::initMediaFoundation( iMediaFoundation** pp )
+{
+	if( pp == nullptr )
+		return E_POINTER;
+
+	using MfObject = ComLight::Object<MediaFoundation>;
+	ComLight::CComPtr<MfObject> result;
+	CHECK( MfObject::create( result ) );
+
+	result.detach( pp );
+	return S_OK;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/PcmReader.cpp b/Whisper/MF/PcmReader.cpp
new file mode 100644
index 0000000..ab92fc3
--- /dev/null
+++ b/Whisper/MF/PcmReader.cpp
@@ -0,0 +1,274 @@
+#include "stdafx.h"
+#include "PcmReader.h"
+#include <mfapi.h>
+#include "mfUtils.h"
+
+namespace Whisper
+{
+	__interface iSampleHandler	// Operations on AudioBuffer which differ between the mono, downmixed stereo and stereo cases
+	{
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const;	// Copy a complete chunk of FFT_STEP samples, starting at sourceOffset
+		void moveBufferData( AudioBuffer& rdi, size_t amount ) const;	// Discard the first `amount` samples, moving the remaining ones to the start of the buffer
+		void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const;	// Append the floats delivered by the source reader to the buffer
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const;	// Copy `samples` samples, and fill the rest of the chunk with zeros
+		uint32_t readerChannelsCount() const;	// Count of channels in the PCM data the source reader is expected to deliver
+	};
+}
+
+namespace
+{
+	using namespace Whisper;
+
+	// Copy `samples` mono samples starting at sourceOffset into the chunk; when the chunk is incomplete, the tail is filled with zeros
+	__forceinline void copyMono( PcmMonoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples )
+	{
+		assert( sourceOffset + samples <= rsi.mono.size() );
+
+		float* const dest = rdi->mono.data();
+		const float* const source = &rsi.mono[ sourceOffset ];
+		memcpy( dest, source, samples * sizeof( float ) );
+
+		if( samples >= FFT_STEP )
+			return;
+		const size_t padding = FFT_STEP - samples;
+		memset( dest + samples, 0, padding * sizeof( float ) );
+	}
+
+	// Copy `samples` interleaved stereo samples starting at sourceOffset into the chunk; when the chunk is incomplete, the tail is filled with zeros
+	__forceinline void copyStereo( PcmStereoChunk* rdi, const AudioBuffer& rsi, size_t sourceOffset, size_t samples )
+	{
+		// 2 floats per sample
+		constexpr size_t bytesPerSample = 2 * sizeof( float );
+
+		float* const dest = rdi->stereo.data();
+		const float* const source = &rsi.stereo[ sourceOffset * 2 ];
+		memcpy( dest, source, samples * bytesPerSample );
+
+		if( samples >= FFT_STEP )
+			return;
+		const size_t padding = FFT_STEP - samples;
+		memset( dest + samples * 2, 0, padding * bytesPerSample );
+	}
+
+	// Sample handler for single-channel sources: only the mono vector of the buffer is used, the stereo chunk pointers are ignored
+	struct HandlerMono : iSampleHandler
+	{
+		void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override
+		{
+			rdi.appendMono( rsi, countFloats );
+		}
+
+		// Complete chunk
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final
+		{
+			copyMono( pMono, rsi, sourceOffset, FFT_STEP );
+		}
+
+		// Possibly incomplete chunk
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final
+		{
+			copyMono( pMono, rsi, sourceOffset, samples );
+		}
+
+		// Discard the first `amount` samples of the buffer
+		void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final
+		{
+			std::vector<float>& samples = rdi.mono;
+			const size_t length = samples.size();
+			assert( amount <= length );
+
+			if( amount >= length )
+			{
+				samples.clear();
+				return;
+			}
+
+			const size_t remaining = length - amount;
+			memmove( samples.data(), samples.data() + amount, remaining * sizeof( float ) );
+			samples.resize( remaining );
+		}
+
+		uint32_t readerChannelsCount() const override { return 1; }
+	};
+	// Sample handler for 2-channel sources when the consumer only wants mono:
+	// the incoming stereo samples are downmixed while appending, everything else is inherited from the mono handler
+	struct HandlerDownmixedStereo : HandlerMono
+	{
+		uint32_t readerChannelsCount() const override final
+		{
+			return 2;
+		}
+
+		void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final
+		{
+			rdi.appendDownmixedStereo( rsi, countFloats );
+		}
+	};
+	// Sample handler for 2-channel sources when the consumer wants stereo:
+	// maintains both the mono vector and the interleaved stereo vector of the buffer, and fills both output chunks
+	struct HandlerStereo : iSampleHandler
+	{
+		void appendPcm( AudioBuffer& rdi, const float* rsi, size_t countFloats ) const override final
+		{
+			rdi.appendStereo( rsi, countFloats );
+		}
+		// Complete chunk
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, PcmStereoChunk* pStereo ) const override final
+		{
+			copyMono( pMono, rsi, sourceOffset, FFT_STEP );
+			copyStereo( pStereo, rsi, sourceOffset, FFT_STEP );
+		}
+		// Possibly incomplete chunk
+		void copyChunk( PcmMonoChunk* pMono, const AudioBuffer& rsi, size_t sourceOffset, size_t samples, PcmStereoChunk* pStereo ) const override final
+		{
+			copyMono( pMono, rsi, sourceOffset, samples );
+			copyStereo( pStereo, rsi, sourceOffset, samples );
+		}
+		// Discard the first `amount` samples from both vectors of the buffer
+		void moveBufferData( AudioBuffer& rdi, size_t amount ) const override final
+		{
+			const size_t len = rdi.mono.size();
+			assert( amount <= len );
+			if( amount < len )
+			{
+				const size_t block = len - amount;
+				memmove( rdi.mono.data(), rdi.mono.data() + amount, block * 4 );
+				rdi.mono.resize( block );
+				memmove( rdi.stereo.data(), rdi.stereo.data() + amount * 2, block * 8 );
+				// The stereo vector holds 2 floats per remaining sample.
+				// Resizing the mono vector here instead would append zeros to it, and leave stale data at the end of the stereo one.
+				rdi.stereo.resize( block * 2 );
+			}
+			else
+			{
+				rdi.mono.clear();
+				rdi.stereo.clear();
+			}
+		}
+		uint32_t readerChannelsCount() const override final { return 2; }
+	};
+	static const HandlerMono s_mono;	// Handler instances; the PcmReader constructor stores a pointer to one of them
+	static const HandlerDownmixedStereo s_downmix;
+	static const HandlerStereo s_stereo;
+}
+
+// Configure the source reader to decode the first audio stream into FP32 PCM, select the sample handler, and compute the expected count of chunks.
+// Throws E_POINTER when the reader is null; the other failures are reported by the check() function.
+PcmReader::PcmReader( IMFSourceReader* reader, bool stereo )
+{
+	if( nullptr == reader )
+		throw E_POINTER;
+	this->reader = reader;
+
+	// Only the first audio stream is decoded
+	check( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) );
+	check( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) );
+
+	// Count of channels in the source
+	CComPtr<IMFMediaType> nativeType;
+	check( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &nativeType ) );
+	UINT32 sourceChannels;
+	check( nativeType->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &sourceChannels ) );
+
+	// Stereo is only delivered when it was requested, and the source has at least 2 channels
+	const bool sourceMono = sourceChannels < 2;
+	const bool deliverStereo = stereo && !sourceMono;
+	if( deliverStereo )
+	{
+		sampleHandler = &s_stereo;
+		m_stereoOutput = true;
+	}
+	else if( sourceMono )
+		sampleHandler = &s_mono;
+	else
+		sampleHandler = &s_downmix;
+
+	// Ask the reader to produce mono PCM for mono sources, and stereo PCM for everything else
+	CComPtr<IMFMediaType> pcmType;
+	check( createMediaType( !sourceMono, &pcmType ) );
+	check( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, pcmType ) );
+
+	// Duration of the stream, in ticks
+	int64_t ticks;
+	check( getStreamDuration( reader, ticks ) );
+
+	// Convert the duration into the count of complete chunks
+	// Seconds = Ticks / 10^7
+	// Samples = Seconds * SAMPLE_RATE = Ticks * SAMPLE_RATE / 10^7
+	// Chunks = Samples / FFT_STEP = Ticks * SAMPLE_RATE / ( FFT_STEP * 10^7 ), rounded down
+	constexpr __int64 numerator = SAMPLE_RATE;
+	constexpr __int64 denominator = (__int64)FFT_STEP * 10'000'000;
+	m_length = (size_t)MFllMulDiv( ticks, numerator, denominator, 0 );
+}
+
+// Read one media sample from the source reader, and append the PCM data to the intermediate buffer.
+// Before reading, the unconsumed samples are moved to the start of the buffer, and bufferReadOffset is reset to 0.
+// Returns S_OK after a sample was appended, E_EOF when the reader reports the end of the stream, or a failure code.
+HRESULT PcmReader::readNextSample()
+{
+	const size_t consumed = bufferReadOffset;
+	const size_t remaining = pcm.mono.size() - consumed;
+
+	// Get rid of the consumed portion of the buffer
+	if( 0 == remaining )
+		pcm.clear();
+	else if( 0 != consumed )
+		sampleHandler->moveBufferData( pcm, consumed );
+	bufferReadOffset = 0;
+
+	for( ;; )
+	{
+		CComPtr<IMFSample> sample;
+		DWORD flags = 0;
+
+		// Read the next sample
+		HRESULT hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &flags, nullptr, &sample );
+		if( FAILED( hr ) )
+		{
+			logErrorHr( hr, u8"IMFSourceReader.ReadSample" );
+			return hr;
+		}
+
+		if( 0 != ( flags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED ) )
+		{
+			// This happens for some video files at the very start of the reading, with Dolby AC3 audio track.
+			// Instead of failing the transcribe process, verify the important attributes (FP32 samples, sample rate, count of channels) haven’t changed.
+			CHECK( validateCurrentMediaType( reader, sampleHandler->readerChannelsCount() ) );
+		}
+
+		if( 0 != ( flags & MF_SOURCE_READERF_ENDOFSTREAM ) )
+			return E_EOF;
+
+		// The reader has succeeded without delivering a sample, try again
+		if( !sample )
+			continue;
+
+		// Merge the buffers of the sample into a single one
+		CComPtr<IMFMediaBuffer> mediaBuffer;
+		hr = sample->ConvertToContiguousBuffer( &mediaBuffer );
+		if( FAILED( hr ) )
+			return hr;
+
+		// Get a pointer to the audio data
+		const float* source = nullptr;
+		DWORD cbSource;
+		hr = mediaBuffer->Lock( (BYTE**)&source, nullptr, &cbSource );
+		if( FAILED( hr ) )
+			return hr;
+
+		try
+		{
+			assert( 0 == ( cbSource % sizeof( float ) ) );
+			sampleHandler->appendPcm( pcm, source, cbSource / sizeof( float ) );
+		}
+		catch( const std::bad_alloc& )
+		{
+			// The buffer was locked, unlock before reporting the failure
+			mediaBuffer->Unlock();
+			return E_OUTOFMEMORY;
+		}
+
+		hr = mediaBuffer->Unlock();
+		if( FAILED( hr ) )
+			return hr;
+
+		return S_OK;
+	}
+}
+
+// Deliver the next chunk of FFT_STEP samples.
+// mono receives the single-channel samples; the stereo pointer is passed to the sample handler, which only uses it in the stereo case.
+// The last chunk of the stream may be incomplete, it is padded with zeros.
+// Returns S_OK when a chunk was delivered, E_EOF when the stream has ended and the buffer is empty, or a failure code from the reader.
+HRESULT PcmReader::readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo )
+{
+	while( true )
+	{
+		const size_t off = bufferReadOffset;
+		const size_t availableSamples = pcm.mono.size() - off;
+		if( availableSamples >= FFT_STEP )
+		{
+			// We have enough data in the buffer
+			sampleHandler->copyChunk( &mono, pcm, off, stereo );
+			bufferReadOffset = off + FFT_STEP;
+			return S_OK;
+		}
+
+		if( !m_readerEndOfFile )
+		{
+			// We don't have enough data, but the stream has not ended yet, can load moar samples from the reader
+			HRESULT hr = readNextSample();
+			if( SUCCEEDED( hr ) )
+				continue;
+			if( hr != E_EOF )
+				return hr;
+			m_readerEndOfFile = true;
+			// readNextSample() has moved the unconsumed samples to the start of the buffer and reset bufferReadOffset before it found the end of the stream.
+			// The local offset is stale at this point; restart the iteration to reload it, otherwise the final chunk is copied from a wrong location.
+			continue;
+		}
+
+		if( availableSamples > 0 )
+		{
+			// We have reached the end of stream of the reader, but the buffer still has a few samples.
+			// Return the final incomplete chunk padded with zeros
+			sampleHandler->copyChunk( &mono, pcm, off, availableSamples, stereo );
+			bufferReadOffset = off + availableSamples;
+			return S_OK;
+		}
+
+		return E_EOF;
+	}
+}
+\ No newline at end of file
diff --git a/Whisper/MF/PcmReader.h b/Whisper/MF/PcmReader.h
new file mode 100644
index 0000000..9e3757e
--- /dev/null
+++ b/Whisper/MF/PcmReader.h
@@ -0,0 +1,63 @@
+#pragma once
+#include "../Whisper/audioConstants.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include "AudioBuffer.h"
+
+namespace Whisper
+{
+	// PCM buffer with FFT_STEP samples of single-channel audio; that is 10 milliseconds, assuming the values in audioConstants.h
+	struct PcmMonoChunk
+	{
+		std::array<float, FFT_STEP> mono;
+	};
+	// PCM buffer with FFT_STEP samples of interleaved stereo, 2 floats per sample; that is 10 milliseconds, assuming the values in audioConstants.h
+	struct PcmStereoChunk
+	{
+		std::array<float, FFT_STEP * 2> stereo;
+	};
+
+	__interface iSampleHandler;
+
+	constexpr HRESULT E_EOF = HRESULT_FROM_WIN32( ERROR_HANDLE_EOF );
+
+	// Utility class which reads chunks of FFT_STEP FP32 PCM samples from the MF source reader.
+	// The class always delivers mono chunks, and can optionally deliver stereo in a separate buffer.
+	class PcmReader
+	{
+		// A small intermediate buffer with PCM data for complete media foundation samples
+		AudioBuffer pcm;
+		// Index of the first unconsumed sample in the pcm buffer
+		size_t bufferReadOffset = 0;
+		// Utility object to abstract away mono versus stereo shenanigans; the constructor sets the pointer
+		const iSampleHandler* sampleHandler;
+		// The underlying MF source reader which delivers audio data
+		CComPtr<IMFSourceReader> reader;
+		// True after we consumed all available media samples from the reader
+		bool m_readerEndOfFile = false;
+		// True if this object delivers stereo samples
+		bool m_stereoOutput = false;
+		// The count of chunks we expect to get from the reader, computed from the duration of the stream
+		size_t m_length = 0;
+		// Read next sample from the reader, store in the PCM buffer in this class. Returns E_EOF at the end of the stream.
+		HRESULT readNextSample();
+
+	public:
+
+		PcmReader( IMFSourceReader* source, bool stereo );
+
+		// Count of chunks in the MEL spectrogram.
+		// The PCM audio is generally slightly longer than that, due to the incomplete last chunk.
+		size_t getLength() const noexcept
+		{
+			return m_length;
+		}
+
+		// True when the stereo flag passed to constructor, and the audio stream actually has 2 or more audio channels
+		bool outputsStereo() const { return m_stereoOutput; }
+
+		// Load another chunk of FFT_STEP samples from the stream.
+		// For the last chunk in the stream, the output buffers are padded with zeros. Returns E_EOF when no samples are left.
+		HRESULT readChunk( PcmMonoChunk& mono, PcmStereoChunk* stereo );
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/MF/loadAudioFile.cpp b/Whisper/MF/loadAudioFile.cpp
new file mode 100644
index 0000000..d1a439a
--- /dev/null
+++ b/Whisper/MF/loadAudioFile.cpp
@@ -0,0 +1,151 @@
+#include "stdafx.h"
+#include "../ComLightLib/comLightServer.h"
+#include "loadAudioFile.h"
+#include "mfUtils.h"
+#include "AudioBuffer.h"
+#include <mfidl.h>
+#include <mfreadwrite.h>
+#include <mfapi.h>
+#pragma comment(lib, "Mfreadwrite.lib")
+#pragma comment(lib, "mfuuid.lib")
+
+namespace Whisper
+{
+	// Implementation of iAudioBuffer which keeps the complete decoded audio of a media file in memory
+	class MediaFileBuffer : public ComLight::ObjectRoot<iAudioBuffer>
+	{
+		AudioBuffer pcm;
+		// Count of channels delivered by this buffer, set by load()
+		uint32_t channels = 0;
+
+		// Length of the audio, in samples
+		uint32_t COMLIGHTCALL countSamples() const noexcept override final
+		{
+			const size_t length = pcm.mono.size();
+			return (uint32_t)length;
+		}
+
+		// Single-channel samples, nullptr when the buffer is empty
+		const float* COMLIGHTCALL getPcmMono() const noexcept override final
+		{
+			return pcm.mono.empty() ? nullptr : pcm.mono.data();
+		}
+
+		// Interleaved stereo samples, nullptr when the buffer has no stereo data
+		const float* COMLIGHTCALL getPcmStereo() const noexcept override final
+		{
+			return pcm.stereo.empty() ? nullptr : pcm.stereo.data();
+		}
+
+		// This implementation always reports 0
+		HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const noexcept override final
+		{
+			rdi = 0;
+			return S_OK;
+		}
+
+	public:
+		// Decode the complete audio stream of the file into this buffer
+		HRESULT load( LPCTSTR path, bool stereo );
+	};
+
+	// Decode the first audio stream of the file into the PCM buffer of this object.
+	// Mono sources are stored as is. Sources with more channels are decoded into stereo, and either kept along with the mono downmix, or only downmixed, depending on the stereo argument.
+	// Returns E_INVALIDARG when the file has no samples, E_UNEXPECTED when the media type has changed while reading, E_OUTOFMEMORY, or an error code from Media Foundation.
+	HRESULT MediaFileBuffer::load( LPCTSTR path, bool stereo )
+	{
+		CComPtr<IMFSourceReader> reader;
+		HRESULT hr = MFCreateSourceReaderFromURL( path, nullptr, &reader );
+		if( FAILED( hr ) )
+		{
+			logErrorHr( hr, u8"MFCreateSourceReaderFromURL failed" );
+			return hr;
+		}
+
+		// Only the first audio stream is decoded
+		CHECK( reader->SetStreamSelection( MF_SOURCE_READER_ALL_STREAMS, FALSE ) );
+		CHECK( reader->SetStreamSelection( MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE ) );
+
+		CComPtr<IMFMediaType> mtNative;
+		CHECK( reader->GetNativeMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, MF_SOURCE_READER_CURRENT_TYPE_INDEX, &mtNative ) );
+		UINT32 numChannels;
+		CHECK( mtNative->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &numChannels ) );
+		const bool sourceMono = numChannels == 1;
+		// The method of the buffer which consumes the decoded samples
+		const AudioBuffer::pfnAppendSamples pfn = AudioBuffer::appendSamplesFunc( sourceMono, stereo );
+		channels = ( stereo && !sourceMono ) ? 2 : 1;
+
+		// Ask the reader to produce FP32 PCM, mono for mono sources and stereo for everything else
+		CComPtr<IMFMediaType> mt;
+		CHECK( createMediaType( !sourceMono, &mt ) );
+
+		CHECK( reader->SetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, nullptr, mt ) );
+
+		while( true )
+		{
+			DWORD dwFlags = 0;
+			CComPtr<IMFSample> sample;
+
+			// Read the next sample.
+			hr = reader->ReadSample( (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, nullptr, &dwFlags, nullptr, &sample );
+			if( FAILED( hr ) )
+			{
+				logErrorHr( hr, u8"IMFSourceReader.ReadSample" );
+				return hr;
+			}
+
+			if( dwFlags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED )
+			{
+				logError( u8"Media type changes ain’t supported by the library." );
+				return E_UNEXPECTED;
+			}
+
+			if( dwFlags & MF_SOURCE_READERF_ENDOFSTREAM )
+				break;
+
+			if( !sample )
+			{
+				// The reader has succeeded without delivering a sample, try again
+				continue;
+			}
+
+			// Get a pointer to the audio data in the sample.
+			CComPtr<IMFMediaBuffer> buffer;
+			hr = sample->ConvertToContiguousBuffer( &buffer );
+			if( FAILED( hr ) )
+				return hr;
+
+			const float* pAudioData = nullptr;
+			DWORD cbBuffer;
+			hr = buffer->Lock( (BYTE**)&pAudioData, nullptr, &cbBuffer );
+			if( FAILED( hr ) )
+				return hr;
+
+			try
+			{
+				const size_t countFloats = cbBuffer / sizeof( float );
+				( pcm.*pfn )( pAudioData, countFloats );
+			}
+			catch( const std::bad_alloc& )
+			{
+				// The buffer was locked above, every successful Lock needs a matching Unlock
+				buffer->Unlock();
+				return E_OUTOFMEMORY;
+			}
+
+			// Unlock the buffer
+			hr = buffer->Unlock();
+			if( FAILED( hr ) )
+				return hr;
+		}
+
+		const size_t len = pcm.mono.size();
+		if( len == 0 )
+		{
+			logError16( L"The audio file \"%s\" has no samples", path );
+			return E_INVALIDARG;
+		}
+		// A very short file is reported in the log, yet the load succeeds
+		if( len < SAMPLE_RATE / 2 )
+			logError16( L"The file \"%s\" only has %zu samples, less than 0.5 seconds of audio", path, len );
+		else
+			logDebug16( L"Loaded audio file from \"%s\": %zu samples, %g seconds", path, len, (int)len * ( 1.0 / SAMPLE_RATE ) );
+		return S_OK;
+
+	}
+}
+
+// Decode the complete audio file into a new buffer object, and return the object in pp.
+// Returns E_POINTER when either path or pp is null.
+HRESULT COMLIGHTCALL Whisper::loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp )
+{
+	const bool validPointers = ( nullptr != path ) && ( nullptr != pp );
+	if( !validPointers )
+		return E_POINTER;
+
+	using BufferObject = ComLight::Object<MediaFileBuffer>;
+	ComLight::CComPtr<BufferObject> result;
+	CHECK( BufferObject::create( result ) );
+	CHECK( result->load( path, stereo ) );
+
+	result.detach( pp );
+	return S_OK;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/loadAudioFile.h b/Whisper/MF/loadAudioFile.h
new file mode 100644
index 0000000..9736ccd
--- /dev/null
+++ b/Whisper/MF/loadAudioFile.h
@@ -0,0 +1,7 @@
+#pragma once
+#include "../API/iMediaFoundation.cl.h"
+
+namespace Whisper
+{
+	HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp );
+}
+\ No newline at end of file
diff --git a/Whisper/MF/mfStartup.cpp b/Whisper/MF/mfStartup.cpp
new file mode 100644
index 0000000..b7ab829
--- /dev/null
+++ b/Whisper/MF/mfStartup.cpp
@@ -0,0 +1,128 @@
+#include "stdafx.h"
+#include "mfStartup.h"
+#include <atlbase.h>
+#include <mfapi.h>
+#pragma comment(lib, "Mfplat.lib")
+
+namespace
+{
+	struct sCoInitStatus	// State of the COM initialization; a thread_local instance is kept for every thread
+	{
+		// Possible states:
+		// -1 is the initial state, coInitialize never called
+		// S_OK - CoInitializeEx succeeded, in this state the counter tracks the count of coInitialize() for the current thread
+		// S_FALSE - CoInitializeEx failed with RPC_E_CHANGED_MODE, or did nothing because already initialized for the current thread
+		// Error status - CoInitializeEx failed for some other reason
+		HRESULT code = -1;
+		uint32_t counter = 0;	// Only meaningful while the code is S_OK
+	};
+	thread_local sCoInitStatus coInitStatus;
+
+	// Initialize COM for the current thread, unless the thread is already set up.
+	// Returns S_OK when this call has taken a reference, which the caller must release with coUninitialize().
+	// Returns S_FALSE when COM was initialized by other code with a different threading model, there is nothing to release in that case.
+	// Returns an error code when CoInitializeEx has failed; the failure is remembered for the thread, and reported again by the next calls.
+	static HRESULT coInitialize()
+	{
+		sCoInitStatus& cis = coInitStatus;
+		HRESULT hr = cis.code;
+		if( S_OK == hr )
+		{
+			// Nested call on a thread we have initialized.
+			// Return S_OK so the caller remembers to release the reference; with S_FALSE the counter would never come back to 0.
+			cis.counter++;
+			return S_OK;
+		}
+		if( SUCCEEDED( hr ) )
+			return S_FALSE;
+
+		if( hr == HRESULT( -1 ) )
+		{
+			hr = CoInitializeEx( nullptr, COINIT_MULTITHREADED );
+			if( S_OK == hr || S_FALSE == hr )
+			{
+				// S_FALSE means COM was already initialized for this thread.
+				// That call has still incremented the internal counter of COM, and needs to be balanced with CoUninitialize just like S_OK.
+				cis.counter = 1;
+				return cis.code = S_OK;
+			}
+			if( RPC_E_CHANGED_MODE == hr )
+			{
+				// Other code has initialized COM with a different threading model; COM is usable, and we don't own a reference
+				return cis.code = S_FALSE;
+			}
+			cis.code = hr;
+			return hr;
+		}
+		
+		return hr;
+	}
+
+	// Release the reference taken by coInitialize() which has returned S_OK.
+	// The last release for the thread calls CoUninitialize, and resets the state to the initial one.
+	static void coUninitialize()
+	{
+		sCoInitStatus& cis = coInitStatus;
+		if( cis.code != S_OK )
+			return;
+
+		assert( cis.counter > 0 );
+		cis.counter--;
+		if( 0 != cis.counter )
+			return;
+
+		CoUninitialize();
+		// Back to the initial state; otherwise the next coInitialize() on this thread would count a reference without calling CoInitializeEx
+		cis.code = HRESULT( -1 );
+	}
+
+	static CComAutoCriticalSection s_lock;	// Guards mfStartupCounter
+#define LOCK() CComCritSecLock<CComAutoCriticalSection> lock{ s_lock }
+	static uint32_t mfStartupCounter = 0;	// Count of MfStartupRaii objects which have started up Media Foundation
+
+	constexpr uint8_t FlagCOM = 1;	// Set in successFlags when coInitialize() has returned S_OK for the object
+	constexpr uint8_t FlagMF = 0x10;	// Set in successFlags when the object is counted in mfStartupCounter
+}
+
+using namespace Whisper;
+
+// Undo the startup: release the reference to Media Foundation, shutting it down when this was the last user, then release the COM reference taken by this object.
+MfStartupRaii::~MfStartupRaii()
+{
+	if( 0 != ( successFlags & FlagMF ) )
+	{
+		LOCK();
+		assert( mfStartupCounter > 0 );
+		mfStartupCounter--;
+		// No early return while other users remain: the COM reference below belongs to this object, and must be released regardless
+		if( 0 == mfStartupCounter )
+			MFShutdown();
+		successFlags &= ~FlagMF;
+	}
+	
+	if( 0 != ( successFlags & FlagCOM ) )
+	{
+		coUninitialize();
+		successFlags &= ~FlagCOM;
+	}
+}
+
+// Initialize COM for the current thread, and start up Media Foundation unless another object has already done that.
+// Returns S_OK when this call has started Media Foundation, S_FALSE when it was already running, or an error code.
+// Fails with ERROR_ALREADY_INITIALIZED when called again on an object which has already started up.
+HRESULT MfStartupRaii::startup()
+{
+	const bool alreadyStarted = 0 != ( successFlags & FlagMF );
+	if( alreadyStarted )
+		return HRESULT_FROM_WIN32( ERROR_ALREADY_INITIALIZED );
+
+	// S_OK means this object now owns a COM reference, to be released later
+	const HRESULT hrCom = coInitialize();
+	CHECK( hrCom );
+	if( S_OK == hrCom )
+		successFlags |= FlagCOM;
+
+	LOCK();
+
+	if( 0 != mfStartupCounter )
+	{
+		// Media Foundation is already running, only count the new user
+		mfStartupCounter++;
+		successFlags |= FlagMF;
+		return S_FALSE;
+	}
+
+	const HRESULT hrMf = MFStartup( MF_VERSION, MFSTARTUP_LITE );
+	if( SUCCEEDED( hrMf ) )
+	{
+		mfStartupCounter = 1;
+		successFlags |= FlagMF;
+		return S_OK;
+	}
+
+	// The startup has failed, release the COM reference taken above
+	if( 0 != ( successFlags & FlagCOM ) )
+	{
+		coUninitialize();
+		successFlags &= ~FlagCOM;
+	}
+	return hrMf;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/mfStartup.h b/Whisper/MF/mfStartup.h
new file mode 100644
index 0000000..1434ffc
--- /dev/null
+++ b/Whisper/MF/mfStartup.h
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace Whisper
+{
+	// Scoped startup of COM and Media Foundation: call startup() once, the destructor releases whatever that call acquired.
+	class MfStartupRaii
+	{
+		// Bitmask of the things startup() acquired, which the destructor needs to release
+		uint8_t successFlags = 0;
+	public:
+		MfStartupRaii() = default;
+		~MfStartupRaii();
+		// Copying would duplicate successFlags and make two destructors release the same references,
+		// that's why both copy construction and copy assignment are disabled
+		MfStartupRaii( const MfStartupRaii& ) = delete;
+		MfStartupRaii& operator=( const MfStartupRaii& ) = delete;
+
+		// Initialize COM and Media Foundation for this instance.
+		// Returns S_OK or S_FALSE on success, and a failure code otherwise, including when this instance already holds Media Foundation.
+		HRESULT startup();
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/MF/mfUtils.cpp b/Whisper/MF/mfUtils.cpp
new file mode 100644
index 0000000..e739079
--- /dev/null
+++ b/Whisper/MF/mfUtils.cpp
@@ -0,0 +1,69 @@
+#include "stdafx.h"
+#include "mfUtils.h"
+#include <mfapi.h>
+
+// Creates a media type describing 32-bit floating point PCM audio at SAMPLE_RATE, with either 2 or 1 channels.
+// On success stores the new object in *pp, the caller owns the reference. Returns E_POINTER when pp is null.
+HRESULT Whisper::createMediaType( bool stereo, IMFMediaType** pp )
+{
+	if( nullptr == pp )
+		return E_POINTER;
+
+	// Every sample is one float per channel
+	constexpr uint32_t bytesPerSample = 4;
+	constexpr uint32_t bitsPerSample = 32;
+	const uint32_t channelCount = stereo ? 2 : 1;
+	const uint32_t blockAlign = channelCount * bytesPerSample;
+
+	CComPtr<IMFMediaType> mediaType;
+	CHECK( MFCreateMediaType( &mediaType ) );
+
+	// Format of the samples
+	CHECK( mediaType->SetGUID( MF_MT_MAJOR_TYPE, MFMediaType_Audio ) );
+	CHECK( mediaType->SetGUID( MF_MT_SUBTYPE, MFAudioFormat_Float ) );
+	CHECK( mediaType->SetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, SAMPLE_RATE ) );
+
+	// Attributes which depend on the count of channels
+	CHECK( mediaType->SetUINT32( MF_MT_AUDIO_NUM_CHANNELS, channelCount ) );
+	CHECK( mediaType->SetUINT32( MF_MT_AUDIO_BLOCK_ALIGNMENT, blockAlign ) );
+	CHECK( mediaType->SetUINT32( MF_MT_AUDIO_AVG_BYTES_PER_SECOND, blockAlign * SAMPLE_RATE ) );
+	CHECK( mediaType->SetUINT32( MF_MT_AUDIO_BITS_PER_SAMPLE, bitsPerSample ) );
+	CHECK( mediaType->SetUINT32( MF_MT_ALL_SAMPLES_INDEPENDENT, TRUE ) );
+
+	// Transfer the ownership to the caller
+	*pp = mediaType.Detach();
+	return S_OK;
+}
+
+// Reads the MF_PD_DURATION attribute of the media source behind the reader, and stores it in `duration`.
+// According to the documentation of that attribute, the value is expressed in 100-nanosecond units.
+// Returns E_POINTER when the reader is null, E_INVALIDARG when the attribute has an unexpected type,
+// or the failure code of IMFSourceReader::GetPresentationAttribute. `duration` is only written on success.
+HRESULT Whisper::getStreamDuration( IMFSourceReader* reader, int64_t& duration )
+{
+	if( nullptr == reader )
+		return E_POINTER;
+
+	PROPVARIANT var;
+	PropVariantInit( &var );
+	CHECK( reader->GetPresentationAttribute( MF_SOURCE_READER_MEDIASOURCE, MF_PD_DURATION, &var ) );
+
+	if( var.vt == VT_UI8 )
+	{
+		// The documentation says the type of that attribute is UINT64
+		// https://learn.microsoft.com/en-us/windows/win32/medfound/mf-pd-duration-attribute
+		// VT_UI8 values don't own any memory, no need to clear the variant on this path
+		duration = static_cast<int64_t>( var.uhVal.QuadPart );
+		return S_OK;
+	}
+
+	// A variant of an unexpected type might own a string, a blob or an interface pointer; release it before failing
+	PropVariantClear( &var );
+	logError( u8"Unexpected type of MF_PD_DURATION attribute" );
+	return E_INVALIDARG;
+}
+
+// Verifies the current media type of the first audio stream of the reader:
+// it must be audio, with float samples, at SAMPLE_RATE, and with exactly `expectedChannels` channels.
+// Returns S_OK when everything matches, E_FAIL on any mismatch; failed Media Foundation calls go through the CHECK macro.
+HRESULT Whisper::validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels )
+{
+	CComPtr<IMFMediaType> current;
+	CHECK( reader->GetCurrentMediaType( MF_SOURCE_READER_FIRST_AUDIO_STREAM, &current ) );
+
+	// Major type and subtype
+	GUID majorType;
+	CHECK( current->GetGUID( MF_MT_MAJOR_TYPE, &majorType ) );
+	if( majorType != MFMediaType_Audio )
+		return E_FAIL;
+
+	GUID subType;
+	CHECK( current->GetGUID( MF_MT_SUBTYPE, &subType ) );
+	if( subType != MFAudioFormat_Float )
+		return E_FAIL;
+
+	// Sample rate and count of channels
+	UINT32 sampleRate;
+	CHECK( current->GetUINT32( MF_MT_AUDIO_SAMPLES_PER_SECOND, &sampleRate ) );
+	if( sampleRate != SAMPLE_RATE )
+		return E_FAIL;
+
+	UINT32 channels;
+	CHECK( current->GetUINT32( MF_MT_AUDIO_NUM_CHANNELS, &channels ) );
+	if( channels != expectedChannels )
+		return E_FAIL;
+
+	return S_OK;
+}
+\ No newline at end of file
diff --git a/Whisper/MF/mfUtils.h b/Whisper/MF/mfUtils.h
new file mode 100644
index 0000000..c889a92
--- /dev/null
+++ b/Whisper/MF/mfUtils.h
@@ -0,0 +1,15 @@
+#pragma once
+#include <stdint.h>
+#include <mfidl.h>
+#include <mfobjects.h>
+#include <mfreadwrite.h>
+#include "../Whisper/audioConstants.h"
+
+namespace Whisper
+{
+	// Create a media type for 32-bit float PCM audio at SAMPLE_RATE, with 2 channels when `stereo` is true and 1 channel otherwise.
+	// The new object is returned in *pp; fails with E_POINTER when pp is null.
+	HRESULT createMediaType( bool stereo, IMFMediaType** pp );
+
+	// Query the MF_PD_DURATION attribute of the reader's media source, and store it in `duration`.
+	// Fails with E_INVALIDARG when the attribute is not of type VT_UI8.
+	HRESULT getStreamDuration( IMFSourceReader* reader, int64_t& duration );
+
+	// Check that the current media type of the reader's first audio stream is float audio at SAMPLE_RATE with `expectedChannels` channels.
+	// Returns S_OK when it matches, E_FAIL when it doesn't.
+	HRESULT validateCurrentMediaType( IMFSourceReader* reader, uint32_t expectedChannels );
+}
+\ No newline at end of file
author	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
committer	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
commit	8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree	714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/MF
parent	990a8d0dbaefc996244097397259e92758b15cce (diff)