Source codes

author: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
committer: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
commit: 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree: 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/API
parent: 990a8d0dbaefc996244097397259e92758b15cce (diff)
16 files changed, 670 insertions, 0 deletions
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
new file mode 100644
index 0000000..cd27659
--- /dev/null
+++ b/Whisper/API/MfStructs.h
@@ -0,0 +1,51 @@
+#pragma once
+
+namespace Whisper
+{
+	struct sCaptureDevice // Describes one audio capture device; an array of these is handed to pfnFoundCaptureDevices
+	{
+		// The display name is suitable for showing to the user, but might not be unique.
+		const wchar_t* displayName;
+
+		// Endpoint ID for an audio capture device.
+		// It uniquely identifies the device on the system, but is not a readable string.
+		const wchar_t* endpoint; // Presumably the string expected by iMediaFoundation::openCaptureDevice — confirm
+	};
+
+	using pfnFoundCaptureDevices = HRESULT( __stdcall* )( int len, const sCaptureDevice* buffer, void* pv ); // Callback type taken by iMediaFoundation::listCaptureDevices; presumably `buffer` holds `len` entries and `pv` is the caller's context pointer — confirm
+
+	// Flags for the audio capture; presumably combined into sCaptureParams::flags — confirm
+	enum struct eCaptureFlags : uint32_t
+	{
+		// When the capture device supports stereo, keep stereo PCM samples in addition to mono
+		Stereo = 1,
+	};
+
+	// Parameters for audio capture; passed to iMediaFoundation::openCaptureDevice and returned by iAudioCapture::getParams
+	struct sCaptureParams
+	{
+		float minDuration = 2.0f; // NOTE(review): the unit of these four values is not stated here, presumably seconds — confirm
+		float maxDuration = 3.0f;
+		float dropStartSilence = 0.25f;
+		float pauseDuration = 0.333f;
+		// Flags for the audio capture; presumably a combination of eCaptureFlags values kept as a raw uint32_t — confirm
+		uint32_t flags = 0;
+	};
+
+	enum struct eCaptureStatus : uint8_t // Status value delivered to the pfnCaptureStatus callback
+	{
+		Listening = 1, // Each value is a distinct bit, so presumably several can be reported together — confirm
+		Voice = 2,
+		Transcribing = 4,
+		Stalled = 0x80,
+	};
+
+	using pfnShouldCancel = HRESULT( __stdcall* )( void* pv ) noexcept; // Cancellation callback; the meaning of the returned HRESULT is not visible here — presumably S_OK / S_FALSE, confirm
+	using pfnCaptureStatus = HRESULT( __stdcall* )( void* pv, eCaptureStatus status ) noexcept; // Callback receiving an eCaptureStatus value
+	struct sCaptureCallbacks // Callback set passed to iContext::runCapture
+	{
+		pfnShouldCancel shouldCancel;
+		pfnCaptureStatus captureStatus;
+		void* pv; // Presumably forwarded as the `pv` argument of both callbacks — confirm
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/Readme.txt b/Whisper/API/Readme.txt
new file mode 100644
index 0000000..7d40494
--- /dev/null
+++ b/Whisper/API/Readme.txt
@@ -0,0 +1,15 @@
+The headers in this folder define the complete public API of Whisper.dll.
+
+To consume the library in your C++ software, include exactly one of the following headers.
+
+1. If you’re building a Windows app, include the whisperWindows.h header, and you'll get the traditional Win32 COM projection of the API.
+
+2. If you’re porting to another OS or to a different C++ compiler, or you are already using the ComLight support library, include the whisperComLight.h header.
+If you do that, in addition to this "Whisper/API" folder you will also need the "ComLightLib" dependency.
+This will get you the ComLight flavor of these COM interfaces.
+
+Internally, the actual implementation uses the ComLight flavor of the interfaces, but that’s fine because the two flavors are binary compatible.
+
+The reason for having two flavors: Visual Studio’s CComPtr<T> and other related utilities expect interface IDs to be specified with the __declspec(uuid) directive.
+
+That language extension is specific to Visual C++ and is not supported by the GCC or Clang compilers.
+\ No newline at end of file
diff --git a/Whisper/API/SpecialTokens.h b/Whisper/API/SpecialTokens.h
new file mode 100644
index 0000000..67fd020
--- /dev/null
+++ b/Whisper/API/SpecialTokens.h
@@ -0,0 +1,25 @@
+#pragma once
+
+namespace Whisper
+{
+	struct SpecialTokens // Ids of the model's special tokens; filled in by iModel::getSpecialTokens
+	{
+		// The end of a transcription, token_eot
+		int TranscriptionEnd;
+		// Start of a transcription, token_sot
+		int TranscriptionStart;
+		// Described as representing the previous word in the transcription, used to help the model predict the current word from the preceding context. NOTE(review): upstream whisper.cpp appears to use token_prev as the marker that introduces previous-context tokens — confirm.
+		int PreviousWord;   // token_prev
+		// Described as the start of a sentence. NOTE(review): the upstream name token_solm suggests "start of LM" rather than a sentence boundary — confirm.
+		int SentenceStart;   // token_solm
+		// Described as the word "not" in the transcription. NOTE(review): upstream whisper.cpp annotates token_not as "no timestamps" — confirm and rename the description if so.
+		int Not;    // token_not
+		// Described as "new transcription". NOTE(review): upstream token_beg appears to be the first timestamp token — confirm.
+		int TranscriptionBegin;    // token_beg
+
+		// token_translate; presumably selects the translation task — confirm
+		int TaskTranslate;
+		// token_transcribe; presumably selects the transcription task — confirm
+		int TaskTranscribe;
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/TranscribeStructs.h b/Whisper/API/TranscribeStructs.h
new file mode 100644
index 0000000..ac28357
--- /dev/null
+++ b/Whisper/API/TranscribeStructs.h
@@ -0,0 +1,127 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+namespace Whisper
+{
+	enum struct eModelImplementation : uint32_t // Implementation selector passed to loadModel()
+	{
+		GPU = 1,
+		Hybrid = 2,
+		Reference = 3,
+	};
+
+	struct sTimeSpanFields // A tick count broken into days / hours / minutes / seconds / sub-second ticks
+	{
+		uint32_t days;
+		uint8_t hours, minutes, seconds;
+		uint32_t ticks; // Remainder below one second, i.e. in the range [ 0 .. 9'999'999 ]
+
+		sTimeSpanFields( uint64_t tt ) // Non-explicit, so a uint64_t tick count converts implicitly; 10'000'000 ticks are treated as one second
+		{
+			ticks = (uint32_t)( tt % 10'000'000 );
+			tt /= 10'000'000; // tt is now whole seconds
+			seconds = (uint8_t)( tt % 60 );
+			tt /= 60; // whole minutes
+			minutes = (uint8_t)( tt % 60 );
+			tt /= 60; // whole hours
+			hours = (uint8_t)( tt % 24 );
+			tt /= 24; // whole days
+			days = (uint32_t)tt;
+		}
+	};
+
+	struct sTimeSpan // A time span stored as a single unsigned tick count
+	{
+		uint64_t ticks;
+
+		operator sTimeSpanFields() const // Implicit conversion which splits the tick count into separate fields
+		{
+			return sTimeSpanFields{ ticks };
+		}
+		void operator=( uint64_t tt ) // Returns void, so these assignments cannot be chained
+		{
+			ticks = tt;
+		}
+		void operator=( int64_t tt ) // Signed overload; a negative value is only caught by the assert, i.e. when NDEBUG is not defined
+		{
+			assert( tt >= 0 );
+			ticks = (uint64_t)tt;
+		}
+	};
+
+	// Start and end times of the segment or token, each an sTimeSpan expressed in 100-nanosecond ticks
+	struct sTimeInterval
+	{
+		sTimeSpan begin, end;
+	};
+
+	// Segment data; iTranscribeResult::getSegments returns a pointer to these
+	struct sSegment
+	{
+		// Segment text, null-terminated, and probably UTF-8 encoded
+		const char* text;
+		// Start and end times of the segment
+		sTimeInterval time;
+		uint32_t firstToken, countTokens; // Presumably a range of indices into the array from iTranscribeResult::getTokens — confirm
+	};
+
+	enum eTokenFlags : uint32_t // Unscoped enum, so None and Special are visible directly in namespace Whisper
+	{
+		None = 0,
+		Special = 1,
+	};
+	inline bool operator &( eTokenFlags a, eTokenFlags b ) // Returns true when the two values share at least one set bit; note the result is bool, not eTokenFlags
+	{
+		return 0 != ( (uint32_t)a & (uint32_t)b );
+	}
+
+	// Token data; iTranscribeResult::getTokens returns a pointer to these
+	struct sToken
+	{
+		// Token text, null-terminated, and probably UTF-8 encoded
+		const char* text;
+		// Start and end times of the token
+		sTimeInterval time;
+		// Probability of the token
+		float probability;
+		// Probability of the timestamp token
+		float probabilityTimestamp;
+		// Sum of probabilities of all timestamp tokens
+		float ptsum;
+		// Voice length of the token
+		float vlen;
+		// Token id; presumably the same id space as iModel::stringFromToken — confirm
+		int id;
+		eTokenFlags flags; // Test individual bits with the operator & defined for eTokenFlags
+	};
+
+	struct sTranscribeLength // Output structure of iTranscribeResult::getSize
+	{
+		uint32_t countSegments, countTokens; // Presumably the lengths of the arrays from getSegments() and getTokens() — confirm
+	};
+
+	enum struct eResultFlags : uint32_t // Bit flags for iContext::getResults; combine them with the operator | defined for this enum
+	{
+		None = 0,
+		// Return individual tokens in addition to the segments
+		Tokens = 1,
+		// Return timestamps
+		Timestamps = 2,
+
+		// Create a new COM object for the results.
+		// Without this flag, the context returns a pointer to the COM object stored in the context.
+		// The content of that object is replaced every time you call iContext.getResults method
+		NewObject = 0x100,
+	};
+
+	inline eResultFlags operator |( eResultFlags a, eResultFlags b ) // Bitwise union of two flag sets
+	{
+		return (eResultFlags)( (uint32_t)a | (uint32_t)b );
+	}
+
+	inline bool operator &( eResultFlags a, eResultFlags b ) // Returns true when the two values share at least one set bit; the result is bool, not eResultFlags
+	{
+		return 0 != ( (uint32_t)a & (uint32_t)b );
+	}
+}
+\ No newline at end of file
diff --git a/Whisper/API/iContext.cl.h b/Whisper/API/iContext.cl.h
new file mode 100644
index 0000000..97d34c7
--- /dev/null
+++ b/Whisper/API/iContext.cl.h
@@ -0,0 +1,66 @@
+#pragma once
+#include "../../ComLightLib/comLightCommon.h"
+#include "iTranscribeResult.cl.h"
+#include "SpecialTokens.h"
+#include "loggerApi.h"
+#include "sLanguageList.h"
+#include "sLoadModelCallbacks.h"
+
+namespace Whisper
+{
+	struct iModel; // Forward declarations: the interfaces below only use these types by pointer or reference
+	struct iAudioBuffer;
+	struct iAudioReader;
+	struct iAudioCapture;
+	struct sCaptureCallbacks;
+	struct sFullParams; // Defined in sFullParams.h, which is included at the bottom of this header
+	enum struct eModelImplementation : uint32_t;
+	enum struct eSamplingStrategy : int;
+	using whisper_token = int; // Token id type used by iModel::stringFromToken and sFullParams::prompt_tokens
+	struct sProgressSink;
+
+	struct DECLSPEC_NOVTABLE iContext : public ComLight::IUnknown // ComLight flavor of the context interface
+	{
+		DEFINE_INTERFACE_ID( "{b9956374-3b18-4943-90f2-2ab18a404537}" ); // Same GUID as the __declspec( uuid ) of iContext in iContext.h
+
+		// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+		// Uses the specified decoding strategy to obtain the text.
+		virtual HRESULT COMLIGHTCALL runFull( const sFullParams& params, const iAudioBuffer* buffer ) = 0;
+		virtual HRESULT COMLIGHTCALL runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader ) = 0; // Variant taking an iAudioReader and a progress sink; exact semantics are not documented here
+		virtual HRESULT COMLIGHTCALL runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader ) = 0; // Variant taking an iAudioCapture and capture callbacks; exact semantics are not documented here
+
+		virtual HRESULT COMLIGHTCALL getResults( eResultFlags flags, iTranscribeResult** pp ) const = 0; // See eResultFlags::NewObject for which object is returned in *pp
+
+		virtual HRESULT COMLIGHTCALL getModel( iModel** pp ) = 0;
+
+		virtual HRESULT COMLIGHTCALL fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi ) = 0; // Presumably writes default parameters for the strategy into *rdi — confirm
+
+		// Performance information
+		virtual HRESULT COMLIGHTCALL timingsPrint() = 0;
+		virtual HRESULT COMLIGHTCALL timingsReset() = 0;
+	};
+
+	struct DECLSPEC_NOVTABLE iModel : public ComLight::IUnknown // ComLight flavor of the model interface; instances are returned by loadModel()
+	{
+		DEFINE_INTERFACE_ID( "{abefb4c9-e8d8-46a3-8747-5afbadef1adb}" ); // Same GUID as the __declspec( uuid ) of iModel in iContext.h
+
+		virtual HRESULT COMLIGHTCALL createContext( iContext** pp ) = 0;
+
+		virtual HRESULT COMLIGHTCALL isMultilingual() = 0; // The boolean answer is carried in the HRESULT — presumably S_OK / S_FALSE, confirm
+
+		virtual HRESULT COMLIGHTCALL getSpecialTokens( SpecialTokens& rdi ) = 0; // rdi is the output structure
+
+		// Token Id -> String
+		virtual const char* COMLIGHTCALL stringFromToken( whisper_token token ) = 0; // NOTE(review): lifetime of the returned string and the result for an unknown id are not documented — confirm
+	};
+
+	HRESULT COMLIGHTCALL setupLogger( const sLoggerSetup& setup ); // Configures the logging sink described by sLoggerSetup
+	HRESULT COMLIGHTCALL loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp ); // callbacks is passed by pointer, so presumably it may be nullptr — confirm
+
+	uint32_t COMLIGHTCALL findLanguageKeyW( const wchar_t* lang ); // Wide-string variant; the return value for an unknown language is not documented here
+	uint32_t COMLIGHTCALL findLanguageKeyA( const char* lang ); // Narrow-string variant of the above
+
+	HRESULT COMLIGHTCALL getSupportedLanguages( sLanguageList& rdi ); // rdi is the output structure
+}
+
+#include "sFullParams.h"
+\ No newline at end of file
diff --git a/Whisper/API/iContext.h b/Whisper/API/iContext.h
new file mode 100644
index 0000000..9661093
--- /dev/null
+++ b/Whisper/API/iContext.h
@@ -0,0 +1,61 @@
+#pragma once
+#include "iTranscribeResult.h"
+#include "SpecialTokens.h"
+#include "loggerApi.h"
+#include "sLanguageList.h"
+#include "sLoadModelCallbacks.h"
+
+namespace Whisper
+{
+	__interface iModel; // Forward declarations: the interfaces below only use these types by pointer or reference
+	__interface iAudioBuffer;
+	__interface iAudioReader;
+	__interface iAudioCapture;
+	struct sCaptureCallbacks;
+	struct sFullParams; // Defined in sFullParams.h, which is included at the bottom of this header
+	enum struct eModelImplementation : uint32_t;
+	enum struct eSamplingStrategy : int;
+	using whisper_token = int; // Token id type used by iModel::stringFromToken and sFullParams::prompt_tokens
+	struct sProgressSink;
+
+	__interface __declspec( novtable, uuid( "b9956374-3b18-4943-90f2-2ab18a404537" ) ) iContext : public IUnknown // Win32 COM flavor; same GUID and method order as iContext in iContext.cl.h
+	{
+		// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+		// Uses the specified decoding strategy to obtain the text.
+		HRESULT __stdcall runFull( const sFullParams& params, const iAudioBuffer* buffer );
+		HRESULT __stdcall runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader ); // Variant taking an iAudioReader and a progress sink; exact semantics are not documented here
+		HRESULT __stdcall runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader ); // Variant taking an iAudioCapture and capture callbacks; exact semantics are not documented here
+
+		HRESULT __stdcall getResults( eResultFlags flags, iTranscribeResult** pp ) const; // See eResultFlags::NewObject for which object is returned in *pp
+
+		HRESULT __stdcall getModel( iModel** pp );
+
+		HRESULT __stdcall fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi ); // Presumably writes default parameters for the strategy into *rdi — confirm
+
+		// Performance information
+		HRESULT __stdcall timingsPrint();
+		HRESULT __stdcall timingsReset();
+	};
+
+	__interface __declspec( novtable, uuid( "abefb4c9-e8d8-46a3-8747-5afbadef1adb" ) ) iModel : public IUnknown // Win32 COM flavor; same GUID and method order as iModel in iContext.cl.h
+	{
+		HRESULT __stdcall createContext( iContext** pp );
+
+		HRESULT __stdcall isMultilingual(); // The boolean answer is carried in the HRESULT — presumably S_OK / S_FALSE, confirm
+
+		HRESULT __stdcall getSpecialTokens( SpecialTokens& rdi ); // rdi is the output structure
+
+		// Token Id -> String
+		const char* __stdcall stringFromToken( whisper_token token ); // NOTE(review): lifetime of the returned string and the result for an unknown id are not documented — confirm
+	};
+
+	HRESULT __stdcall setupLogger( const sLoggerSetup& setup ); // Configures the logging sink described by sLoggerSetup
+	HRESULT __stdcall loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp ); // callbacks is passed by pointer, so presumably it may be nullptr — confirm
+
+	uint32_t __stdcall findLanguageKeyW( const wchar_t* lang ); // Wide-string variant; the return value for an unknown language is not documented here
+	uint32_t __stdcall findLanguageKeyA( const char* lang ); // Narrow-string variant of the above
+
+	HRESULT __stdcall getSupportedLanguages( sLanguageList& rdi ); // rdi is the output structure
+}
+
+#include "sFullParams.h"
+\ No newline at end of file
diff --git a/Whisper/API/iMediaFoundation.cl.h b/Whisper/API/iMediaFoundation.cl.h
new file mode 100644
index 0000000..516b67f
--- /dev/null
+++ b/Whisper/API/iMediaFoundation.cl.h
@@ -0,0 +1,48 @@
+#pragma once
+#include "../../ComLightLib/comLightCommon.h"
+#include "MfStructs.h"
+
+struct IMFSourceReader;
+
+namespace Whisper
+{
+	struct DECLSPEC_NOVTABLE iAudioBuffer : public ComLight::IUnknown // ComLight flavor; read-only access to a buffer of PCM samples
+	{
+		DEFINE_INTERFACE_ID( "{013583aa-c9eb-42bc-83db-633c2c317051}" ); // Same GUID as iAudioBuffer in iMediaFoundation.h
+
+		virtual uint32_t COMLIGHTCALL countSamples() const = 0;
+		virtual const float* COMLIGHTCALL getPcmMono() const = 0;
+		virtual const float* COMLIGHTCALL getPcmStereo() const = 0; // Presumably nullptr when stereo samples were not kept — confirm
+		virtual HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const = 0; // rdi is the output value; its unit is not stated here, presumably 100-nanosecond ticks — confirm
+	};
+
+	struct DECLSPEC_NOVTABLE iAudioReader : public ComLight::IUnknown // ComLight flavor; returned by iMediaFoundation::openAudioFile and consumed by iContext::runStreamed
+	{
+		DEFINE_INTERFACE_ID( "{35b988da-04a6-476a-a193-d8891d5dc390}" ); // Same GUID as iAudioReader in iMediaFoundation.h
+
+		virtual HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const = 0; // rdi is the output value; its unit is not stated here — confirm
+		virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0; // Exposes the underlying Media Foundation source reader
+		virtual HRESULT COMLIGHTCALL requestedStereo() const = 0; // The boolean answer is carried in the HRESULT — presumably S_OK / S_FALSE, confirm
+	};
+
+	struct DECLSPEC_NOVTABLE iAudioCapture : public ComLight::IUnknown // ComLight flavor; returned by iMediaFoundation::openCaptureDevice and consumed by iContext::runCapture
+	{
+		DEFINE_INTERFACE_ID( "{747752c2-d9fd-40df-8847-583c781bf013}" ); // Same GUID as iAudioCapture in iMediaFoundation.h
+
+		virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0; // Exposes the underlying Media Foundation source reader
+		virtual const sCaptureParams& COMLIGHTCALL getParams() const = 0; // Presumably the parameters given to openCaptureDevice — confirm
+	};
+
+	struct DECLSPEC_NOVTABLE iMediaFoundation : public ComLight::IUnknown // ComLight flavor; factory for audio buffers, readers and capture devices, created by initMediaFoundation()
+	{
+		DEFINE_INTERFACE_ID( "{fb9763a5-d77d-4b6e-aff8-f494813cebd8}" ); // Same GUID as iMediaFoundation in iMediaFoundation.h
+
+		virtual HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const = 0; // Produces an iAudioBuffer, the input type of iContext::runFull
+		virtual HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) = 0; // Produces an iAudioReader, the input type of iContext::runStreamed; unlike loadAudioFile this method is not const
+
+		virtual HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) = 0; // Presumably pv is handed back to pfn unchanged — confirm
+		virtual HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) = 0; // Presumably endpoint is an sCaptureDevice::endpoint string — confirm
+	};
+
+	HRESULT COMLIGHTCALL initMediaFoundation( iMediaFoundation** pp ); // Entry point which returns the iMediaFoundation interface in *pp
+}
+\ No newline at end of file
diff --git a/Whisper/API/iMediaFoundation.h b/Whisper/API/iMediaFoundation.h
new file mode 100644
index 0000000..93dc287
--- /dev/null
+++ b/Whisper/API/iMediaFoundation.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <stdint.h>
+#include "MfStructs.h"
+struct IMFSourceReader;
+
+namespace Whisper
+{
+	__interface __declspec( novtable, uuid( "013583aa-c9eb-42bc-83db-633c2c317051" ) ) iAudioBuffer : public IUnknown // Win32 COM flavor; read-only access to a buffer of PCM samples
+	{
+		uint32_t __stdcall countSamples() const;
+		const float* __stdcall getPcmMono() const;
+		const float* __stdcall getPcmStereo() const; // Presumably nullptr when stereo samples were not kept — confirm
+		HRESULT __stdcall getTime( int64_t& rdi ) const; // rdi is the output value; its unit is not stated here, presumably 100-nanosecond ticks — confirm
+	};
+
+	__interface __declspec( novtable, uuid( "35b988da-04a6-476a-a193-d8891d5dc390" ) ) iAudioReader : public IUnknown // Win32 COM flavor; returned by iMediaFoundation::openAudioFile and consumed by iContext::runStreamed
+	{
+		HRESULT __stdcall getDuration( int64_t& rdi ) const; // rdi is the output value; its unit is not stated here — confirm
+		HRESULT __stdcall getReader( IMFSourceReader** pp ) const; // Exposes the underlying Media Foundation source reader
+		HRESULT __stdcall requestedStereo() const; // The boolean answer is carried in the HRESULT — presumably S_OK / S_FALSE, confirm
+	};
+
+	__interface __declspec( novtable, uuid( "747752c2-d9fd-40df-8847-583c781bf013" ) ) iAudioCapture : public IUnknown // Win32 COM flavor; returned by iMediaFoundation::openCaptureDevice and consumed by iContext::runCapture
+	{
+		HRESULT __stdcall getReader( IMFSourceReader** pp ) const; // Exposes the underlying Media Foundation source reader
+		const sCaptureParams& __stdcall getParams() const; // Presumably the parameters given to openCaptureDevice — confirm
+	};
+
+	__interface __declspec( novtable, uuid( "fb9763a5-d77d-4b6e-aff8-f494813cebd8" ) ) iMediaFoundation : public IUnknown // Win32 COM flavor; factory for audio buffers, readers and capture devices, created by initMediaFoundation()
+	{
+		HRESULT __stdcall loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const; // Produces an iAudioBuffer, the input type of iContext::runFull
+		HRESULT __stdcall openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ); // Produces an iAudioReader, the input type of iContext::runStreamed; unlike loadAudioFile this method is not const
+
+		HRESULT __stdcall listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ); // Presumably pv is handed back to pfn unchanged — confirm
+		HRESULT __stdcall openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ); // Presumably endpoint is an sCaptureDevice::endpoint string — confirm
+	};
+
+	HRESULT __stdcall initMediaFoundation( iMediaFoundation** pp ); // Entry point which returns the iMediaFoundation interface in *pp
+}
+\ No newline at end of file
diff --git a/Whisper/API/iTranscribeResult.cl.h b/Whisper/API/iTranscribeResult.cl.h
new file mode 100644
index 0000000..ab65178
--- /dev/null
+++ b/Whisper/API/iTranscribeResult.cl.h
@@ -0,0 +1,15 @@
+#pragma once
+#include "TranscribeStructs.h"
+#include "../../ComLightLib/comLightCommon.h"
+
+namespace Whisper
+{
+	struct DECLSPEC_NOVTABLE iTranscribeResult : public ComLight::IUnknown // ComLight flavor of the results interface; marked novtable like every other interface of this API
+	{
+		DEFINE_INTERFACE_ID( "{2871a73f-5ce3-48f8-8779-6582ee11935e}" ); // Same GUID as iTranscribeResult in iTranscribeResult.h
+
+		virtual HRESULT COMLIGHTCALL getSize( sTranscribeLength& rdi ) const = 0; // rdi receives the segment and token counts
+		virtual const sSegment* COMLIGHTCALL getSegments() const = 0; // Presumably an array of countSegments elements — confirm
+		virtual const sToken* COMLIGHTCALL getTokens() const = 0; // Presumably an array of countTokens elements — confirm
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/iTranscribeResult.h b/Whisper/API/iTranscribeResult.h
new file mode 100644
index 0000000..27e0c0d
--- /dev/null
+++ b/Whisper/API/iTranscribeResult.h
@@ -0,0 +1,12 @@
+#pragma once
+#include "TranscribeStructs.h"
+
+namespace Whisper
+{
+	__interface __declspec( novtable, uuid( "2871a73f-5ce3-48f8-8779-6582ee11935e" ) ) iTranscribeResult : public IUnknown // Win32 COM flavor of the results interface, obtained from iContext::getResults
+	{
+		HRESULT __stdcall getSize( sTranscribeLength& rdi ) const; // rdi receives the segment and token counts
+		const sSegment* __stdcall getSegments() const; // Presumably an array of countSegments elements — confirm
+		const sToken* __stdcall getTokens() const; // Presumably an array of countTokens elements — confirm
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/loggerApi.h b/Whisper/API/loggerApi.h
new file mode 100644
index 0000000..6af1c4e
--- /dev/null
+++ b/Whisper/API/loggerApi.h
@@ -0,0 +1,35 @@
+#pragma once
+#include <stdint.h>
+
+namespace Whisper
+{
+	// Log level for messages; a lower numeric value means a more severe message
+	enum struct eLogLevel : uint8_t
+	{
+		Error = 0,
+		Warning = 1,
+		Info = 2,
+		Debug = 3
+	};
+	enum struct eLoggerFlags : uint8_t // Bit flags stored in sLoggerSetup::flags
+	{
+		UseStandardError = 1, // Presumably also writes messages to stderr — confirm
+		SkipFormatMessage = 2, // NOTE(review): meaning is not visible in this header — confirm against the logger implementation
+	};
+
+	// C function pointer to receive log messages from the library. The messages are encoded in UTF-8.
+	using pfnLoggerSink = void( __stdcall* )( void* context, eLogLevel lvl, const char* message ); // `context` is presumably the sLoggerSetup::context value — confirm
+
+	// A sink to receive log messages produced by Whisper.dll; passed to setupLogger()
+	struct sLoggerSetup
+	{
+		// C function pointer to receive log messages from the library
+		pfnLoggerSink sink = nullptr;
+		// Optional context parameter for the sink function; when consuming from C# you don't need that, pass IntPtr.Zero, delegates can capture things.
+		void* context = nullptr;
+		// Maximum log level to produce. Defaults to Error, the zero value, so that a default-initialized structure never holds an indeterminate level
+		eLogLevel level = eLogLevel::Error;
+		// Flags about the logger
+		eLoggerFlags flags = (eLoggerFlags)0;
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/sFullParams.h b/Whisper/API/sFullParams.h
new file mode 100644
index 0000000..0a1d352
--- /dev/null
+++ b/Whisper/API/sFullParams.h
@@ -0,0 +1,136 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+namespace Whisper
+{
+	// Available sampling strategies; selects between the greedy and beam_search sub-structures of sFullParams
+	enum struct eSamplingStrategy : int
+	{
+		// Always select the most probable token
+		Greedy,
+		// TODO: not implemented yet!
+		BeamSearch,
+	};
+
+	using pfnNewSegment = HRESULT( __cdecl* )( iContext* ctx, uint32_t n_new, void* user_data ) noexcept; // Stored in sFullParams::new_segment_callback. Note __cdecl here, while the other callbacks of this API use __stdcall. iContext is not declared by this header; it relies on being included from iContext.h / iContext.cl.h
+	using pfnEncoderBegin = HRESULT( __cdecl* )( iContext* ctx, void* user_data ) noexcept; // Stored in sFullParams::encoder_begin_callback; also __cdecl
+
+	enum struct eFullParamsFlags : uint32_t // Bit flags stored in sFullParams::flags; use sFullParams::flag / setFlag / resetFlag to access them
+	{
+		Translate = 1,
+		NoContext = 2,
+		SingleSegment = 4,
+		PrintSpecial = 8,
+		PrintProgress = 0x10,
+		PrintRealtime = 0x20,
+		PrintTimestamps = 0x40,
+
+		// Experimental
+		TokenTimestamps = 0x100,
+		SpeedupAudio = 0x200,
+	};
+
+	inline eFullParamsFlags operator | ( eFullParamsFlags a, eFullParamsFlags b ) // Bitwise union of two flag sets
+	{
+		return (eFullParamsFlags)( (uint32_t)a | (uint32_t)b );
+	}
+	inline void operator |= ( eFullParamsFlags& a, eFullParamsFlags b ) // Sets the bits of b in a; returns void, so the expression cannot be chained
+	{
+		a = a | b;
+	}
+
+	struct sFullParams // Parameters for iContext::runFull / runStreamed / runCapture; no member has a default initializer, see iContext::fullDefaultParams
+	{
+		eSamplingStrategy strategy;
+		// Count of CPU threads
+		int cpuThreads;
+		int n_max_text_ctx;
+		int offset_ms;          // start offset in ms
+		int duration_ms;        // audio duration to process in ms
+		eFullParamsFlags flags;
+		uint32_t language; // Presumably a key as produced by makeLanguageKey() or findLanguageKeyA/W — confirm
+
+		// [EXPERIMENTAL] token-level timestamps
+		float thold_pt;         // timestamp token probability threshold (~0.01)
+		float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
+		int   max_len;          // max segment length in characters
+		int   max_tokens;       // max tokens per segment (0 = no limit)
+
+		struct
+		{
+			int n_past;
+		} greedy; // Presumably only used with eSamplingStrategy::Greedy — confirm
+
+		struct
+		{
+			int n_past;
+			int beam_width;
+			int n_best;
+		} beam_search; // Presumably only used with eSamplingStrategy::BeamSearch — confirm
+
+		// [EXPERIMENTAL] speed-up techniques
+		int  audio_ctx;         // overwrite the audio context size (0 = use default)
+
+		// tokens to provide the whisper model as initial prompt
+		// these are prepended to any existing text context from a previous call
+		const whisper_token* prompt_tokens;
+		int prompt_n_tokens;
+
+		pfnNewSegment new_segment_callback;
+		void* new_segment_callback_user_data; // Presumably passed as the user_data argument of new_segment_callback — confirm
+
+		pfnEncoderBegin encoder_begin_callback;
+		void* encoder_begin_callback_user_data; // Presumably passed as the user_data argument of encoder_begin_callback — confirm
+
+		// Couple utility methods, they workaround the lack of bit fields in C++
+		inline bool flag( eFullParamsFlags f ) const // True when at least one bit of f is set in flags
+		{
+			return 0 != ( (uint32_t)flags & (uint32_t)f );
+		}
+		inline void resetFlag( eFullParamsFlags bit ) // Clears the bits of `bit` in flags
+		{
+			uint32_t f = (uint32_t)flags;
+			f &= ~(uint32_t)bit;
+			flags = (eFullParamsFlags)f;
+		}
+		inline void setFlag( eFullParamsFlags bit, bool set = true ) // Sets the bits of `bit` when `set` is true, clears them otherwise
+		{
+			uint32_t f = (uint32_t)flags;
+			if( set )
+				f |= (uint32_t)bit;
+			else
+				f &= ~(uint32_t)bit;
+			flags = (eFullParamsFlags)f;
+		}
+	};
+
+	struct sSegmentTime // Signed begin / end pair; not referenced by any other declaration visible in these API headers
+	{
+		int64_t begin, end; // NOTE(review): unit is not stated here — confirm
+	};
+
+	inline uint32_t makeLanguageKey( const char* code ) // Packs up to 4 characters of `code` into a uint32_t, with the first character in the lowest byte
+	{
+		assert( strlen( code ) <= 4 ); // NOTE(review): strlen is declared in <string.h>, which this header does not include itself
+		uint32_t res = 0;
+		uint32_t shift = 0;
+		for( size_t i = 0; i < 4; i++, code++, shift += 8 )
+		{
+			const char c = *code;
+			if( c == '\0' )
+				return res; // Codes shorter than 4 characters leave the upper bytes zero
+			uint32_t u32 = (uint8_t)c;
+			u32 = u32 << shift;
+			res |= u32;
+		}
+		return res; // Reached after 4 characters; without the assert, anything past the 4th character is ignored
+	}
+
+	using pfnReportProgress = HRESULT( __stdcall* )( double val, iContext* ctx, void* pv ) noexcept; // Progress callback; the range of `val` is not stated here, presumably 0..1 — confirm
+	struct sProgressSink // Progress callback plus its context pointer; passed to iContext::runStreamed
+	{
+		pfnReportProgress pfn;
+		void* pv; // Presumably forwarded as the `pv` argument of pfn — confirm
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/sLanguageList.h b/Whisper/API/sLanguageList.h
new file mode 100644
index 0000000..49ca596
--- /dev/null
+++ b/Whisper/API/sLanguageList.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <stdint.h>
+
+namespace Whisper
+{
+	struct sLanguageEntry // One element of the array described by sLanguageList
+	{
+		uint32_t key; // Presumably the packed code in the format of makeLanguageKey() — confirm
+		int id;
+		const char* name;
+	};
+
+	struct sLanguageList // Output structure of getSupportedLanguages()
+	{
+		uint32_t length; // Presumably the count of entries at `pointer` — confirm
+		const sLanguageEntry* pointer;
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/sLoadModelCallbacks.h b/Whisper/API/sLoadModelCallbacks.h
new file mode 100644
index 0000000..f5248c6
--- /dev/null
+++ b/Whisper/API/sLoadModelCallbacks.h
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace Whisper
+{
+	using pfnLoadProgress = HRESULT( __stdcall* )( double val, void* pv ) noexcept; // Progress callback; the range of `val` is not stated here, presumably 0..1 — confirm
+	using pfnCancel = HRESULT( __stdcall* )( void* pv ) noexcept; // Cancellation callback; the meaning of the returned HRESULT is not visible here — confirm
+	
+	struct sLoadModelCallbacks // Optional-looking callback set passed by pointer to loadModel()
+	{
+		pfnLoadProgress progress;
+		pfnCancel cancel;
+		void* pv; // Presumably forwarded as the `pv` argument of both callbacks — confirm
+	};
+}
+\ No newline at end of file
diff --git a/Whisper/API/whisperComLight.h b/Whisper/API/whisperComLight.h
new file mode 100644
index 0000000..c7f0b93
--- /dev/null
+++ b/Whisper/API/whisperComLight.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "iMediaFoundation.cl.h"
+#include "iContext.cl.h"
+#include "iTranscribeResult.cl.h"
+\ No newline at end of file
diff --git a/Whisper/API/whisperWindows.h b/Whisper/API/whisperWindows.h
new file mode 100644
index 0000000..925e307
--- /dev/null
+++ b/Whisper/API/whisperWindows.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "iMediaFoundation.h"
+#include "iContext.h"
+#include "iTranscribeResult.h"
+\ No newline at end of file
author	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
committer	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
commit	8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree	714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/API
parent	990a8d0dbaefc996244097397259e92758b15cce (diff)