summaryrefslogtreecommitdiffstats
path: root/Whisper/API
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-16 14:52:43 +0100
committerKonstantin <const@const.me>2023-01-16 14:52:43 +0100
commit8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/API
parent990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Whisper/API')
-rw-r--r--Whisper/API/MfStructs.h51
-rw-r--r--Whisper/API/Readme.txt15
-rw-r--r--Whisper/API/SpecialTokens.h25
-rw-r--r--Whisper/API/TranscribeStructs.h127
-rw-r--r--Whisper/API/iContext.cl.h66
-rw-r--r--Whisper/API/iContext.h61
-rw-r--r--Whisper/API/iMediaFoundation.cl.h48
-rw-r--r--Whisper/API/iMediaFoundation.h39
-rw-r--r--Whisper/API/iTranscribeResult.cl.h15
-rw-r--r--Whisper/API/iTranscribeResult.h12
-rw-r--r--Whisper/API/loggerApi.h35
-rw-r--r--Whisper/API/sFullParams.h136
-rw-r--r--Whisper/API/sLanguageList.h18
-rw-r--r--Whisper/API/sLoadModelCallbacks.h14
-rw-r--r--Whisper/API/whisperComLight.h4
-rw-r--r--Whisper/API/whisperWindows.h4
16 files changed, 670 insertions, 0 deletions
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h
new file mode 100644
index 0000000..cd27659
--- /dev/null
+++ b/Whisper/API/MfStructs.h
@@ -0,0 +1,51 @@
+#pragma once
+
+namespace Whisper
+{
+ struct sCaptureDevice
+ {
+ // The display name is suitable for showing to the user, but might not be unique.
+ const wchar_t* displayName;
+
+ // Endpoint ID for an audio capture device
+ // It uniquely identifies the device on the system, but is not a readable string.
+ const wchar_t* endpoint;
+ };
+
+ using pfnFoundCaptureDevices = HRESULT( __stdcall* )( int len, const sCaptureDevice* buffer, void* pv );
+
+ // Flags for the audio capture
+ enum struct eCaptureFlags : uint32_t
+ {
+ // When the capture device supports stereo, keep stereo PCM samples in addition to mono
+ Stereo = 1,
+ };
+
+ // Parameters for audio capture
+ struct sCaptureParams
+ {
+ float minDuration = 2.0f; // NOTE(review): the units of these four values are not stated; presumably seconds — confirm
+ float maxDuration = 3.0f;
+ float dropStartSilence = 0.25f;
+ float pauseDuration = 0.333f;
+ // Flags for the audio capture; presumably a combination of eCaptureFlags values such as eCaptureFlags::Stereo — confirm
+ uint32_t flags = 0;
+ };
+
+ enum struct eCaptureStatus : uint8_t // Passed to pfnCaptureStatus; every value is a distinct bit, so presumably they can be combined — confirm
+ {
+ Listening = 1,
+ Voice = 2,
+ Transcribing = 4,
+ Stalled = 0x80,
+ };
+
+ using pfnShouldCancel = HRESULT( __stdcall* )( void* pv ) noexcept;
+ using pfnCaptureStatus = HRESULT( __stdcall* )( void* pv, eCaptureStatus status ) noexcept;
+ struct sCaptureCallbacks
+ {
+ pfnShouldCancel shouldCancel;
+ pfnCaptureStatus captureStatus;
+ void* pv;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/Readme.txt b/Whisper/API/Readme.txt
new file mode 100644
index 0000000..7d40494
--- /dev/null
+++ b/Whisper/API/Readme.txt
@@ -0,0 +1,15 @@
+The headers in this folder define the complete public API of Whisper.dll.
+
+To consume the library in your C++ software, include exactly one of the following headers.
+
+1. If you’re building a Windows app, include the whisperWindows.h header, and you'll get the traditional Win32 COM projection of the API.
+
+2. If you’re porting to another OS or to a different C++ compiler, or are already using the ComLight support library, include the whisperComLight.h header.
+If you do that, in addition to this "Whisper/API" folder you will also need the "ComLightLib" dependency.
+This will get you the ComLight flavor of these COM interfaces.
+
+Internally, the actual implementation uses the ComLight flavor of the interfaces, but that’s fine because the two flavors are binary compatible.
+
+The reason for the difference between these flavors is that Visual Studio’s CComPtr<T> and other related utilities expect interface IDs to be specified with the __declspec(uuid) directive.
+
+That language extension is specific to Visual C++; neither GCC nor Clang supports it. \ No newline at end of file
diff --git a/Whisper/API/SpecialTokens.h b/Whisper/API/SpecialTokens.h
new file mode 100644
index 0000000..67fd020
--- /dev/null
+++ b/Whisper/API/SpecialTokens.h
@@ -0,0 +1,25 @@
+#pragma once
+
+namespace Whisper
+{
+ struct SpecialTokens
+ {
+ // The end of a transcription, token_eot
+ int TranscriptionEnd;
+ // Start of a transcription, token_sot
+ int TranscriptionStart;
+ // NOTE(review): originally described as "the previous word"; in upstream Whisper token_prev looks like the <|startofprev|> marker that introduces previous-context text — confirm.
+ int PreviousWord; // token_prev
+ // NOTE(review): originally described as "start of a sentence"; the upstream name token_solm suggests <|startoflm|> (start of language model) — confirm.
+ int SentenceStart; // token_solm
+ // NOTE(review): originally described as the word "not"; in upstream Whisper token_not looks like <|notimestamps|> (no timestamps) — confirm.
+ int Not; // token_not
+ // NOTE(review): originally described as "new transcription"; in upstream Whisper token_beg looks like the first timestamp token — confirm.
+ int TranscriptionBegin; // token_beg
+
+ // token_translate
+ int TaskTranslate;
+ // token_transcribe
+ int TaskTranscribe;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/TranscribeStructs.h b/Whisper/API/TranscribeStructs.h
new file mode 100644
index 0000000..ac28357
--- /dev/null
+++ b/Whisper/API/TranscribeStructs.h
@@ -0,0 +1,127 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+namespace Whisper
+{
+ enum struct eModelImplementation : uint32_t
+ {
+ GPU = 1,
+ Hybrid = 2,
+ Reference = 3,
+ };
+
+ struct sTimeSpanFields
+ {
+     uint32_t days;
+     uint8_t hours, minutes, seconds;
+     uint32_t ticks;
+
+     sTimeSpanFields( uint64_t tt ) // Splits a tick count (10'000'000 ticks per second) into calendar-like fields
+     {
+         constexpr uint64_t ticksPerSecond = 10'000'000;
+         const uint64_t totalSeconds = tt / ticksPerSecond;
+         const uint64_t totalMinutes = totalSeconds / 60;
+         const uint64_t totalHours = totalMinutes / 60;
+         ticks = (uint32_t)( tt % ticksPerSecond ); // sub-second remainder, still in ticks
+         seconds = (uint8_t)( totalSeconds % 60 );
+         minutes = (uint8_t)( totalMinutes % 60 );
+         hours = (uint8_t)( totalHours % 24 );
+         days = (uint32_t)( totalHours / 24 );
+     }
+ };
+
+ struct sTimeSpan
+ {
+     uint64_t ticks;
+     // Converts the tick count into days, hours, minutes, seconds and the remaining ticks
+     operator sTimeSpanFields() const
+     {
+         return sTimeSpanFields( ticks );
+     }
+     void operator=( uint64_t tt )
+     {
+         this->ticks = tt;
+     }
+     void operator=( int64_t tt )
+     {
+         assert( tt >= 0 ); // the unsigned field cannot hold a negative value
+         this->ticks = static_cast<uint64_t>( tt );
+     }
+ };
+
+ // Start and end times of the segment or token, expressed in 100-nanosecond ticks
+ struct sTimeInterval
+ {
+ sTimeSpan begin, end;
+ };
+
+ // Segment data
+ struct sSegment
+ {
+ // Segment text, null-terminated, and probably UTF-8 encoded
+ const char* text;
+ // Start and end times of the segment
+ sTimeInterval time;
+ uint32_t firstToken, countTokens; // NOTE(review): presumably the index of the first token and the token count within iTranscribeResult::getTokens() — confirm
+ };
+
+ enum eTokenFlags : uint32_t
+ {
+     None = 0,
+     Special = 1,
+ };
+ inline bool operator &( eTokenFlags a, eTokenFlags b )
+ {
+     return ( static_cast<uint32_t>( a ) & static_cast<uint32_t>( b ) ) != 0; // true when the two values share at least one bit
+ }
+
+ // Token data
+ struct sToken
+ {
+ // Token text, null-terminated, and probably UTF-8 encoded
+ const char* text;
+ // Start and end times of the token
+ sTimeInterval time;
+ // Probability of the token
+ float probability;
+ // Probability of the timestamp token
+ float probabilityTimestamp;
+ // Sum of probabilities of all timestamp tokens
+ float ptsum;
+ // Voice length of the token
+ float vlen;
+ // Token id
+ int id;
+ eTokenFlags flags;
+ };
+
+ struct sTranscribeLength
+ {
+ uint32_t countSegments, countTokens;
+ };
+
+ enum struct eResultFlags : uint32_t
+ {
+     None = 0,
+     // Include the individual tokens in the results, not only the segments
+     Tokens = 1,
+     // Include timestamps in the results
+     Timestamps = 2,
+
+     // Make a new COM object to hold the results.
+     // When this flag is absent, the context hands out a pointer to the COM object stored in the context,
+     // and the content of that object is replaced on every call to the iContext.getResults method
+     NewObject = 0x100,
+ };
+
+ inline eResultFlags operator |( eResultFlags a, eResultFlags b )
+ {
+     return static_cast<eResultFlags>( static_cast<uint32_t>( a ) | static_cast<uint32_t>( b ) );
+ }
+
+ inline bool operator &( eResultFlags a, eResultFlags b )
+ {
+     return ( static_cast<uint32_t>( a ) & static_cast<uint32_t>( b ) ) != 0;
+ }
+} \ No newline at end of file
diff --git a/Whisper/API/iContext.cl.h b/Whisper/API/iContext.cl.h
new file mode 100644
index 0000000..97d34c7
--- /dev/null
+++ b/Whisper/API/iContext.cl.h
@@ -0,0 +1,66 @@
+#pragma once
+#include "../../ComLightLib/comLightCommon.h"
+#include "iTranscribeResult.cl.h"
+#include "SpecialTokens.h"
+#include "loggerApi.h"
+#include "sLanguageList.h"
+#include "sLoadModelCallbacks.h"
+
+namespace Whisper
+{
+ struct iModel;
+ struct iAudioBuffer;
+ struct iAudioReader;
+ struct iAudioCapture;
+ struct sCaptureCallbacks;
+ struct sFullParams;
+ enum struct eModelImplementation : uint32_t;
+ enum struct eSamplingStrategy : int;
+ using whisper_token = int;
+ struct sProgressSink;
+
+ struct DECLSPEC_NOVTABLE iContext : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{b9956374-3b18-4943-90f2-2ab18a404537}" );
+
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+ // Uses the specified decoding strategy to obtain the text.
+ virtual HRESULT COMLIGHTCALL runFull( const sFullParams& params, const iAudioBuffer* buffer ) = 0;
+ virtual HRESULT COMLIGHTCALL runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader ) = 0;
+ virtual HRESULT COMLIGHTCALL runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader ) = 0;
+
+ virtual HRESULT COMLIGHTCALL getResults( eResultFlags flags, iTranscribeResult** pp ) const = 0;
+
+ virtual HRESULT COMLIGHTCALL getModel( iModel** pp ) = 0;
+
+ virtual HRESULT COMLIGHTCALL fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi ) = 0;
+
+ // Performance information
+ virtual HRESULT COMLIGHTCALL timingsPrint() = 0;
+ virtual HRESULT COMLIGHTCALL timingsReset() = 0;
+ };
+
+ struct DECLSPEC_NOVTABLE iModel : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{abefb4c9-e8d8-46a3-8747-5afbadef1adb}" );
+
+ virtual HRESULT COMLIGHTCALL createContext( iContext** pp ) = 0;
+
+ virtual HRESULT COMLIGHTCALL isMultilingual() = 0;
+
+ virtual HRESULT COMLIGHTCALL getSpecialTokens( SpecialTokens& rdi ) = 0;
+
+ // Token Id -> String
+ virtual const char* COMLIGHTCALL stringFromToken( whisper_token token ) = 0;
+ };
+
+ HRESULT COMLIGHTCALL setupLogger( const sLoggerSetup& setup );
+ HRESULT COMLIGHTCALL loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp );
+
+ uint32_t COMLIGHTCALL findLanguageKeyW( const wchar_t* lang );
+ uint32_t COMLIGHTCALL findLanguageKeyA( const char* lang );
+
+ HRESULT COMLIGHTCALL getSupportedLanguages( sLanguageList& rdi );
+}
+
+#include "sFullParams.h" \ No newline at end of file
diff --git a/Whisper/API/iContext.h b/Whisper/API/iContext.h
new file mode 100644
index 0000000..9661093
--- /dev/null
+++ b/Whisper/API/iContext.h
@@ -0,0 +1,61 @@
+#pragma once
+#include "iTranscribeResult.h"
+#include "SpecialTokens.h"
+#include "loggerApi.h"
+#include "sLanguageList.h"
+#include "sLoadModelCallbacks.h"
+
+namespace Whisper
+{
+ __interface iModel;
+ __interface iAudioBuffer;
+ __interface iAudioReader;
+ __interface iAudioCapture;
+ struct sCaptureCallbacks;
+ struct sFullParams;
+ enum struct eModelImplementation : uint32_t;
+ enum struct eSamplingStrategy : int;
+ using whisper_token = int;
+ struct sProgressSink;
+
+ __interface __declspec( novtable, uuid( "b9956374-3b18-4943-90f2-2ab18a404537" ) ) iContext : public IUnknown
+ {
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+ // Uses the specified decoding strategy to obtain the text.
+ HRESULT __stdcall runFull( const sFullParams& params, const iAudioBuffer* buffer );
+ HRESULT __stdcall runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader );
+ HRESULT __stdcall runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader );
+
+ HRESULT __stdcall getResults( eResultFlags flags, iTranscribeResult** pp ) const;
+
+ HRESULT __stdcall getModel( iModel** pp );
+
+ HRESULT __stdcall fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi );
+
+ // Performance information
+ HRESULT __stdcall timingsPrint();
+ HRESULT __stdcall timingsReset();
+ };
+
+ __interface __declspec( novtable, uuid( "abefb4c9-e8d8-46a3-8747-5afbadef1adb" ) ) iModel : public IUnknown
+ {
+ HRESULT __stdcall createContext( iContext** pp );
+
+ HRESULT __stdcall isMultilingual();
+
+ HRESULT __stdcall getSpecialTokens( SpecialTokens& rdi );
+
+ // Token Id -> String
+ const char* __stdcall stringFromToken( whisper_token token );
+ };
+
+ HRESULT __stdcall setupLogger( const sLoggerSetup& setup );
+ HRESULT __stdcall loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp );
+
+ uint32_t __stdcall findLanguageKeyW( const wchar_t* lang );
+ uint32_t __stdcall findLanguageKeyA( const char* lang );
+
+ HRESULT __stdcall getSupportedLanguages( sLanguageList& rdi );
+}
+
+#include "sFullParams.h" \ No newline at end of file
diff --git a/Whisper/API/iMediaFoundation.cl.h b/Whisper/API/iMediaFoundation.cl.h
new file mode 100644
index 0000000..516b67f
--- /dev/null
+++ b/Whisper/API/iMediaFoundation.cl.h
@@ -0,0 +1,48 @@
+#pragma once
+#include "../../ComLightLib/comLightCommon.h"
+#include "MfStructs.h"
+
+struct IMFSourceReader;
+
+namespace Whisper
+{
+ struct DECLSPEC_NOVTABLE iAudioBuffer : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{013583aa-c9eb-42bc-83db-633c2c317051}" );
+
+ virtual uint32_t COMLIGHTCALL countSamples() const = 0;
+ virtual const float* COMLIGHTCALL getPcmMono() const = 0;
+ virtual const float* COMLIGHTCALL getPcmStereo() const = 0;
+ virtual HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const = 0;
+ };
+
+ struct DECLSPEC_NOVTABLE iAudioReader : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{35b988da-04a6-476a-a193-d8891d5dc390}" );
+
+ virtual HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const = 0;
+ virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0;
+ virtual HRESULT COMLIGHTCALL requestedStereo() const = 0;
+ };
+
+ struct DECLSPEC_NOVTABLE iAudioCapture : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{747752c2-d9fd-40df-8847-583c781bf013}" );
+
+ virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0;
+ virtual const sCaptureParams& COMLIGHTCALL getParams() const = 0;
+ };
+
+ struct DECLSPEC_NOVTABLE iMediaFoundation : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{fb9763a5-d77d-4b6e-aff8-f494813cebd8}" );
+
+ virtual HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const = 0;
+ virtual HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) = 0;
+
+ virtual HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) = 0;
+ virtual HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) = 0;
+ };
+
+ HRESULT COMLIGHTCALL initMediaFoundation( iMediaFoundation** pp );
+} \ No newline at end of file
diff --git a/Whisper/API/iMediaFoundation.h b/Whisper/API/iMediaFoundation.h
new file mode 100644
index 0000000..93dc287
--- /dev/null
+++ b/Whisper/API/iMediaFoundation.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <stdint.h>
+#include "MfStructs.h"
+struct IMFSourceReader;
+
+namespace Whisper
+{
+ __interface __declspec( novtable, uuid( "013583aa-c9eb-42bc-83db-633c2c317051" ) ) iAudioBuffer : public IUnknown
+ {
+ uint32_t __stdcall countSamples() const;
+ const float* __stdcall getPcmMono() const;
+ const float* __stdcall getPcmStereo() const;
+ HRESULT __stdcall getTime( int64_t& rdi ) const;
+ };
+
+ __interface __declspec( novtable, uuid( "35b988da-04a6-476a-a193-d8891d5dc390" ) ) iAudioReader : public IUnknown
+ {
+ HRESULT __stdcall getDuration( int64_t& rdi ) const;
+ HRESULT __stdcall getReader( IMFSourceReader** pp ) const;
+ HRESULT __stdcall requestedStereo() const;
+ };
+
+ __interface __declspec( novtable, uuid( "747752c2-d9fd-40df-8847-583c781bf013" ) ) iAudioCapture : public IUnknown
+ {
+ HRESULT __stdcall getReader( IMFSourceReader** pp ) const;
+ const sCaptureParams& __stdcall getParams() const;
+ };
+
+ __interface __declspec( novtable, uuid( "fb9763a5-d77d-4b6e-aff8-f494813cebd8" ) ) iMediaFoundation : public IUnknown
+ {
+ HRESULT __stdcall loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const;
+ HRESULT __stdcall openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp );
+
+ HRESULT __stdcall listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv );
+ HRESULT __stdcall openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp );
+ };
+
+ HRESULT __stdcall initMediaFoundation( iMediaFoundation** pp );
+} \ No newline at end of file
diff --git a/Whisper/API/iTranscribeResult.cl.h b/Whisper/API/iTranscribeResult.cl.h
new file mode 100644
index 0000000..ab65178
--- /dev/null
+++ b/Whisper/API/iTranscribeResult.cl.h
@@ -0,0 +1,15 @@
+#pragma once
+#include "TranscribeStructs.h"
+#include "../../ComLightLib/comLightCommon.h"
+
+namespace Whisper
+{
+ struct iTranscribeResult : public ComLight::IUnknown
+ {
+ DEFINE_INTERFACE_ID( "{2871a73f-5ce3-48f8-8779-6582ee11935e}" );
+
+ virtual HRESULT COMLIGHTCALL getSize( sTranscribeLength& rdi ) const = 0;
+ virtual const sSegment* COMLIGHTCALL getSegments() const = 0;
+ virtual const sToken* COMLIGHTCALL getTokens() const = 0;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/iTranscribeResult.h b/Whisper/API/iTranscribeResult.h
new file mode 100644
index 0000000..27e0c0d
--- /dev/null
+++ b/Whisper/API/iTranscribeResult.h
@@ -0,0 +1,12 @@
+#pragma once
+#include "TranscribeStructs.h"
+
+namespace Whisper
+{
+ __interface __declspec( novtable, uuid( "2871a73f-5ce3-48f8-8779-6582ee11935e" ) ) iTranscribeResult : public IUnknown
+ {
+ HRESULT __stdcall getSize( sTranscribeLength& rdi ) const;
+ const sSegment* __stdcall getSegments() const;
+ const sToken* __stdcall getTokens() const;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/loggerApi.h b/Whisper/API/loggerApi.h
new file mode 100644
index 0000000..6af1c4e
--- /dev/null
+++ b/Whisper/API/loggerApi.h
@@ -0,0 +1,35 @@
+#pragma once
+#include <stdint.h>
+
+namespace Whisper
+{
+ // Log level for messages
+ enum struct eLogLevel : uint8_t
+ {
+ Error = 0,
+ Warning = 1,
+ Info = 2,
+ Debug = 3
+ };
+ enum struct eLoggerFlags : uint8_t
+ {
+ UseStandardError = 1,
+ SkipFormatMessage = 2,
+ };
+
+ // C function pointer to receive log messages from the library. The messages are encoded in UTF-8.
+ using pfnLoggerSink = void( __stdcall* )( void* context, eLogLevel lvl, const char* message );
+
+ // A sink to receive log messages produced by Whisper.dll
+ struct sLoggerSetup
+ {
+     // C function pointer to receive log messages from the library
+     pfnLoggerSink sink = nullptr;
+     // Optional context parameter for the sink function; when consuming from C# you don't need that, pass IntPtr.Zero, delegates can capture things.
+     void* context = nullptr;
+     // Maximum log level to produce. Defaults to the zero value, the same one value-initialization yields, so a default-constructed structure never holds an indeterminate level
+     eLogLevel level = eLogLevel::Error;
+     // Flags about the logger
+     eLoggerFlags flags = (eLoggerFlags)0;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/sFullParams.h b/Whisper/API/sFullParams.h
new file mode 100644
index 0000000..0a1d352
--- /dev/null
+++ b/Whisper/API/sFullParams.h
@@ -0,0 +1,136 @@
+#pragma once
+#include <stdint.h>
+#include <assert.h>
+
+namespace Whisper
+{
+ // Available sampling strategies
+ enum struct eSamplingStrategy : int
+ {
+ // Always select the most probable token
+ Greedy,
+ // TODO: not implemented yet!
+ BeamSearch,
+ };
+
+ using pfnNewSegment = HRESULT( __cdecl* )( iContext* ctx, uint32_t n_new, void* user_data ) noexcept; // NOTE: __cdecl, unlike the __stdcall callbacks declared elsewhere in this API
+ using pfnEncoderBegin = HRESULT( __cdecl* )( iContext* ctx, void* user_data ) noexcept; // NOTE: __cdecl as well
+
+ enum struct eFullParamsFlags : uint32_t
+ {
+     Translate = 0x1,
+     NoContext = 0x2,
+     SingleSegment = 0x4,
+     PrintSpecial = 0x8,
+     PrintProgress = 0x10,
+     PrintRealtime = 0x20,
+     PrintTimestamps = 0x40,
+
+     // Experimental features
+     TokenTimestamps = 0x100,
+     SpeedupAudio = 0x200,
+ };
+
+ inline eFullParamsFlags operator | ( eFullParamsFlags a, eFullParamsFlags b )
+ {
+     return static_cast<eFullParamsFlags>( static_cast<uint32_t>( a ) | static_cast<uint32_t>( b ) );
+ }
+ inline void operator |= ( eFullParamsFlags& a, eFullParamsFlags b )
+ {
+     a = static_cast<eFullParamsFlags>( static_cast<uint32_t>( a ) | static_cast<uint32_t>( b ) ); // in-place union of the two bit sets
+ }
+
+ struct sFullParams
+ {
+     eSamplingStrategy strategy;
+     // Number of CPU threads
+     int cpuThreads;
+     int n_max_text_ctx;
+     int offset_ms; // start offset, in milliseconds
+     int duration_ms; // duration of the audio to process, in milliseconds
+     eFullParamsFlags flags;
+     uint32_t language;
+
+     // [EXPERIMENTAL] token-level timestamps
+     float thold_pt; // timestamp token probability threshold (~0.01)
+     float thold_ptsum; // timestamp token sum probability threshold (~0.01)
+     int max_len; // max segment length in characters
+     int max_tokens; // max tokens per segment (0 = no limit)
+
+     struct
+     {
+         int n_past;
+     } greedy;
+
+     struct
+     {
+         int n_past;
+         int beam_width;
+         int n_best;
+     } beam_search;
+
+     // [EXPERIMENTAL] speed-up techniques
+     int audio_ctx; // overwrite the audio context size (0 = use default)
+
+     // Tokens handed to the whisper model as the initial prompt;
+     // they are prepended to any existing text context from a previous call
+     const whisper_token* prompt_tokens;
+     int prompt_n_tokens;
+
+     pfnNewSegment new_segment_callback;
+     void* new_segment_callback_user_data;
+
+     pfnEncoderBegin encoder_begin_callback;
+     void* encoder_begin_callback_user_data;
+
+     // Helper methods to query and modify individual bits of the flags field
+     inline bool flag( eFullParamsFlags f ) const
+     {
+         return ( static_cast<uint32_t>( flags ) & static_cast<uint32_t>( f ) ) != 0;
+     }
+     inline void resetFlag( eFullParamsFlags bit )
+     {
+         const uint32_t keepMask = ~static_cast<uint32_t>( bit );
+         const uint32_t remaining = static_cast<uint32_t>( flags ) & keepMask;
+         flags = static_cast<eFullParamsFlags>( remaining );
+     }
+     inline void setFlag( eFullParamsFlags bit, bool set = true )
+     {
+         const uint32_t mask = static_cast<uint32_t>( bit );
+         const uint32_t current = static_cast<uint32_t>( flags );
+         // Either switch the requested bits on,
+         // or clear them while leaving every other bit intact
+         const uint32_t updated = set ? ( current | mask ) : ( current & ~mask );
+         flags = static_cast<eFullParamsFlags>( updated );
+     }
+ };
+
+ struct sSegmentTime
+ {
+ int64_t begin, end;
+ };
+
+ // Packs up to 4 characters of a language code into a 32-bit key, first character in the lowest byte; e.g. "en" becomes 0x6E65
+ inline uint32_t makeLanguageKey( const char* code )
+ {
+     uint32_t key = 0;
+     for( uint32_t shift = 0; shift < 32; shift += 8, code++ )
+     {
+         const uint8_t c = (uint8_t)*code;
+         if( 0 == c )
+             return key;
+         key |= (uint32_t)c << shift;
+     }
+     // Four characters were consumed, anything longer does not fit into the key.
+     // Checked here instead of calling strlen(), because this header does not include <string.h>
+     assert( '\0' == *code );
+     return key;
+ }
+
+ using pfnReportProgress = HRESULT( __stdcall* )( double val, iContext* ctx, void* pv ) noexcept;
+ struct sProgressSink
+ {
+ pfnReportProgress pfn;
+ void* pv;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/sLanguageList.h b/Whisper/API/sLanguageList.h
new file mode 100644
index 0000000..49ca596
--- /dev/null
+++ b/Whisper/API/sLanguageList.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <stdint.h>
+
+namespace Whisper
+{
+ struct sLanguageEntry
+ {
+ uint32_t key;
+ int id;
+ const char* name;
+ };
+
+ struct sLanguageList
+ {
+ uint32_t length;
+ const sLanguageEntry* pointer;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/sLoadModelCallbacks.h b/Whisper/API/sLoadModelCallbacks.h
new file mode 100644
index 0000000..f5248c6
--- /dev/null
+++ b/Whisper/API/sLoadModelCallbacks.h
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace Whisper
+{
+ using pfnLoadProgress = HRESULT( __stdcall* )( double val, void* pv ) noexcept;
+ using pfnCancel = HRESULT( __stdcall* )( void* pv ) noexcept;
+
+ struct sLoadModelCallbacks
+ {
+ pfnLoadProgress progress;
+ pfnCancel cancel;
+ void* pv;
+ };
+} \ No newline at end of file
diff --git a/Whisper/API/whisperComLight.h b/Whisper/API/whisperComLight.h
new file mode 100644
index 0000000..c7f0b93
--- /dev/null
+++ b/Whisper/API/whisperComLight.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "iMediaFoundation.cl.h"
+#include "iContext.cl.h"
+#include "iTranscribeResult.cl.h" \ No newline at end of file
diff --git a/Whisper/API/whisperWindows.h b/Whisper/API/whisperWindows.h
new file mode 100644
index 0000000..925e307
--- /dev/null
+++ b/Whisper/API/whisperWindows.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "iMediaFoundation.h"
+#include "iContext.h"
+#include "iTranscribeResult.h" \ No newline at end of file