diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/API | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'Whisper/API')
| -rw-r--r-- | Whisper/API/MfStructs.h | 51 | ||||
| -rw-r--r-- | Whisper/API/Readme.txt | 15 | ||||
| -rw-r--r-- | Whisper/API/SpecialTokens.h | 25 | ||||
| -rw-r--r-- | Whisper/API/TranscribeStructs.h | 127 | ||||
| -rw-r--r-- | Whisper/API/iContext.cl.h | 66 | ||||
| -rw-r--r-- | Whisper/API/iContext.h | 61 | ||||
| -rw-r--r-- | Whisper/API/iMediaFoundation.cl.h | 48 | ||||
| -rw-r--r-- | Whisper/API/iMediaFoundation.h | 39 | ||||
| -rw-r--r-- | Whisper/API/iTranscribeResult.cl.h | 15 | ||||
| -rw-r--r-- | Whisper/API/iTranscribeResult.h | 12 | ||||
| -rw-r--r-- | Whisper/API/loggerApi.h | 35 | ||||
| -rw-r--r-- | Whisper/API/sFullParams.h | 136 | ||||
| -rw-r--r-- | Whisper/API/sLanguageList.h | 18 | ||||
| -rw-r--r-- | Whisper/API/sLoadModelCallbacks.h | 14 | ||||
| -rw-r--r-- | Whisper/API/whisperComLight.h | 4 | ||||
| -rw-r--r-- | Whisper/API/whisperWindows.h | 4 |
16 files changed, 670 insertions, 0 deletions
diff --git a/Whisper/API/MfStructs.h b/Whisper/API/MfStructs.h new file mode 100644 index 0000000..cd27659 --- /dev/null +++ b/Whisper/API/MfStructs.h @@ -0,0 +1,51 @@ +#pragma once + +namespace Whisper +{ + struct sCaptureDevice + { + // The display name is suitable for showing to the user, but might not be unique. + const wchar_t* displayName; + + // Endpoint ID for an audio capture device + // It uniquely identifies the device on the system, but is not a readable string. + const wchar_t* endpoint; + }; + + using pfnFoundCaptureDevices = HRESULT( __stdcall* )( int len, const sCaptureDevice* buffer, void* pv ); + + // Flags for the audio capture + enum struct eCaptureFlags : uint32_t + { + // When the capture device supports stereo, keep stereo PCM samples in addition to mono + Stereo = 1, + }; + + // Parameters for audio capture + struct sCaptureParams + { + float minDuration = 2.0f; + float maxDuration = 3.0f; + float dropStartSilence = 0.25f; + float pauseDuration = 0.333f; + // Flags for the audio capture + uint32_t flags = 0; + }; + + enum struct eCaptureStatus : uint8_t + { + Listening = 1, + Voice = 2, + Transcribing = 4, + Stalled = 0x80, + }; + + using pfnShouldCancel = HRESULT( __stdcall* )( void* pv ) noexcept; + using pfnCaptureStatus = HRESULT( __stdcall* )( void* pv, eCaptureStatus status ) noexcept; + struct sCaptureCallbacks + { + pfnShouldCancel shouldCancel; + pfnCaptureStatus captureStatus; + void* pv; + }; +}
\ No newline at end of file diff --git a/Whisper/API/Readme.txt b/Whisper/API/Readme.txt new file mode 100644 index 0000000..7d40494 --- /dev/null +++ b/Whisper/API/Readme.txt @@ -0,0 +1,15 @@ +The headers in this folder define the complete public API of Whisper.dll. + +To consume the library in your C++ software, include exactly one of the following headers. + +1. If you’re building a Windows app, include the whisperWindows.h header, and you'll get the traditional Win32 COM projection of the API. + +2. If you’re porting to another OS, or porting to a different C++ compiler, or already using the ComLight support library, include the whisperComLight.h header. +If you do that, in addition to this "Whisper/API" folder you will also need the "ComLightLib" dependency. +This will get you the ComLight flavor of these COM interfaces. + +Internally, the actual implementation uses the ComLight flavor of the interfaces, but that’s fine because the two flavors are binary compatible. + +The reason for having two flavors is that Visual Studio’s CComPtr<T> and other related utilities expect interface IDs to be specified with the __declspec(uuid) directive. + +That language extension is specific to Visual C++, and is not supported by the GCC or Clang compilers.
// diff --git a/Whisper/API/SpecialTokens.h b/Whisper/API/SpecialTokens.h (new file mode 100644, index 0000000..67fd020)
#pragma once

namespace Whisper
{
	// IDs of the special tokens in the model's vocabulary.
	// The trailing `token_*` names are the ones the original comments attach to each field.
	struct SpecialTokens
	{
		// token_eot: the end of a transcription
		int TranscriptionEnd;

		// token_sot: start of a transcription
		int TranscriptionStart;

		// token_prev: represents the previous word in the transcription.
		// It is used to help the model predict the current word based on the context of the words that came before it.
		int PreviousWord;

		// token_solm: start of a sentence
		// NOTE(review): upstream Whisper calls this token "start of LM" — confirm the description.
		int SentenceStart;

		// token_not: represents the word "not" in the transcription
		// NOTE(review): upstream Whisper describes this token as "no timestamps" — confirm the description.
		int Not;

		// token_beg: new transcription
		// NOTE(review): upstream Whisper uses this as the first timestamp token — confirm the description.
		int TranscriptionBegin;

		// token_translate
		int TaskTranslate;

		// token_transcribe
		int TaskTranscribe;
	};
}
// diff --git a/Whisper/API/TranscribeStructs.h b/Whisper/API/TranscribeStructs.h (new file mode 100644, index 0000000..ac28357)
#pragma once
#include <stdint.h>
#include <assert.h>

namespace Whisper
{
	// Selects the implementation of the model
	enum struct eModelImplementation : uint32_t
	{
		GPU = 1,
		Hybrid = 2,
		Reference = 3,
	};

	// A time span decomposed into days / hours / minutes / seconds, plus the remaining 100-nanosecond ticks
	struct sTimeSpanFields
	{
		uint32_t days;
		uint8_t hours, minutes, seconds;
		// Sub-second remainder, in 100-nanosecond ticks: [ 0 .. 9'999'999 ]
		uint32_t ticks;

		// Split a count of 100-nanosecond ticks into the fields.
		// The constructor is deliberately not explicit, existing callers may rely on the implicit conversion.
		sTimeSpanFields( uint64_t tt )
		{
			constexpr uint64_t ticksPerSecond = 10'000'000;
			ticks = static_cast<uint32_t>( tt % ticksPerSecond );
			tt /= ticksPerSecond;
			seconds = static_cast<uint8_t>( tt % 60 );
			tt /= 60;
			minutes = static_cast<uint8_t>( tt % 60 );
			tt /= 60;
			hours = static_cast<uint8_t>( tt % 24 );
			tt /= 24;
			// Whatever is left are whole days
			days = static_cast<uint32_t>( tt );
		}
	};

	// A time span expressed as a count of 100-nanosecond ticks
	struct sTimeSpan
	{
		uint64_t ticks;

		// Decompose the value into days, hours, minutes, seconds and ticks
		operator sTimeSpanFields() const
		{
			return sTimeSpanFields{ ticks };
		}

		// Assign a count of ticks. Returns *this, like the built-in assignment, so assignments can be chained.
		sTimeSpan& operator=( uint64_t tt )
		{
			ticks = tt;
			return *this;
		}

		// Assign a signed count of ticks; negative values are a programming error, caught by the assert in debug builds.
		sTimeSpan& operator=( int64_t tt )
		{
			assert( tt >= 0 );
			ticks = static_cast<uint64_t>( tt );
			return *this;
		}
	};

	// Start and end times of the segment or token, expressed in 100-nanosecond ticks
	struct sTimeInterval
	{
		sTimeSpan begin, end;
	};

	// Segment data
	struct sSegment
	{
		// Segment text, null-terminated, and probably UTF-8 encoded
		const char* text;
		// Start and end times of the segment
		sTimeInterval time;
		// NOTE(review): presumably the index of the first token of this segment, and the count of its tokens — confirm
		uint32_t firstToken, countTokens;
	};

	// Flags of a token.
	// This is an unscoped enum: `None` and `Special` are visible directly in the Whisper namespace.
	enum eTokenFlags : uint32_t
	{
		None = 0,
		Special = 1,
	};

	// True when the two values have at least one bit in common
	inline bool operator &( eTokenFlags a, eTokenFlags b )
	{
		return 0 != ( static_cast<uint32_t>( a ) & static_cast<uint32_t>( b ) );
	}

	// Token data
	struct sToken
	{
		// Token text, null-terminated, and probably UTF-8 encoded
		const char* text;
		// Start and end times of the token
		sTimeInterval time;
		// Probability of the token
		float probability;
		// Probability of the timestamp token
		float probabilityTimestamp;
		// Sum of probabilities of all timestamp tokens
		float ptsum;
		// Voice length of the token
		float vlen;
		// Token id
		int id;
		eTokenFlags flags;
	};

	// Count of segments and tokens in a transcription result
	struct sTranscribeLength
	{
		uint32_t countSegments, countTokens;
	};

	// Flags for the iContext.getResults method
	enum struct eResultFlags : uint32_t
	{
		None = 0,
		// Return individual tokens in addition to the segments
		Tokens = 1,
		// Return timestamps
		Timestamps = 2,

		// Create a new COM object for the results.
		// Without this flag, the context returns a pointer to the COM object stored in the context.
		// The content of that object is replaced every time you call iContext.getResults method
		NewObject = 0x100,
	};

	// Combine two sets of flags
	inline eResultFlags operator |( eResultFlags a, eResultFlags b )
	{
		return static_cast<eResultFlags>( static_cast<uint32_t>( a ) | static_cast<uint32_t>( b ) );
	}

	// True when the two values have at least one bit in common
	inline bool operator &( eResultFlags a, eResultFlags b )
	{
		return 0 != ( static_cast<uint32_t>( a ) & static_cast<uint32_t>( b ) );
	}
}
\ No newline at end of file diff --git a/Whisper/API/iContext.cl.h b/Whisper/API/iContext.cl.h new file mode 100644 index 0000000..97d34c7 --- /dev/null +++ b/Whisper/API/iContext.cl.h @@ -0,0 +1,66 @@ +#pragma once +#include "../../ComLightLib/comLightCommon.h" +#include "iTranscribeResult.cl.h" +#include "SpecialTokens.h" +#include "loggerApi.h" +#include "sLanguageList.h" +#include "sLoadModelCallbacks.h" + +namespace Whisper +{ + struct iModel; + struct iAudioBuffer; + struct iAudioReader; + struct iAudioCapture; + struct sCaptureCallbacks; + struct sFullParams; + enum struct eModelImplementation : uint32_t; + enum struct eSamplingStrategy : int; + using whisper_token = int; + struct sProgressSink; + + struct DECLSPEC_NOVTABLE iContext : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{b9956374-3b18-4943-90f2-2ab18a404537}" ); + + // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text + // Uses the specified decoding strategy to obtain the text. + virtual HRESULT COMLIGHTCALL runFull( const sFullParams& params, const iAudioBuffer* buffer ) = 0; + virtual HRESULT COMLIGHTCALL runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader ) = 0; + virtual HRESULT COMLIGHTCALL runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader ) = 0; + + virtual HRESULT COMLIGHTCALL getResults( eResultFlags flags, iTranscribeResult** pp ) const = 0; + + virtual HRESULT COMLIGHTCALL getModel( iModel** pp ) = 0; + + virtual HRESULT COMLIGHTCALL fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi ) = 0; + + // Performance information + virtual HRESULT COMLIGHTCALL timingsPrint() = 0; + virtual HRESULT COMLIGHTCALL timingsReset() = 0; + }; + + struct DECLSPEC_NOVTABLE iModel : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{abefb4c9-e8d8-46a3-8747-5afbadef1adb}" ); + + virtual HRESULT COMLIGHTCALL createContext( iContext** pp ) = 0; + + virtual 
HRESULT COMLIGHTCALL isMultilingual() = 0; + + virtual HRESULT COMLIGHTCALL getSpecialTokens( SpecialTokens& rdi ) = 0; + + // Token Id -> String + virtual const char* COMLIGHTCALL stringFromToken( whisper_token token ) = 0; + }; + + HRESULT COMLIGHTCALL setupLogger( const sLoggerSetup& setup ); + HRESULT COMLIGHTCALL loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp ); + + uint32_t COMLIGHTCALL findLanguageKeyW( const wchar_t* lang ); + uint32_t COMLIGHTCALL findLanguageKeyA( const char* lang ); + + HRESULT COMLIGHTCALL getSupportedLanguages( sLanguageList& rdi ); +} + +#include "sFullParams.h"
\ No newline at end of file diff --git a/Whisper/API/iContext.h b/Whisper/API/iContext.h new file mode 100644 index 0000000..9661093 --- /dev/null +++ b/Whisper/API/iContext.h @@ -0,0 +1,61 @@ +#pragma once +#include "iTranscribeResult.h" +#include "SpecialTokens.h" +#include "loggerApi.h" +#include "sLanguageList.h" +#include "sLoadModelCallbacks.h" + +namespace Whisper +{ + __interface iModel; + __interface iAudioBuffer; + __interface iAudioReader; + __interface iAudioCapture; + struct sCaptureCallbacks; + struct sFullParams; + enum struct eModelImplementation : uint32_t; + enum struct eSamplingStrategy : int; + using whisper_token = int; + struct sProgressSink; + + __interface __declspec( novtable, uuid( "b9956374-3b18-4943-90f2-2ab18a404537" ) ) iContext : public IUnknown + { + // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text + // Uses the specified decoding strategy to obtain the text. + HRESULT __stdcall runFull( const sFullParams& params, const iAudioBuffer* buffer ); + HRESULT __stdcall runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader ); + HRESULT __stdcall runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader ); + + HRESULT __stdcall getResults( eResultFlags flags, iTranscribeResult** pp ) const; + + HRESULT __stdcall getModel( iModel** pp ); + + HRESULT __stdcall fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi ); + + // Performance information + HRESULT __stdcall timingsPrint(); + HRESULT __stdcall timingsReset(); + }; + + __interface __declspec( novtable, uuid( "abefb4c9-e8d8-46a3-8747-5afbadef1adb" ) ) iModel : public IUnknown + { + HRESULT __stdcall createContext( iContext** pp ); + + HRESULT __stdcall isMultilingual(); + + HRESULT __stdcall getSpecialTokens( SpecialTokens& rdi ); + + // Token Id -> String + const char* __stdcall stringFromToken( whisper_token token ); + }; + + HRESULT __stdcall setupLogger( 
const sLoggerSetup& setup ); + HRESULT __stdcall loadModel( const wchar_t* path, eModelImplementation impl, const sLoadModelCallbacks* callbacks, iModel** pp ); + + uint32_t __stdcall findLanguageKeyW( const wchar_t* lang ); + uint32_t __stdcall findLanguageKeyA( const char* lang ); + + HRESULT __stdcall getSupportedLanguages( sLanguageList& rdi ); +} + +#include "sFullParams.h"
\ No newline at end of file diff --git a/Whisper/API/iMediaFoundation.cl.h b/Whisper/API/iMediaFoundation.cl.h new file mode 100644 index 0000000..516b67f --- /dev/null +++ b/Whisper/API/iMediaFoundation.cl.h @@ -0,0 +1,48 @@ +#pragma once +#include "../../ComLightLib/comLightCommon.h" +#include "MfStructs.h" + +struct IMFSourceReader; + +namespace Whisper +{ + struct DECLSPEC_NOVTABLE iAudioBuffer : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{013583aa-c9eb-42bc-83db-633c2c317051}" ); + + virtual uint32_t COMLIGHTCALL countSamples() const = 0; + virtual const float* COMLIGHTCALL getPcmMono() const = 0; + virtual const float* COMLIGHTCALL getPcmStereo() const = 0; + virtual HRESULT COMLIGHTCALL getTime( int64_t& rdi ) const = 0; + }; + + struct DECLSPEC_NOVTABLE iAudioReader : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{35b988da-04a6-476a-a193-d8891d5dc390}" ); + + virtual HRESULT COMLIGHTCALL getDuration( int64_t& rdi ) const = 0; + virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0; + virtual HRESULT COMLIGHTCALL requestedStereo() const = 0; + }; + + struct DECLSPEC_NOVTABLE iAudioCapture : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{747752c2-d9fd-40df-8847-583c781bf013}" ); + + virtual HRESULT COMLIGHTCALL getReader( IMFSourceReader** pp ) const = 0; + virtual const sCaptureParams& COMLIGHTCALL getParams() const = 0; + }; + + struct DECLSPEC_NOVTABLE iMediaFoundation : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{fb9763a5-d77d-4b6e-aff8-f494813cebd8}" ); + + virtual HRESULT COMLIGHTCALL loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const = 0; + virtual HRESULT COMLIGHTCALL openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ) = 0; + + virtual HRESULT COMLIGHTCALL listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ) = 0; + virtual HRESULT COMLIGHTCALL openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ) = 0; + }; + + HRESULT 
COMLIGHTCALL initMediaFoundation( iMediaFoundation** pp ); +}
\ No newline at end of file diff --git a/Whisper/API/iMediaFoundation.h b/Whisper/API/iMediaFoundation.h new file mode 100644 index 0000000..93dc287 --- /dev/null +++ b/Whisper/API/iMediaFoundation.h @@ -0,0 +1,39 @@ +#pragma once +#include <stdint.h> +#include "MfStructs.h" +struct IMFSourceReader; + +namespace Whisper +{ + __interface __declspec( novtable, uuid( "013583aa-c9eb-42bc-83db-633c2c317051" ) ) iAudioBuffer : public IUnknown + { + uint32_t __stdcall countSamples() const; + const float* __stdcall getPcmMono() const; + const float* __stdcall getPcmStereo() const; + HRESULT __stdcall getTime( int64_t& rdi ) const; + }; + + __interface __declspec( novtable, uuid( "35b988da-04a6-476a-a193-d8891d5dc390" ) ) iAudioReader : public IUnknown + { + HRESULT __stdcall getDuration( int64_t& rdi ) const; + HRESULT __stdcall getReader( IMFSourceReader** pp ) const; + HRESULT __stdcall requestedStereo() const; + }; + + __interface __declspec( novtable, uuid( "747752c2-d9fd-40df-8847-583c781bf013" ) ) iAudioCapture : public IUnknown + { + HRESULT __stdcall getReader( IMFSourceReader** pp ) const; + const sCaptureParams& __stdcall getParams() const; + }; + + __interface __declspec( novtable, uuid( "fb9763a5-d77d-4b6e-aff8-f494813cebd8" ) ) iMediaFoundation : public IUnknown + { + HRESULT __stdcall loadAudioFile( LPCTSTR path, bool stereo, iAudioBuffer** pp ) const; + HRESULT __stdcall openAudioFile( LPCTSTR path, bool stereo, iAudioReader** pp ); + + HRESULT __stdcall listCaptureDevices( pfnFoundCaptureDevices pfn, void* pv ); + HRESULT __stdcall openCaptureDevice( LPCTSTR endpoint, const sCaptureParams& captureParams, iAudioCapture** pp ); + }; + + HRESULT __stdcall initMediaFoundation( iMediaFoundation** pp ); +}
\ No newline at end of file diff --git a/Whisper/API/iTranscribeResult.cl.h b/Whisper/API/iTranscribeResult.cl.h new file mode 100644 index 0000000..ab65178 --- /dev/null +++ b/Whisper/API/iTranscribeResult.cl.h @@ -0,0 +1,15 @@ +#pragma once +#include "TranscribeStructs.h" +#include "../../ComLightLib/comLightCommon.h" + +namespace Whisper +{ + struct iTranscribeResult : public ComLight::IUnknown + { + DEFINE_INTERFACE_ID( "{2871a73f-5ce3-48f8-8779-6582ee11935e}" ); + + virtual HRESULT COMLIGHTCALL getSize( sTranscribeLength& rdi ) const = 0; + virtual const sSegment* COMLIGHTCALL getSegments() const = 0; + virtual const sToken* COMLIGHTCALL getTokens() const = 0; + }; +}
\ No newline at end of file diff --git a/Whisper/API/iTranscribeResult.h b/Whisper/API/iTranscribeResult.h new file mode 100644 index 0000000..27e0c0d --- /dev/null +++ b/Whisper/API/iTranscribeResult.h @@ -0,0 +1,12 @@ +#pragma once +#include "TranscribeStructs.h" + +namespace Whisper +{ + __interface __declspec( novtable, uuid( "2871a73f-5ce3-48f8-8779-6582ee11935e" ) ) iTranscribeResult : public IUnknown + { + HRESULT __stdcall getSize( sTranscribeLength& rdi ) const; + const sSegment* __stdcall getSegments() const; + const sToken* __stdcall getTokens() const; + }; +}
// diff --git a/Whisper/API/loggerApi.h b/Whisper/API/loggerApi.h (new file mode 100644, index 0000000..6af1c4e)
#pragma once
#include <stdint.h>

namespace Whisper
{
	// Log level for messages
	enum struct eLogLevel : uint8_t
	{
		Error = 0,
		Warning = 1,
		Info = 2,
		Debug = 3
	};

	// Flags about the logger
	// NOTE(review): the values are distinct bits, so they are probably meant to be combined — confirm.
	enum struct eLoggerFlags : uint8_t
	{
		UseStandardError = 1,
		SkipFormatMessage = 2,
	};

	// C function pointer to receive log messages from the library. The messages are encoded in UTF-8.
	using pfnLoggerSink = void( __stdcall* )( void* context, eLogLevel lvl, const char* message );

	// A sink to receive log messages produced by Whisper.dll
	struct sLoggerSetup
	{
		// C function pointer to receive log messages from the library
		pfnLoggerSink sink = nullptr;
		// Optional context parameter for the sink function; when consuming from C# you don't need that, pass IntPtr.Zero, delegates can capture things.
		void* context = nullptr;
		// Maximum log level to produce.
		// The default is the numeric zero, same as what value-initialization of the structure was producing;
		// without the initializer the field was indeterminate in default-initialized structures.
		eLogLevel level = eLogLevel::Error;
		// Flags about the logger
		eLoggerFlags flags = (eLoggerFlags)0;
	};
}
\ No newline at end of file diff --git a/Whisper/API/sFullParams.h b/Whisper/API/sFullParams.h new file mode 100644 index 0000000..0a1d352 --- /dev/null +++ b/Whisper/API/sFullParams.h @@ -0,0 +1,136 @@ +#pragma once +#include <stdint.h> +#include <assert.h> + +namespace Whisper +{ + // Available sampling strategies + enum struct eSamplingStrategy : int + { + // Always select the most probable token + Greedy, + // TODO: not implemented yet! + BeamSearch, + }; + + using pfnNewSegment = HRESULT( __cdecl* )( iContext* ctx, uint32_t n_new, void* user_data ) noexcept; + using pfnEncoderBegin = HRESULT( __cdecl* )( iContext* ctx, void* user_data ) noexcept; + + enum struct eFullParamsFlags : uint32_t + { + Translate = 1, + NoContext = 2, + SingleSegment = 4, + PrintSpecial = 8, + PrintProgress = 0x10, + PrintRealtime = 0x20, + PrintTimestamps = 0x40, + + // Experimental + TokenTimestamps = 0x100, + SpeedupAudio = 0x200, + }; + + inline eFullParamsFlags operator | ( eFullParamsFlags a, eFullParamsFlags b ) + { + return (eFullParamsFlags)( (uint32_t)a | (uint32_t)b ); + } + inline void operator |= ( eFullParamsFlags& a, eFullParamsFlags b ) + { + a = a | b; + } + + struct sFullParams + { + eSamplingStrategy strategy; + // Count of CPU threads + int cpuThreads; + int n_max_text_ctx; + int offset_ms; // start offset in ms + int duration_ms; // audio duration to process in ms + eFullParamsFlags flags; + uint32_t language; + + // [EXPERIMENTAL] token-level timestamps + float thold_pt; // timestamp token probability threshold (~0.01) + float thold_ptsum; // timestamp token sum probability threshold (~0.01) + int max_len; // max segment length in characters + int max_tokens; // max tokens per segment (0 = no limit) + + struct + { + int n_past; + } greedy; + + struct + { + int n_past; + int beam_width; + int n_best; + } beam_search; + + // [EXPERIMENTAL] speed-up techniques + int audio_ctx; // overwrite the audio context size (0 = use default) + + // tokens to provide the whisper 
model as initial prompt + // these are prepended to any existing text context from a previous call + const whisper_token* prompt_tokens; + int prompt_n_tokens; + + pfnNewSegment new_segment_callback; + void* new_segment_callback_user_data; + + pfnEncoderBegin encoder_begin_callback; + void* encoder_begin_callback_user_data; + + // Couple utility methods, they workaround the lack of bit fields in C++ + inline bool flag( eFullParamsFlags f ) const + { + return 0 != ( (uint32_t)flags & (uint32_t)f ); + } + inline void resetFlag( eFullParamsFlags bit ) + { + uint32_t f = (uint32_t)flags; + f &= ~(uint32_t)bit; + flags = (eFullParamsFlags)f; + } + inline void setFlag( eFullParamsFlags bit, bool set = true ) + { + uint32_t f = (uint32_t)flags; + if( set ) + f |= (uint32_t)bit; + else + f &= ~(uint32_t)bit; + flags = (eFullParamsFlags)f; + } + }; + + struct sSegmentTime + { + int64_t begin, end; + }; + + inline uint32_t makeLanguageKey( const char* code ) + { + assert( strlen( code ) <= 4 ); + uint32_t res = 0; + uint32_t shift = 0; + for( size_t i = 0; i < 4; i++, code++, shift += 8 ) + { + const char c = *code; + if( c == '\0' ) + return res; + uint32_t u32 = (uint8_t)c; + u32 = u32 << shift; + res |= u32; + } + return res; + } + + using pfnReportProgress = HRESULT( __stdcall* )( double val, iContext* ctx, void* pv ) noexcept; + struct sProgressSink + { + pfnReportProgress pfn; + void* pv; + }; +}
// diff --git a/Whisper/API/sLanguageList.h b/Whisper/API/sLanguageList.h (new file mode 100644, index 0000000..49ca596)
#pragma once
#include <stdint.h>

namespace Whisper
{
	// One entry of the list of languages
	struct sLanguageEntry
	{
		// NOTE(review): presumably the language code packed by makeLanguageKey — confirm
		uint32_t key;
		// NOTE(review): presumably the ID of the language for the model — confirm
		int id;
		// Name of the language
		const char* name;
	};

	// A non-owning view of an array of language entries, filled by getSupportedLanguages
	struct sLanguageList
	{
		// Count of entries in the array
		uint32_t length;
		// Pointer to the first entry
		const sLanguageEntry* pointer;
	};
}
\ No newline at end of file diff --git a/Whisper/API/sLoadModelCallbacks.h b/Whisper/API/sLoadModelCallbacks.h new file mode 100644 index 0000000..f5248c6 --- /dev/null +++ b/Whisper/API/sLoadModelCallbacks.h @@ -0,0 +1,14 @@ +#pragma once + +namespace Whisper +{ + using pfnLoadProgress = HRESULT( __stdcall* )( double val, void* pv ) noexcept; + using pfnCancel = HRESULT( __stdcall* )( void* pv ) noexcept; + + struct sLoadModelCallbacks + { + pfnLoadProgress progress; + pfnCancel cancel; + void* pv; + }; +}
\ No newline at end of file diff --git a/Whisper/API/whisperComLight.h b/Whisper/API/whisperComLight.h new file mode 100644 index 0000000..c7f0b93 --- /dev/null +++ b/Whisper/API/whisperComLight.h @@ -0,0 +1,4 @@ +#pragma once +#include "iMediaFoundation.cl.h" +#include "iContext.cl.h" +#include "iTranscribeResult.cl.h"
\ No newline at end of file diff --git a/Whisper/API/whisperWindows.h b/Whisper/API/whisperWindows.h new file mode 100644 index 0000000..925e307 --- /dev/null +++ b/Whisper/API/whisperWindows.h @@ -0,0 +1,4 @@ +#pragma once +#include "iMediaFoundation.h" +#include "iContext.h" +#include "iTranscribeResult.h"
\ No newline at end of file |
