Whisper/API/iContext.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

#pragma once
#include "iTranscribeResult.h"
#include "SpecialTokens.h"
#include "loggerApi.h"
#include "sLanguageList.h"
#include "sLoadModelCallbacks.h"
#include "eGpuModelFlags.h"

namespace Whisper
{
	// Forward declarations: the interfaces and functions below only use these types
	// through pointers, references, or as opaque enum parameters, so the full
	// definitions are not required at this point.
	__interface iModel;
	__interface iAudioBuffer;
	__interface iAudioReader;
	__interface iAudioCapture;
	struct sCaptureCallbacks;
	// Only forward-declared here; "sFullParams.h" is included at the very bottom of this header.
	struct sFullParams;
	// Opaque enum declarations with fixed underlying types; the enumerators are defined elsewhere.
	enum struct eModelImplementation : uint32_t;
	enum struct eSamplingStrategy : int;
	// Token identifiers are plain ints
	using whisper_token = int;
	struct sProgressSink;

	// COM-style interface derived from IUnknown, identified by the GUID in the uuid attribute.
	// Declares the entry points which run the model on audio input, plus access to the results,
	// the model, default parameters, and timing information.
	// Every method reports its status as an HRESULT.
	// NOTE(review): the declaration order presumably defines the binary method layout seen by callers;
	// confirm before reordering or inserting methods.
	__interface __declspec( novtable, uuid( "b9956374-3b18-4943-90f2-2ab18a404537" ) ) iContext : public IUnknown
	{
		// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
		// Uses the specified decoding strategy to obtain the text.
		// params: parameters of the run; buffer: the audio buffer to process
		HRESULT __stdcall runFull( const sFullParams& params, const iAudioBuffer* buffer );
		// Variant which takes its input from an iAudioReader, and accepts an sProgressSink structure.
		// NOTE(review): presumably the streaming counterpart of runFull, reporting progress through `progress` — confirm against the implementation.
		HRESULT __stdcall runStreamed( const sFullParams& params, const sProgressSink& progress, const iAudioReader* reader );
		// Variant which takes its input from an iAudioCapture, and accepts an sCaptureCallbacks structure.
		// Note the third parameter is named `reader` although its type is iAudioCapture.
		HRESULT __stdcall runCapture( const sFullParams& params, const sCaptureCallbacks& callbacks, const iAudioCapture* reader );

		// Retrieve the results as an iTranscribeResult interface, written to *pp.
		// flags: eResultFlags value; see that type for the meaning of individual flags.
		// NOTE(review): presumably the caller owns the returned reference and must release it, per the usual COM convention — confirm.
		HRESULT __stdcall getResults( eResultFlags flags, iTranscribeResult** pp ) const;
		// Try to detect speaker by comparing channels of the stereo PCM data
		// time: the time interval to examine; result: receives the detected channel
		HRESULT __stdcall detectSpeaker( const sTimeInterval& time, eSpeakerChannel& result ) const;

		// Retrieve an iModel interface, written to *pp.
		// NOTE(review): presumably the model this context was created from — confirm.
		HRESULT __stdcall getModel( iModel** pp );

		// Write parameters into the structure pointed to by `rdi`, for the given sampling strategy.
		// NOTE(review): judging by the name, these are the default values for that strategy — confirm.
		HRESULT __stdcall fullDefaultParams( eSamplingStrategy strategy, sFullParams* rdi );

		// Performance information
		// NOTE(review): where timingsPrint sends its output, and what exactly timingsReset clears, is not visible in this header.
		HRESULT __stdcall timingsPrint();
		HRESULT __stdcall timingsReset();
	};

	// COM-style interface derived from IUnknown, identified by the GUID in the uuid attribute.
	// Declares creation of iContext objects, and queries about the model's languages and tokens.
	// NOTE(review): the declaration order presumably defines the binary method layout seen by callers;
	// confirm before reordering or inserting methods.
	__interface __declspec( novtable, uuid( "abefb4c9-e8d8-46a3-8747-5afbadef1adb" ) ) iModel : public IUnknown
	{
		// Create an iContext interface, written to *pp.
		// NOTE(review): presumably the caller owns the returned reference and must release it, per the usual COM convention — confirm.
		HRESULT __stdcall createContext( iContext** pp );

		// Query whether the model is multilingual.
		// NOTE(review): the answer is carried in the HRESULT rather than a bool; presumably S_OK / S_FALSE encode yes / no — confirm against the implementation.
		HRESULT __stdcall isMultilingual();

		// Write the model's special tokens into the caller-supplied `rdi` structure.
		HRESULT __stdcall getSpecialTokens( SpecialTokens& rdi );

		// Token Id -> String
		// Unlike the other methods, this one returns the string directly instead of an HRESULT.
		// NOTE(review): lifetime of the returned pointer, and the return value for an unknown token, are not visible here — confirm before storing the pointer.
		const char* __stdcall stringFromToken( whisper_token token );
	};

	// Namespace-level entry points of the API; only the declarations are visible in this header.

	// Configure logging from the supplied sLoggerSetup structure.
	HRESULT __stdcall setupLogger( const sLoggerSetup& setup );
	// Load a model from the file at `path`, and write the resulting iModel interface to *pp.
	// impl: which model implementation to use; flags: a 32-bit flags value; callbacks: pointer to an sLoadModelCallbacks structure.
	// NOTE(review): `flags` is presumably a combination of eGpuModelFlags values, and `callbacks` presumably may be nullptr — confirm both.
	HRESULT __stdcall loadModel( const wchar_t* path, eModelImplementation impl, uint32_t flags, const sLoadModelCallbacks* callbacks, iModel** pp );

	// Look up the 32-bit key of a language from its string form; W takes a wide string, A takes a narrow one.
	// NOTE(review): the accepted string format and the value returned for an unknown language are not visible here — confirm.
	uint32_t __stdcall findLanguageKeyW( const wchar_t* lang );
	uint32_t __stdcall findLanguageKeyA( const char* lang );

	// Write the list of supported languages into the caller-supplied `rdi` structure.
	HRESULT __stdcall getSupportedLanguages( sLanguageList& rdi );
}

#include "sFullParams.h"