summaryrefslogtreecommitdiffstats
path: root/WhisperNet/API
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-16 14:52:43 +0100
committerKonstantin <const@const.me>2023-01-16 14:52:43 +0100
commit8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree714dc6fc9a1672d5fd7f89676b97e10959662abc /WhisperNet/API
parent990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'WhisperNet/API')
-rw-r--r--WhisperNet/API/CaptureDeviceId.cs24
-rw-r--r--WhisperNet/API/Parameters.cs95
-rw-r--r--WhisperNet/API/SpecialTokens.cs23
-rw-r--r--WhisperNet/API/eCaptureStatus.cs19
-rw-r--r--WhisperNet/API/eLanguage.cs206
-rw-r--r--WhisperNet/API/eLogLevel.cs34
-rw-r--r--WhisperNet/API/eModelImplementation.cs25
-rw-r--r--WhisperNet/API/eResultFlags.cs21
-rw-r--r--WhisperNet/API/iAudioBuffer.cs27
-rw-r--r--WhisperNet/API/iAudioReader.cs23
-rw-r--r--WhisperNet/API/iMediaFoundation.cs36
-rw-r--r--WhisperNet/API/iModel.cs27
-rw-r--r--WhisperNet/API/sCaptureParams.cs37
13 files changed, 597 insertions, 0 deletions
diff --git a/WhisperNet/API/CaptureDeviceId.cs b/WhisperNet/API/CaptureDeviceId.cs
new file mode 100644
index 0000000..9636e53
--- /dev/null
+++ b/WhisperNet/API/CaptureDeviceId.cs
@@ -0,0 +1,24 @@
+using Whisper.Internal;
+
+namespace Whisper
+{
+ /// <summary>Identifiers for an audio capture device</summary>
+ public record struct CaptureDeviceId
+ {
+ /// <summary>The display name is suitable for showing to the user, but might not be unique.</summary>
+ public string displayName;
+
+ /// <summary>Endpoint ID for an audio capture device.<br/>
+ /// It uniquely identifies the device on the system, but is not a readable string.</summary>
+ public string endpoint;
+
+ internal CaptureDeviceId( in sCaptureDevice rsi )
+ {
+ displayName = rsi.displayName ?? "<display name unavailable>";
+ endpoint = rsi.endpoint ?? throw new ApplicationException( "The device has no endpoint ID" );
+ }
+
+ /// <summary>Returns a String which represents the object instance</summary>
+ public override string ToString() => $"Capture device: \"{displayName}\"";
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/Parameters.cs b/WhisperNet/API/Parameters.cs
new file mode 100644
index 0000000..d2b53f9
--- /dev/null
+++ b/WhisperNet/API/Parameters.cs
@@ -0,0 +1,95 @@
+// Missing XML comment for publicly visible type or member
+// TODO: remove this line and document them.
+#pragma warning disable CS1591
+
+namespace Whisper
+{
+ /// <summary>Available sampling strategies</summary>
+ public enum eSamplingStrategy: int
+ {
+ /// <summary>Always select the most probable token</summary>
+ Greedy,
+ /// <summary>TODO: not implemented yet!</summary>
+ BeamSearch,
+ };
+
+ [Flags]
+ public enum eFullParamsFlags: uint
+ {
+ None = 0,
+ Translate = 1,
+ NoContext = 2,
+ SingleSegment = 4,
+ PrintSpecial = 8,
+ PrintProgress = 0x10,
+ PrintRealtime = 0x20,
+ PrintTimestamps = 0x40,
+
+ // Experimental
+ TokenTimestamps = 0x100,
+ SpeedupAudio = 0x200,
+ };
+
+ /// <summary>Transcribe parameters</summary>
+ public struct Parameters
+ {
+ /// <summary>Sampling strategy</summary>
+ public eSamplingStrategy strategy;
+
+ /// <summary>Count of CPU worker threads to use</summary>
+ /// <remarks>So far, the GPU model only uses CPU threads for MEL spectrograms</remarks>
+ public int cpuThreads;
+
+ public int n_max_text_ctx;
+ /// <summary>start offset in ms</summary>
+ public int offset_ms;
+ /// <summary>audio duration to process in ms</summary>
+ public int duration_ms;
+ public eFullParamsFlags flags;
+
+ /// <summary>Set or clear the specified flag in the <see cref="flags" /> field of this structure</summary>
+ public void setFlag( eFullParamsFlags flag, bool set )
+ {
+ if( flag != eFullParamsFlags.None )
+ {
+ if( set )
+ flags |= flag;
+ else
+ flags &= ~flag;
+ return;
+ }
+ throw new ArgumentException();
+ }
+
+ /// <summary>Language</summary>
+ public eLanguage language;
+
+ // [EXPERIMENTAL] token-level timestamps
+ /// <summary>timestamp token probability threshold (~0.01)</summary>
+ public float thold_pt;
+ /// <summary>timestamp token sum probability threshold (~0.01)</summary>
+ public float thold_ptsum;
+ /// <summary>max segment length in characters</summary>
+ public int max_len;
+ /// <summary>max tokens per segment (0 = no limit)</summary>
+ public int max_tokens;
+
+ public struct sGreedy
+ {
+ public int n_past;
+ }
+ public sGreedy greedy;
+
+ public struct sBeamSearch
+ {
+ public int n_past;
+ public int beam_width;
+ public int n_best;
+ }
+ public sBeamSearch beamSearch;
+
+ // [EXPERIMENTAL] speed-up techniques
+ /// <summary>overwrite the audio context size (0 = use default)</summary>
+ public int audioContextSize;
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/SpecialTokens.cs b/WhisperNet/API/SpecialTokens.cs
new file mode 100644
index 0000000..d672369
--- /dev/null
+++ b/WhisperNet/API/SpecialTokens.cs
@@ -0,0 +1,23 @@
+namespace Whisper
+{
+ /// <summary>Special tokens defined in the model</summary>
+ public readonly struct SpecialTokens
+ {
+ /// <summary>The end of a transcription</summary>
+ public readonly int TranscriptionEnd; // token_eot
+ /// <summary>Start of a transcription</summary>
+ public readonly int TranscriptionStart; // token_sot
+ /// <summary>Represents the previous word in the transcription. It is used to help the model predict the current word based on the context of the words that came before it.</summary>
+ public readonly int PreviousWord; // token_prev
+ /// <summary>Start of a sentence</summary>
+ public readonly int SentenceStart; // token_solm
+ /// <summary>The <c>token_not</c> special token. NOTE(review): whisper.cpp annotates this token as "no timestamps", not the English word "not" — confirm</summary>
+ public readonly int Not; // token_not
+ /// <summary>New transcription</summary>
+ public readonly int TranscriptionBegin; // token_beg
+ /// <summary>token_translate</summary>
+ public readonly int TaskTranslate;
+ /// <summary>token_transcribe</summary>
+ public readonly int TaskTranscribe;
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/eCaptureStatus.cs b/WhisperNet/API/eCaptureStatus.cs
new file mode 100644
index 0000000..41f05fb
--- /dev/null
+++ b/WhisperNet/API/eCaptureStatus.cs
@@ -0,0 +1,19 @@
+namespace Whisper
+{
+ /// <summary>Status of the voice capture</summary>
+ [Flags]
+ public enum eCaptureStatus: byte
+ {
+ /// <summary>Doing nothing</summary>
+ None = 0,
+ /// <summary>Capturing the audio</summary>
+ Listening = 1,
+ /// <summary>A voice is detected in the captured audio, recording</summary>
+ Voice = 2,
+ /// <summary>Transcribing a recorded piece of the audio</summary>
+ Transcribing = 4,
+ /// <summary>The computer is unable to transcribe the audio quickly enough,<br/>
+ /// and the capture is dropping the incoming audio samples.</summary>
+ Stalled = 0x80,
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/eLanguage.cs b/WhisperNet/API/eLanguage.cs
new file mode 100644
index 0000000..1241077
--- /dev/null
+++ b/WhisperNet/API/eLanguage.cs
@@ -0,0 +1,206 @@
+// This file is generated by a tool, from the `languageCodez.tsv` file in this repository
+namespace Whisper
+{
+ /// <summary>Supported languages</summary>
+ public enum eLanguage: uint
+ {
+ /// <summary>Afrikaans</summary>
+ Afrikaans = 0x6661,
+ /// <summary>Albanian</summary>
+ Albanian = 0x7173,
+ /// <summary>Amharic</summary>
+ Amharic = 0x6D61,
+ /// <summary>Arabic</summary>
+ Arabic = 0x7261,
+ /// <summary>Armenian</summary>
+ Armenian = 0x7968,
+ /// <summary>Assamese</summary>
+ Assamese = 0x7361,
+ /// <summary>Azerbaijani</summary>
+ Azerbaijani = 0x7A61,
+ /// <summary>Bashkir</summary>
+ Bashkir = 0x6162,
+ /// <summary>Basque</summary>
+ Basque = 0x7565,
+ /// <summary>Belarusian</summary>
+ Belarusian = 0x6562,
+ /// <summary>Bengali</summary>
+ Bengali = 0x6E62,
+ /// <summary>Bosnian</summary>
+ Bosnian = 0x7362,
+ /// <summary>Breton</summary>
+ Breton = 0x7262,
+ /// <summary>Bulgarian</summary>
+ Bulgarian = 0x6762,
+ /// <summary>Catalan</summary>
+ Catalan = 0x6163,
+ /// <summary>Chinese</summary>
+ Chinese = 0x687A,
+ /// <summary>Croatian</summary>
+ Croatian = 0x7268,
+ /// <summary>Czech</summary>
+ Czech = 0x7363,
+ /// <summary>Danish</summary>
+ Danish = 0x6164,
+ /// <summary>Dutch</summary>
+ Dutch = 0x6C6E,
+ /// <summary>English</summary>
+ English = 0x6E65,
+ /// <summary>Estonian</summary>
+ Estonian = 0x7465,
+ /// <summary>Faroese</summary>
+ Faroese = 0x6F66,
+ /// <summary>Finnish</summary>
+ Finnish = 0x6966,
+ /// <summary>French</summary>
+ French = 0x7266,
+ /// <summary>Galician</summary>
+ Galician = 0x6C67,
+ /// <summary>Georgian</summary>
+ Georgian = 0x616B,
+ /// <summary>German</summary>
+ German = 0x6564,
+ /// <summary>Greek</summary>
+ Greek = 0x6C65,
+ /// <summary>Gujarati</summary>
+ Gujarati = 0x7567,
+ /// <summary>Haitian Creole</summary>
+ HaitianCreole = 0x7468,
+ /// <summary>Hausa</summary>
+ Hausa = 0x6168,
+ /// <summary>Hawaiian</summary>
+ Hawaiian = 0x776168,
+ /// <summary>Hebrew</summary>
+ Hebrew = 0x7769,
+ /// <summary>Hindi</summary>
+ Hindi = 0x6968,
+ /// <summary>Hungarian</summary>
+ Hungarian = 0x7568,
+ /// <summary>Icelandic</summary>
+ Icelandic = 0x7369,
+ /// <summary>Indonesian</summary>
+ Indonesian = 0x6469,
+ /// <summary>Italian</summary>
+ Italian = 0x7469,
+ /// <summary>Japanese</summary>
+ Japanese = 0x616A,
+ /// <summary>Javanese</summary>
+ Javanese = 0x776A,
+ /// <summary>Kannada</summary>
+ Kannada = 0x6E6B,
+ /// <summary>Kazakh</summary>
+ Kazakh = 0x6B6B,
+ /// <summary>Khmer</summary>
+ Khmer = 0x6D6B,
+ /// <summary>Korean</summary>
+ Korean = 0x6F6B,
+ /// <summary>Lao</summary>
+ Lao = 0x6F6C,
+ /// <summary>Latin</summary>
+ Latin = 0x616C,
+ /// <summary>Latvian</summary>
+ Latvian = 0x766C,
+ /// <summary>Lingala</summary>
+ Lingala = 0x6E6C,
+ /// <summary>Lithuanian</summary>
+ Lithuanian = 0x746C,
+ /// <summary>Luxembourgish</summary>
+ Luxembourgish = 0x626C,
+ /// <summary>Macedonian</summary>
+ Macedonian = 0x6B6D,
+ /// <summary>Malagasy</summary>
+ Malagasy = 0x676D,
+ /// <summary>Malay</summary>
+ Malay = 0x736D,
+ /// <summary>Malayalam</summary>
+ Malayalam = 0x6C6D,
+ /// <summary>Maltese</summary>
+ Maltese = 0x746D,
+ /// <summary>Maori</summary>
+ Maori = 0x696D,
+ /// <summary>Marathi</summary>
+ Marathi = 0x726D,
+ /// <summary>Mongolian</summary>
+ Mongolian = 0x6E6D,
+ /// <summary>Myanmar</summary>
+ Myanmar = 0x796D,
+ /// <summary>Nepali</summary>
+ Nepali = 0x656E,
+ /// <summary>Norwegian</summary>
+ Norwegian = 0x6F6E,
+ /// <summary>Nynorsk</summary>
+ Nynorsk = 0x6E6E,
+ /// <summary>Occitan</summary>
+ Occitan = 0x636F,
+ /// <summary>Pashto</summary>
+ Pashto = 0x7370,
+ /// <summary>Persian</summary>
+ Persian = 0x6166,
+ /// <summary>Polish</summary>
+ Polish = 0x6C70,
+ /// <summary>Portuguese</summary>
+ Portuguese = 0x7470,
+ /// <summary>Punjabi</summary>
+ Punjabi = 0x6170,
+ /// <summary>Romanian</summary>
+ Romanian = 0x6F72,
+ /// <summary>Russian</summary>
+ Russian = 0x7572,
+ /// <summary>Sanskrit</summary>
+ Sanskrit = 0x6173,
+ /// <summary>Serbian</summary>
+ Serbian = 0x7273,
+ /// <summary>Shona</summary>
+ Shona = 0x6E73,
+ /// <summary>Sindhi</summary>
+ Sindhi = 0x6473,
+ /// <summary>Sinhala</summary>
+ Sinhala = 0x6973,
+ /// <summary>Slovak</summary>
+ Slovak = 0x6B73,
+ /// <summary>Slovenian</summary>
+ Slovenian = 0x6C73,
+ /// <summary>Somali</summary>
+ Somali = 0x6F73,
+ /// <summary>Spanish</summary>
+ Spanish = 0x7365,
+ /// <summary>Sundanese</summary>
+ Sundanese = 0x7573,
+ /// <summary>Swahili</summary>
+ Swahili = 0x7773,
+ /// <summary>Swedish</summary>
+ Swedish = 0x7673,
+ /// <summary>Tagalog</summary>
+ Tagalog = 0x6C74,
+ /// <summary>Tajik</summary>
+ Tajik = 0x6774,
+ /// <summary>Tamil</summary>
+ Tamil = 0x6174,
+ /// <summary>Tatar</summary>
+ Tatar = 0x7474,
+ /// <summary>Telugu</summary>
+ Telugu = 0x6574,
+ /// <summary>Thai</summary>
+ Thai = 0x6874,
+ /// <summary>Tibetan</summary>
+ Tibetan = 0x6F62,
+ /// <summary>Turkish</summary>
+ Turkish = 0x7274,
+ /// <summary>Turkmen</summary>
+ Turkmen = 0x6B74,
+ /// <summary>Ukrainian</summary>
+ Ukrainian = 0x6B75,
+ /// <summary>Urdu</summary>
+ Urdu = 0x7275,
+ /// <summary>Uzbek</summary>
+ Uzbek = 0x7A75,
+ /// <summary>Vietnamese</summary>
+ Vietnamese = 0x6976,
+ /// <summary>Welsh</summary>
+ Welsh = 0x7963,
+ /// <summary>Yiddish</summary>
+ Yiddish = 0x6979,
+ /// <summary>Yoruba</summary>
+ Yoruba = 0x6F79,
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/eLogLevel.cs b/WhisperNet/API/eLogLevel.cs
new file mode 100644
index 0000000..ae494d4
--- /dev/null
+++ b/WhisperNet/API/eLogLevel.cs
@@ -0,0 +1,34 @@
+namespace Whisper
+{
+ /// <summary>Message log level</summary>
+ public enum eLogLevel: byte
+ {
+ /// <summary>Error message</summary>
+ Error = 0,
+ /// <summary>Warning message</summary>
+ Warning = 1,
+ /// <summary>Informational message</summary>
+ Info = 2,
+ /// <summary>Debug message</summary>
+ Debug = 3
+ }
+
+ /// <summary>A delegate to receive log messages from the library</summary>
+ public delegate void pfnLogMessage( eLogLevel level, string message );
+
+ /// <summary>Log destination flags</summary>
+ [Flags]
+ public enum eLoggerFlags: byte
+ {
+ /// <summary>No special flags</summary>
+ None = 0,
+
+ /// <summary>In addition to calling the delegate, print messages to standard error</summary>
+ UseStandardError = 1,
+
+ /// <summary>Don’t format error codes into messages</summary>
+ /// <remarks>It’s recommended to use this flag in .NET.<br/>
+ /// The standard library already formats these messages automatically, as needed.</remarks>
+ SkipFormatMessage = 2,
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/eModelImplementation.cs b/WhisperNet/API/eModelImplementation.cs
new file mode 100644
index 0000000..1b0a079
--- /dev/null
+++ b/WhisperNet/API/eModelImplementation.cs
@@ -0,0 +1,25 @@
+namespace Whisper
+{
+ /// <summary>Implementation value for the <see cref="Library.loadModel(string, eModelImplementation)" /> factory function</summary>
+ public enum eModelImplementation: uint
+ {
+ /// <summary>GPGPU implementation based on Direct3D 11.0 compute shaders</summary>
+ GPU = 1,
+
+ /// <summary>A hybrid implementation which uses DirectCompute for encode, and decodes on CPU</summary>
+ /// <remarks>
+ /// <para>The build of the native DLL included into this nuget package doesn’t implement this version.<br/>
+ /// To enable, edit <c>stdafx.h</c> in Whisper project, change the value of <c>BUILD_HYBRID_VERSION</c> macro from zero to one, and build.</para>
+ /// <para>This implementation requires a CPU with AVX1, FMA3, F16C and BMI1 instruction set extensions.</para>
+ /// </remarks>
+ Hybrid = 2,
+
+ /// <summary>A reference implementation which uses the original GGML CPU-running code.</summary>
+ /// <remarks>
+ /// <para>The build of the native DLL included into this nuget package doesn’t implement this version either.<br/>
+ /// To enable, edit <c>stdafx.h</c> in Whisper project, change the value of <c>BUILD_BOTH_VERSIONS</c> macro from zero to one, and build the project.</para>
+ /// <para>This implementation requires a CPU with AVX1, FMA3, and F16C instruction set extensions.</para>
+ /// </remarks>
+ Reference = 3,
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/eResultFlags.cs b/WhisperNet/API/eResultFlags.cs
new file mode 100644
index 0000000..1de61ab
--- /dev/null
+++ b/WhisperNet/API/eResultFlags.cs
@@ -0,0 +1,21 @@
+namespace Whisper
+{
+ /// <summary>Flags for <see cref="Context.results(eResultFlags)" /> method</summary>
+ [Flags]
+ public enum eResultFlags: uint
+ {
+ /// <summary>No flags</summary>
+ None = 0,
+
+ /// <summary>Return individual tokens in addition to the segments</summary>
+ Tokens = 1,
+
+ /// <summary>Return timestamps</summary>
+ Timestamps = 2,
+
+ /// <summary>Create a new COM object for the results.</summary>
+ /// <remarks>Without this flag, the context returns a pointer to the COM object stored in the context.<br/>
+ /// The content of that object is replaced every time you call <see cref="Internal.iContext.getResults(eResultFlags)" /> method.</remarks>
+ NewObject = 0x100,
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/iAudioBuffer.cs b/WhisperNet/API/iAudioBuffer.cs
new file mode 100644
index 0000000..1b35621
--- /dev/null
+++ b/WhisperNet/API/iAudioBuffer.cs
@@ -0,0 +1,27 @@
+using ComLight;
+using System.Runtime.InteropServices;
+
+namespace Whisper
+{
+ /// <summary>A buffer with a chunk of audio.</summary>
+ /// <remarks>Note the interface supports both marshaling directions.<br/>
+ /// I have not tested, but you should be able to implement this interface in C#, to supply PCM audio data to the native code</remarks>
+ [ComInterface( "013583aa-c9eb-42bc-83db-633c2c317051", eMarshalDirection.BothWays )]
+ public interface iAudioBuffer: IDisposable
+ {
+ /// <summary>Count of samples in the buffer</summary>
+ int countSamples();
+
+ /// <summary>Unmanaged pointer to the internal buffer containing single-channel FP32 samples.</summary>
+ /// <remarks>If you are implementing this interface in C# and your audio data is on the managed heap, use <see cref="GCHandle" /> to make sure it doesn't move.<br/>
+ /// Or better yet, move the data to unmanaged buffer allocated with <see cref="Marshal.AllocHGlobal(int)" /> or <see cref="Marshal.AllocCoTaskMem(int)" /> method.</remarks>
+ IntPtr getPcmMono();
+
+ /// <summary>Unmanaged pointer to the internal buffer containing stereo FP32 samples.</summary>
+ /// <remarks>When the buffer doesn’t have stereo data, the method returns <see cref="IntPtr.Zero" />.</remarks>
+ IntPtr getPcmStereo();
+
+ /// <summary>Start time of the buffer, relative to the start of the media</summary>
+ void getTime( out TimeSpan time );
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/iAudioReader.cs b/WhisperNet/API/iAudioReader.cs
new file mode 100644
index 0000000..68cf916
--- /dev/null
+++ b/WhisperNet/API/iAudioReader.cs
@@ -0,0 +1,23 @@
+using ComLight;
+
+namespace Whisper
+{
+ /// <summary>Audio stream reader object</summary>
+ /// <remarks>The implementation is forward-only, and these objects are not reusable.<br/>
+ /// To read a source file multiple times, dispose and re-create the reader.</remarks>
+ [ComInterface( "35b988da-04a6-476a-a193-d8891d5dc390", eMarshalDirection.ToManaged )]
+ public interface iAudioReader: IDisposable
+ {
+ /// <summary>Get duration of the media file</summary>
+ [RetValIndex]
+ TimeSpan getDuration();
+ }
+
+ /// <summary>Audio capture reader object</summary>
+ /// <remarks>This interface has no public methods callable from C#.<br/>
+ /// It’s only here to pass data between different functions implemented in C++.</remarks>
+ [ComInterface( "747752c2-d9fd-40df-8847-583c781bf013", eMarshalDirection.ToManaged )]
+ public interface iAudioCapture: IDisposable
+ {
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/iMediaFoundation.cs b/WhisperNet/API/iMediaFoundation.cs
new file mode 100644
index 0000000..535f904
--- /dev/null
+++ b/WhisperNet/API/iMediaFoundation.cs
@@ -0,0 +1,36 @@
+using ComLight;
+using System.Runtime.InteropServices;
+using Whisper.Internal;
+
+namespace Whisper
+{
+ /// <summary>Exposes a small subset of MS Media Foundation framework.</summary>
+ /// <remarks>That framework is a part of Windows OS, since Vista.</remarks>
+ /// <seealso href="https://learn.microsoft.com/en-us/windows/win32/medfound/microsoft-media-foundation-sdk" />
+ [ComInterface( "fb9763a5-d77d-4b6e-aff8-f494813cebd8", eMarshalDirection.ToManaged ), CustomConventions( typeof( NativeLogger ) )]
+ public interface iMediaFoundation: IDisposable
+ {
+ /// <summary>Decode complete audio file into a new memory buffer.</summary>
+ /// <remarks>
+ /// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.<br/>
+ /// If the path is a video file, the method will decode the first audio track.
+ /// </remarks>
+ [RetValIndex( 2 )]
+ iAudioBuffer loadAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false );
+
+ /// <summary>Create a reader to stream the audio file from disk</summary>
+ /// <remarks>
+ /// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.<br/>
+ /// If the path is a video file, the method will decode the first audio track.
+ /// </remarks>
+ [RetValIndex( 2 )]
+ iAudioReader openAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false );
+
+ /// <summary>List capture devices</summary>
+ void listCaptureDevices( [MarshalAs( UnmanagedType.FunctionPtr )] pfnFoundCaptureDevices pfn, IntPtr pv );
+
+ /// <summary>Open audio capture device</summary>
+ [RetValIndex( 2 )]
+ iAudioCapture openCaptureDevice( [MarshalAs( UnmanagedType.LPWStr )] string endpoint, [In] ref sCaptureParams captureParams );
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/iModel.cs b/WhisperNet/API/iModel.cs
new file mode 100644
index 0000000..8ec6d17
--- /dev/null
+++ b/WhisperNet/API/iModel.cs
@@ -0,0 +1,27 @@
+using ComLight;
+using System.ComponentModel;
+
+namespace Whisper
+{
+ /// <summary>A model in VRAM, loaded from GGML file.</summary>
+ /// <remarks>This object doesn't keep any mutable state, and can be safely used from multiple threads concurrently</remarks>
+ [ComInterface( "abefb4c9-e8d8-46a3-8747-5afbadef1adb", eMarshalDirection.ToManaged ), CustomConventions( typeof( Internal.NativeLogger ) )]
+ public interface iModel: IDisposable
+ {
+ /// <summary>Create a context to transcribe audio with this model</summary>
+ /// <remarks>Don't call this method, use <see cref="ExtensionMethods.createContext(iModel)" /> instead.</remarks>
+ [RetValIndex, EditorBrowsable( EditorBrowsableState.Never )]
+ Internal.iContext createContextInternal();
+
+ /// <summary>True if this model is multi-lingual</summary>
+ bool isMultilingual();
+
+ /// <summary>Retrieve integer IDs of the special tokens defined by the model</summary>
+ [RetValIndex]
+ SpecialTokens getSpecialTokens();
+
+ /// <summary>Try to resolve integer token ID into string.</summary>
+ /// <remarks>Don't call this method, use <see cref="ExtensionMethods.stringFromToken(iModel, int)" /> instead.</remarks>
+ IntPtr stringFromTokenInternal( int id );
+ }
+} \ No newline at end of file
diff --git a/WhisperNet/API/sCaptureParams.cs b/WhisperNet/API/sCaptureParams.cs
new file mode 100644
index 0000000..7595a69
--- /dev/null
+++ b/WhisperNet/API/sCaptureParams.cs
@@ -0,0 +1,37 @@
+namespace Whisper
+{
+ /// <summary>Flags for the audio capture</summary>
+ [Flags]
+ public enum eCaptureFlags: uint
+ {
+ /// <summary>No special flags</summary>
+ None = 0,
+ /// <summary>When the capture device supports stereo, keep stereo PCM samples in addition to mono</summary>
+ Stereo = 1,
+ }
+
+ /// <summary>Parameters for audio capture</summary>
+ public struct sCaptureParams
+ {
+ /// <summary>Minimum transcribe duration in seconds</summary>
+ public float minDuration;
+ /// <summary>Maximum transcribe duration in seconds</summary>
+ public float maxDuration;
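+ /// <summary>Duration in seconds, 0.25 (250 ms) by default. NOTE(review): presumably the leading silence dropped from a recording — confirm</summary>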
+ public float dropStartSilence;
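+ /// <summary>Duration in seconds, 0.333 (333 ms) by default. NOTE(review): presumably the pause length which ends a recorded segment — confirm</summary>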
+ public float pauseDuration;
+ /// <summary>Flags for the audio capture</summary>
+ public eCaptureFlags flags;
+
+ /// <summary>Initialize the structure with some reasonable default values</summary>
+ public sCaptureParams()
+ {
+ minDuration = 7.0f; // 7 seconds
+ maxDuration = 11.0f; // 11 seconds
+ dropStartSilence = 0.25f; // 250 ms
+ pauseDuration = 0.333f; // 333 ms
+ flags = eCaptureFlags.None;
+ }
+ }
+} \ No newline at end of file