diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /WhisperNet/API | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'WhisperNet/API')
| -rw-r--r-- | WhisperNet/API/CaptureDeviceId.cs | 24 | ||||
| -rw-r--r-- | WhisperNet/API/Parameters.cs | 95 | ||||
| -rw-r--r-- | WhisperNet/API/SpecialTokens.cs | 23 | ||||
| -rw-r--r-- | WhisperNet/API/eCaptureStatus.cs | 19 | ||||
| -rw-r--r-- | WhisperNet/API/eLanguage.cs | 206 | ||||
| -rw-r--r-- | WhisperNet/API/eLogLevel.cs | 34 | ||||
| -rw-r--r-- | WhisperNet/API/eModelImplementation.cs | 25 | ||||
| -rw-r--r-- | WhisperNet/API/eResultFlags.cs | 21 | ||||
| -rw-r--r-- | WhisperNet/API/iAudioBuffer.cs | 27 | ||||
| -rw-r--r-- | WhisperNet/API/iAudioReader.cs | 23 | ||||
| -rw-r--r-- | WhisperNet/API/iMediaFoundation.cs | 36 | ||||
| -rw-r--r-- | WhisperNet/API/iModel.cs | 27 | ||||
| -rw-r--r-- | WhisperNet/API/sCaptureParams.cs | 37 |
13 files changed, 597 insertions, 0 deletions
// File: WhisperNet/API/CaptureDeviceId.cs
using Whisper.Internal;

namespace Whisper
{
	/// <summary>Two strings which together identify an audio capture device</summary>
	public record struct CaptureDeviceId
	{
		/// <summary>Human-readable name of the device.<br/>
		/// Good for showing in a user interface, but several devices may share the same name.</summary>
		public string displayName;

		/// <summary>Endpoint ID of the audio capture device.<br/>
		/// Unique within the system, but not intended to be read by humans.</summary>
		public string endpoint;

		/// <summary>Copy both strings from the interop structure</summary>
		/// <remarks>A missing display name is replaced with a placeholder string; a missing endpoint ID is an error.</remarks>
		/// <exception cref="ApplicationException">The source structure has no endpoint ID</exception>
		internal CaptureDeviceId( in sCaptureDevice rsi )
		{
			// The display name is optional, fall back to a placeholder
			var name = rsi.displayName;
			if( name is null )
				name = "<display name unavailable>";
			displayName = name;

			// The endpoint ID is mandatory
			var id = rsi.endpoint;
			if( id is null )
				throw new ApplicationException( "The device has no endpoint ID" );
			endpoint = id;
		}

		/// <summary>Format a short description which includes the display name in double quotes</summary>
		public override string ToString()
		{
			return "Capture device: \"" + displayName + "\"";
		}
	}
}
// File: WhisperNet/API/Parameters.cs
// Missing XML comment for publicly visible type or member
// TODO: remove this line and document them.
#pragma warning disable CS1591

namespace Whisper
{
	/// <summary>Available sampling strategies</summary>
	public enum eSamplingStrategy: int
	{
		/// <summary>Always select the most probable token</summary>
		Greedy,
		/// <summary>TODO: not implemented yet!</summary>
		BeamSearch,
	}

	/// <summary>Bit flags stored in the <see cref="Parameters.flags" /> field</summary>
	[Flags]
	public enum eFullParamsFlags: uint
	{
		/// <summary>No flags are set</summary>
		None = 0,
		Translate = 1,
		NoContext = 2,
		SingleSegment = 4,
		PrintSpecial = 8,
		PrintProgress = 0x10,
		PrintRealtime = 0x20,
		PrintTimestamps = 0x40,

		// Experimental
		TokenTimestamps = 0x100,
		SpeedupAudio = 0x200,
	}

	/// <summary>Transcribe parameters</summary>
	/// <remarks>NOTE(review): presumably this structure is passed to the native library by value,
	/// in which case the order and the types of the fields define the binary layout. Confirm before reordering anything.</remarks>
	public struct Parameters
	{
		/// <summary>Sampling strategy</summary>
		public eSamplingStrategy strategy;

		/// <summary>Count of CPU worker threads to use</summary>
		/// <remarks>So far, the GPU model only uses CPU threads for MEL spectrograms</remarks>
		public int cpuThreads;

		public int n_max_text_ctx;
		/// <summary>start offset in ms</summary>
		public int offset_ms;
		/// <summary>audio duration to process in ms</summary>
		public int duration_ms;
		/// <summary>Combination of the <see cref="eFullParamsFlags" /> bits; use <see cref="setFlag" /> to change individual bits</summary>
		public eFullParamsFlags flags;

		/// <summary>Set or clear the specified flag in the <see cref="flags" /> field of this structure</summary>
		/// <param name="flag">The bit, or a combination of bits, to change. Must not be <see cref="eFullParamsFlags.None" />.</param>
		/// <param name="set"><c>true</c> to set the bits, <c>false</c> to clear them</param>
		/// <exception cref="ArgumentException"><paramref name="flag" /> is <see cref="eFullParamsFlags.None" /></exception>
		public void setFlag( eFullParamsFlags flag, bool set )
		{
			// Changing "no bits" is a caller error; report which argument is wrong, and why
			if( flag == eFullParamsFlags.None )
				throw new ArgumentException( "The flag to change must not be eFullParamsFlags.None", nameof( flag ) );

			if( set )
				flags |= flag;
			else
				flags &= ~flag;
		}

		/// <summary>Language</summary>
		public eLanguage language;

		// [EXPERIMENTAL] token-level timestamps
		/// <summary>timestamp token probability threshold (~0.01)</summary>
		public float thold_pt;
		/// <summary>timestamp token sum probability threshold (~0.01)</summary>
		public float thold_ptsum;
		/// <summary>max segment length in characters</summary>
		public int max_len;
		/// <summary>max tokens per segment (0 = no limit)</summary>
		public int max_tokens;

		/// <summary>Parameters of the <see cref="eSamplingStrategy.Greedy" /> strategy</summary>
		public struct sGreedy
		{
			public int n_past;
		}
		public sGreedy greedy;

		/// <summary>Parameters of the <see cref="eSamplingStrategy.BeamSearch" /> strategy</summary>
		public struct sBeamSearch
		{
			public int n_past;
			public int beam_width;
			public int n_best;
		}
		public sBeamSearch beamSearch;

		// [EXPERIMENTAL] speed-up techniques
		/// <summary>overwrite the audio context size (0 = use default)</summary>
		public int audioContextSize;
	}
}
// File: WhisperNet/API/SpecialTokens.cs
namespace Whisper
{
	/// <summary>Integer IDs of the special tokens defined in the model</summary>
	/// <remarks>The trailing comments are the names of the corresponding native fields.<br/>
	/// The structure is 8 consecutive 32-bit integers; keep the order of the fields.</remarks>
	public readonly struct SpecialTokens
	{
		// token_eot
		/// <summary>Marks the end of a transcription</summary>
		public readonly int TranscriptionEnd;

		// token_sot
		/// <summary>Marks the start of a transcription</summary>
		public readonly int TranscriptionStart;

		// token_prev
		/// <summary>Stands for the previous word in the transcription.
		/// It helps the model to predict the current word, based on the context of the words which came before it.</summary>
		public readonly int PreviousWord;

		// token_solm
		/// <summary>Start of a sentence</summary>
		/// <remarks>NOTE(review): the native name is <c>token_solm</c>; confirm this description against the native code.</remarks>
		public readonly int SentenceStart;

		// token_not
		/// <summary>Represents the word "not" in the transcription</summary>
		/// <remarks>NOTE(review): in whisper.cpp <c>token_not</c> appears to mean "no timestamps" rather than the word "not"; confirm.</remarks>
		public readonly int Not;

		// token_beg
		/// <summary>New transcription</summary>
		public readonly int TranscriptionBegin;

		/// <summary>token_translate</summary>
		public readonly int TaskTranslate;

		/// <summary>token_transcribe</summary>
		public readonly int TaskTranscribe;
	}
}
// File: WhisperNet/API/eCaptureStatus.cs
namespace Whisper
{
	/// <summary>Bit flags which describe the current status of the voice capture</summary>
	[Flags]
	public enum eCaptureStatus: byte
	{
		/// <summary>Idle, nothing is going on</summary>
		None = 0,

		/// <summary>The audio is being captured</summary>
		Listening = 1 << 0,

		/// <summary>The captured audio contains a voice, and it is being recorded</summary>
		Voice = 1 << 1,

		/// <summary>A recorded piece of the audio is being transcribed</summary>
		Transcribing = 1 << 2,

		/// <summary>The computer can not transcribe the audio quickly enough,<br/>
		/// and the capture is dropping the incoming audio samples.</summary>
		Stalled = 1 << 7,
	}
}
// File: WhisperNet/API/eLanguage.cs
// This file is generated by a tool, from the `languageCodez.tsv` file in this repository
namespace Whisper
{
	/// <summary>Supported languages</summary>
	/// <remarks>Every value is the lower-case ASCII language code packed into an integer, first character in the lowest byte.<br/>
	/// For example, <c>"en"</c> is <c>'e' | ( 'n' &lt;&lt; 8 )</c> = <c>0x6E65</c>. The code is quoted in the summary of each member.</remarks>
	public enum eLanguage: uint
	{
		/// <summary>Afrikaans, code "af"</summary>
		Afrikaans = 0x6661,
		/// <summary>Albanian, code "sq"</summary>
		Albanian = 0x7173,
		/// <summary>Amharic, code "am"</summary>
		Amharic = 0x6D61,
		/// <summary>Arabic, code "ar"</summary>
		Arabic = 0x7261,
		/// <summary>Armenian, code "hy"</summary>
		Armenian = 0x7968,
		/// <summary>Assamese, code "as"</summary>
		Assamese = 0x7361,
		/// <summary>Azerbaijani, code "az"</summary>
		Azerbaijani = 0x7A61,
		/// <summary>Bashkir, code "ba"</summary>
		Bashkir = 0x6162,
		/// <summary>Basque, code "eu"</summary>
		Basque = 0x7565,
		/// <summary>Belarusian, code "be"</summary>
		Belarusian = 0x6562,
		/// <summary>Bengali, code "bn"</summary>
		Bengali = 0x6E62,
		/// <summary>Bosnian, code "bs"</summary>
		Bosnian = 0x7362,
		/// <summary>Breton, code "br"</summary>
		Breton = 0x7262,
		/// <summary>Bulgarian, code "bg"</summary>
		Bulgarian = 0x6762,
		/// <summary>Catalan, code "ca"</summary>
		Catalan = 0x6163,
		/// <summary>Chinese, code "zh"</summary>
		Chinese = 0x687A,
		/// <summary>Croatian, code "hr"</summary>
		Croatian = 0x7268,
		/// <summary>Czech, code "cs"</summary>
		Czech = 0x7363,
		/// <summary>Danish, code "da"</summary>
		Danish = 0x6164,
		/// <summary>Dutch, code "nl"</summary>
		Dutch = 0x6C6E,
		/// <summary>English, code "en"</summary>
		English = 0x6E65,
		/// <summary>Estonian, code "et"</summary>
		Estonian = 0x7465,
		/// <summary>Faroese, code "fo"</summary>
		Faroese = 0x6F66,
		/// <summary>Finnish, code "fi"</summary>
		Finnish = 0x6966,
		/// <summary>French, code "fr"</summary>
		French = 0x7266,
		/// <summary>Galician, code "gl"</summary>
		Galician = 0x6C67,
		/// <summary>Georgian, code "ka"</summary>
		Georgian = 0x616B,
		/// <summary>German, code "de"</summary>
		German = 0x6564,
		/// <summary>Greek, code "el"</summary>
		Greek = 0x6C65,
		/// <summary>Gujarati, code "gu"</summary>
		Gujarati = 0x7567,
		/// <summary>Haitian Creole, code "ht"</summary>
		HaitianCreole = 0x7468,
		/// <summary>Hausa, code "ha"</summary>
		Hausa = 0x6168,
		/// <summary>Hawaiian, code "haw" (the only three-character code in this list)</summary>
		Hawaiian = 0x776168,
		/// <summary>Hebrew, code "iw"</summary>
		Hebrew = 0x7769,
		/// <summary>Hindi, code "hi"</summary>
		Hindi = 0x6968,
		/// <summary>Hungarian, code "hu"</summary>
		Hungarian = 0x7568,
		/// <summary>Icelandic, code "is"</summary>
		Icelandic = 0x7369,
		/// <summary>Indonesian, code "id"</summary>
		Indonesian = 0x6469,
		/// <summary>Italian, code "it"</summary>
		Italian = 0x7469,
		/// <summary>Japanese, code "ja"</summary>
		Japanese = 0x616A,
		/// <summary>Javanese, code "jw"</summary>
		Javanese = 0x776A,
		/// <summary>Kannada, code "kn"</summary>
		Kannada = 0x6E6B,
		/// <summary>Kazakh, code "kk"</summary>
		Kazakh = 0x6B6B,
		/// <summary>Khmer, code "km"</summary>
		Khmer = 0x6D6B,
		/// <summary>Korean, code "ko"</summary>
		Korean = 0x6F6B,
		/// <summary>Lao, code "lo"</summary>
		Lao = 0x6F6C,
		/// <summary>Latin, code "la"</summary>
		Latin = 0x616C,
		/// <summary>Latvian, code "lv"</summary>
		Latvian = 0x766C,
		/// <summary>Lingala, code "ln"</summary>
		Lingala = 0x6E6C,
		/// <summary>Lithuanian, code "lt"</summary>
		Lithuanian = 0x746C,
		/// <summary>Luxembourgish, code "lb"</summary>
		Luxembourgish = 0x626C,
		/// <summary>Macedonian, code "mk"</summary>
		Macedonian = 0x6B6D,
		/// <summary>Malagasy, code "mg"</summary>
		Malagasy = 0x676D,
		/// <summary>Malay, code "ms"</summary>
		Malay = 0x736D,
		/// <summary>Malayalam, code "ml"</summary>
		Malayalam = 0x6C6D,
		/// <summary>Maltese, code "mt"</summary>
		Maltese = 0x746D,
		/// <summary>Maori, code "mi"</summary>
		Maori = 0x696D,
		/// <summary>Marathi, code "mr"</summary>
		Marathi = 0x726D,
		/// <summary>Mongolian, code "mn"</summary>
		Mongolian = 0x6E6D,
		/// <summary>Myanmar, code "my"</summary>
		Myanmar = 0x796D,
		/// <summary>Nepali, code "ne"</summary>
		Nepali = 0x656E,
		/// <summary>Norwegian, code "no"</summary>
		Norwegian = 0x6F6E,
		/// <summary>Nynorsk, code "nn"</summary>
		Nynorsk = 0x6E6E,
		/// <summary>Occitan, code "oc"</summary>
		Occitan = 0x636F,
		/// <summary>Pashto, code "ps"</summary>
		Pashto = 0x7370,
		/// <summary>Persian, code "fa"</summary>
		Persian = 0x6166,
		/// <summary>Polish, code "pl"</summary>
		Polish = 0x6C70,
		/// <summary>Portuguese, code "pt"</summary>
		Portuguese = 0x7470,
		/// <summary>Punjabi, code "pa"</summary>
		Punjabi = 0x6170,
		/// <summary>Romanian, code "ro"</summary>
		Romanian = 0x6F72,
		/// <summary>Russian, code "ru"</summary>
		Russian = 0x7572,
		/// <summary>Sanskrit, code "sa"</summary>
		Sanskrit = 0x6173,
		/// <summary>Serbian, code "sr"</summary>
		Serbian = 0x7273,
		/// <summary>Shona, code "sn"</summary>
		Shona = 0x6E73,
		/// <summary>Sindhi, code "sd"</summary>
		Sindhi = 0x6473,
		/// <summary>Sinhala, code "si"</summary>
		Sinhala = 0x6973,
		/// <summary>Slovak, code "sk"</summary>
		Slovak = 0x6B73,
		/// <summary>Slovenian, code "sl"</summary>
		Slovenian = 0x6C73,
		/// <summary>Somali, code "so"</summary>
		Somali = 0x6F73,
		/// <summary>Spanish, code "es"</summary>
		Spanish = 0x7365,
		/// <summary>Sundanese, code "su"</summary>
		Sundanese = 0x7573,
		/// <summary>Swahili, code "sw"</summary>
		Swahili = 0x7773,
		/// <summary>Swedish, code "sv"</summary>
		Swedish = 0x7673,
		/// <summary>Tagalog, code "tl"</summary>
		Tagalog = 0x6C74,
		/// <summary>Tajik, code "tg"</summary>
		Tajik = 0x6774,
		/// <summary>Tamil, code "ta"</summary>
		Tamil = 0x6174,
		/// <summary>Tatar, code "tt"</summary>
		Tatar = 0x7474,
		/// <summary>Telugu, code "te"</summary>
		Telugu = 0x6574,
		/// <summary>Thai, code "th"</summary>
		Thai = 0x6874,
		/// <summary>Tibetan, code "bo"</summary>
		Tibetan = 0x6F62,
		/// <summary>Turkish, code "tr"</summary>
		Turkish = 0x7274,
		/// <summary>Turkmen, code "tk"</summary>
		Turkmen = 0x6B74,
		/// <summary>Ukrainian, code "uk"</summary>
		Ukrainian = 0x6B75,
		/// <summary>Urdu, code "ur"</summary>
		Urdu = 0x7275,
		/// <summary>Uzbek, code "uz"</summary>
		Uzbek = 0x7A75,
		/// <summary>Vietnamese, code "vi"</summary>
		Vietnamese = 0x6976,
		/// <summary>Welsh, code "cy"</summary>
		Welsh = 0x7963,
		/// <summary>Yiddish, code "yi"</summary>
		Yiddish = 0x6979,
		/// <summary>Yoruba, code "yo"</summary>
		Yoruba = 0x6F79,
	}
}
// File: WhisperNet/API/eLogLevel.cs
namespace Whisper
{
	/// <summary>Severity level of a log message</summary>
	public enum eLogLevel: byte
	{
		/// <summary>The message reports an error</summary>
		Error = 0,

		/// <summary>The message is a warning</summary>
		Warning = 1,

		/// <summary>The message is informational</summary>
		Info = 2,

		/// <summary>The message is for debugging</summary>
		Debug = 3,
	}

	/// <summary>Callback which receives log messages from the library</summary>
	/// <param name="level">Severity of the message</param>
	/// <param name="message">Text of the message</param>
	public delegate void pfnLogMessage( eLogLevel level, string message );

	/// <summary>Bit flags which control the destination of the log messages</summary>
	[Flags]
	public enum eLoggerFlags: byte
	{
		/// <summary>No special flags</summary>
		None = 0,

		/// <summary>Besides calling the delegate, also print the messages to standard error</summary>
		UseStandardError = 1 << 0,

		/// <summary>Do not format error codes into messages</summary>
		/// <remarks>This flag is recommended in .NET.<br/>
		/// The standard library already formats these messages automatically, as needed.</remarks>
		SkipFormatMessage = 1 << 1,
	}
}
// File: WhisperNet/API/eModelImplementation.cs
namespace Whisper
{
	/// <summary>Selects the model implementation in the <see cref="Library.loadModel(string, eModelImplementation)" /> factory function</summary>
	public enum eModelImplementation: uint
	{
		/// <summary>GPGPU implementation, built on Direct3D 11.0 compute shaders</summary>
		GPU = 1,

		/// <summary>Hybrid implementation: DirectCompute for encode, CPU for decode</summary>
		/// <remarks>
		/// <para>This version is not implemented by the build of the native DLL included into this nuget package.<br/>
		/// To enable, edit <c>stdafx.h</c> in Whisper project, change the value of <c>BUILD_HYBRID_VERSION</c> macro from zero to one, and build.</para>
		/// <para>The implementation needs a CPU with AVX1, FMA3, F16C and BMI1 instruction set extensions.</para>
		/// </remarks>
		Hybrid = 2,

		/// <summary>Reference implementation, which runs the original GGML code on CPU.</summary>
		/// <remarks>
		/// <para>This version is not implemented by the build of the native DLL included into this nuget package either.<br/>
		/// To enable, edit <c>stdafx.h</c> in Whisper project, change the value of <c>BUILD_BOTH_VERSIONS</c> macro from zero to one, and build the project.</para>
		/// <para>The implementation needs a CPU with AVX1, FMA3, and F16C instruction set extensions.</para>
		/// </remarks>
		Reference = 3,
	}
}
// File: WhisperNet/API/eResultFlags.cs
namespace Whisper
{
	/// <summary>Bit flags accepted by the <see cref="Context.results(eResultFlags)" /> method</summary>
	[Flags]
	public enum eResultFlags: uint
	{
		/// <summary>No flags</summary>
		None = 0,

		/// <summary>Besides the segments, also return the individual tokens</summary>
		Tokens = 1 << 0,

		/// <summary>Return timestamps</summary>
		Timestamps = 1 << 1,

		/// <summary>Make a new COM object for the results.</summary>
		/// <remarks>When the flag is not set, the context returns a pointer to the COM object stored in the context.<br/>
		/// The content of that object is replaced on every call of the <see cref="Internal.iContext.getResults(eResultFlags)" /> method.</remarks>
		NewObject = 1 << 8,
	}
}
// File: WhisperNet/API/iAudioBuffer.cs
using ComLight;
using System.Runtime.InteropServices;

namespace Whisper
{
	/// <summary>A buffer which holds a chunk of audio.</summary>
	/// <remarks>The interface is declared for both marshaling directions.<br/>
	/// This was not tested, but it should be possible to implement the interface in C#, to supply PCM audio data to the native code.<br/>
	/// Do not reorder the methods: the interface is consumed across the COM interop boundary.</remarks>
	[ComInterface( "013583aa-c9eb-42bc-83db-633c2c317051", eMarshalDirection.BothWays )]
	public interface iAudioBuffer: IDisposable
	{
		/// <summary>Get the count of samples in the buffer</summary>
		int countSamples();

		/// <summary>Get an unmanaged pointer to the internal buffer with single-channel FP32 samples.</summary>
		/// <remarks>If you are implementing this interface in C# and the audio data is on the managed heap, use <see cref="GCHandle" /> to make sure it does not move.<br/>
		/// Or better yet, move the data to an unmanaged buffer allocated with <see cref="Marshal.AllocHGlobal(int)" /> or <see cref="Marshal.AllocCoTaskMem(int)" /> method.</remarks>
		IntPtr getPcmMono();

		/// <summary>Get an unmanaged pointer to the internal buffer with stereo FP32 samples.</summary>
		/// <remarks>The method returns <see cref="IntPtr.Zero" /> when the buffer has no stereo data.</remarks>
		IntPtr getPcmStereo();

		/// <summary>Get the start time of the buffer, relative to the start of the media</summary>
		/// <param name="time">Receives the start time</param>
		void getTime( out TimeSpan time );
	}
}
// File: WhisperNet/API/iAudioReader.cs
using ComLight;

namespace Whisper
{
	/// <summary>An object which reads an audio stream</summary>
	/// <remarks>The implementation is forward-only, and the objects are not reusable.<br/>
	/// To read a source file multiple times, dispose the reader and create a new one.</remarks>
	[ComInterface( "35b988da-04a6-476a-a193-d8891d5dc390", eMarshalDirection.ToManaged )]
	public interface iAudioReader: IDisposable
	{
		/// <summary>Query the duration of the media file</summary>
		/// <returns>Duration of the media file</returns>
		[RetValIndex]
		TimeSpan getDuration();
	}

	/// <summary>An object which reads from an audio capture device</summary>
	/// <remarks>The interface exposes no public methods callable from C#.<br/>
	/// It only exists to pass data between different functions implemented in C++.</remarks>
	[ComInterface( "747752c2-d9fd-40df-8847-583c781bf013", eMarshalDirection.ToManaged )]
	public interface iAudioCapture: IDisposable
	{
	}
}
// File: WhisperNet/API/iMediaFoundation.cs
using ComLight;
using System.Runtime.InteropServices;
using Whisper.Internal;

namespace Whisper
{
	/// <summary>A small subset of the MS Media Foundation framework.</summary>
	/// <remarks>The framework is a part of Windows OS, since Vista.<br/>
	/// Do not reorder the methods: the interface is consumed across the COM interop boundary.</remarks>
	/// <seealso href="https://learn.microsoft.com/en-us/windows/win32/medfound/microsoft-media-foundation-sdk" />
	[ComInterface( "fb9763a5-d77d-4b6e-aff8-f494813cebd8", eMarshalDirection.ToManaged )]
	[CustomConventions( typeof( NativeLogger ) )]
	public interface iMediaFoundation: IDisposable
	{
		/// <summary>Decode the complete audio file into a new memory buffer.</summary>
		/// <remarks>
		/// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.<br/>
		/// If the path is a video file, the method will decode the first audio track.
		/// </remarks>
		/// <param name="path">Path of the media file</param>
		/// <param name="stereo">NOTE(review): presumably asks to keep stereo samples in addition to mono; confirm against the native implementation</param>
		/// <returns>The buffer with the decoded audio</returns>
		[RetValIndex( 2 )]
		iAudioBuffer loadAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false );

		/// <summary>Create a reader which streams the audio file from disk</summary>
		/// <remarks>
		/// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.<br/>
		/// If the path is a video file, the method will decode the first audio track.
		/// </remarks>
		/// <param name="path">Path of the media file</param>
		/// <param name="stereo">NOTE(review): presumably asks to keep stereo samples in addition to mono; confirm against the native implementation</param>
		/// <returns>The reader object</returns>
		[RetValIndex( 2 )]
		iAudioReader openAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false );

		/// <summary>List the capture devices</summary>
		/// <param name="pfn">Callback, marshaled to the native code as a function pointer</param>
		/// <param name="pv">Pointer-sized value passed along with the callback; NOTE(review): presumably a caller-defined context, confirm</param>
		void listCaptureDevices( [MarshalAs( UnmanagedType.FunctionPtr )] pfnFoundCaptureDevices pfn, IntPtr pv );

		/// <summary>Open an audio capture device</summary>
		/// <param name="endpoint">Endpoint ID string of the device to open</param>
		/// <param name="captureParams">Parameters for the audio capture</param>
		/// <returns>The audio capture object</returns>
		[RetValIndex( 2 )]
		iAudioCapture openCaptureDevice( [MarshalAs( UnmanagedType.LPWStr )] string endpoint, [In] ref sCaptureParams captureParams );
	}
}
// File: WhisperNet/API/iModel.cs
using ComLight;
using System.ComponentModel;

namespace Whisper
{
	/// <summary>A model in VRAM, which was loaded from a GGML file.</summary>
	/// <remarks>The object does not keep any mutable state, and is safe to use from multiple threads concurrently.<br/>
	/// Do not reorder the methods: the interface is consumed across the COM interop boundary.</remarks>
	[ComInterface( "abefb4c9-e8d8-46a3-8747-5afbadef1adb", eMarshalDirection.ToManaged )]
	[CustomConventions( typeof( Internal.NativeLogger ) )]
	public interface iModel: IDisposable
	{
		/// <summary>Create a context which transcribes audio with this model</summary>
		/// <remarks>Do not call this method directly, use <see cref="ExtensionMethods.createContext(iModel)" /> instead.</remarks>
		[RetValIndex]
		[EditorBrowsable( EditorBrowsableState.Never )]
		Internal.iContext createContextInternal();

		/// <summary>Returns true when this model is multi-lingual</summary>
		bool isMultilingual();

		/// <summary>Get the integer IDs of the special tokens defined by the model</summary>
		[RetValIndex]
		SpecialTokens getSpecialTokens();

		/// <summary>Try to resolve an integer token ID into a string.</summary>
		/// <remarks>Do not call this method directly, use <see cref="ExtensionMethods.stringFromToken(iModel, int)" /> instead.</remarks>
		IntPtr stringFromTokenInternal( int id );
	}
}
// File: WhisperNet/API/sCaptureParams.cs
namespace Whisper
{
	/// <summary>Bit flags for the audio capture</summary>
	[Flags]
	public enum eCaptureFlags: uint
	{
		/// <summary>No special flags</summary>
		None = 0,

		/// <summary>If the capture device supports stereo, keep the stereo PCM samples in addition to mono</summary>
		Stereo = 1 << 0,
	}

	/// <summary>Parameters for the audio capture</summary>
	/// <remarks>All durations are in seconds. <c>new sCaptureParams()</c> produces the defaults listed on the fields,
	/// while <c>default( sCaptureParams )</c> is all zeros.</remarks>
	public struct sCaptureParams
	{
		/// <summary>Minimum transcribe duration in seconds; 7 seconds by default</summary>
		public float minDuration = 7.0f;

		/// <summary>Maximum transcribe duration in seconds; 11 seconds by default</summary>
		public float maxDuration = 11.0f;

		/// <summary>Duration in seconds; 250 ms by default.<br/>
		/// NOTE(review): presumably the amount of silence dropped from the start of the captured audio, confirm against the native code.</summary>
		public float dropStartSilence = 0.25f;

		/// <summary>Duration in seconds; 333 ms by default.<br/>
		/// NOTE(review): presumably the length of a pause which splits the captured speech, confirm against the native code.</summary>
		public float pauseDuration = 0.333f;

		/// <summary>Flags for the audio capture; no flags by default</summary>
		public eCaptureFlags flags = eCaptureFlags.None;

		/// <summary>Initialize the structure with some reasonable default values</summary>
		/// <remarks>The values come from the field initializers above; the explicit constructor is what makes them run.</remarks>
		public sCaptureParams()
		{
		}
	}
}
\ No newline at end of file |
