From 8c4603c73675958efc960fbd4bb599a2909d106a Mon Sep 17 00:00:00 2001 From: Konstantin Date: Mon, 16 Jan 2023 14:52:43 +0100 Subject: Source codes --- WhisperNet/API/CaptureDeviceId.cs | 24 ++++ WhisperNet/API/Parameters.cs | 95 +++++++++++++++ WhisperNet/API/SpecialTokens.cs | 23 ++++ WhisperNet/API/eCaptureStatus.cs | 19 +++ WhisperNet/API/eLanguage.cs | 206 +++++++++++++++++++++++++++++++++ WhisperNet/API/eLogLevel.cs | 34 ++++++ WhisperNet/API/eModelImplementation.cs | 25 ++++ WhisperNet/API/eResultFlags.cs | 21 ++++ WhisperNet/API/iAudioBuffer.cs | 27 +++++ WhisperNet/API/iAudioReader.cs | 23 ++++ WhisperNet/API/iMediaFoundation.cs | 36 ++++++ WhisperNet/API/iModel.cs | 27 +++++ WhisperNet/API/sCaptureParams.cs | 37 ++++++ 13 files changed, 597 insertions(+) create mode 100644 WhisperNet/API/CaptureDeviceId.cs create mode 100644 WhisperNet/API/Parameters.cs create mode 100644 WhisperNet/API/SpecialTokens.cs create mode 100644 WhisperNet/API/eCaptureStatus.cs create mode 100644 WhisperNet/API/eLanguage.cs create mode 100644 WhisperNet/API/eLogLevel.cs create mode 100644 WhisperNet/API/eModelImplementation.cs create mode 100644 WhisperNet/API/eResultFlags.cs create mode 100644 WhisperNet/API/iAudioBuffer.cs create mode 100644 WhisperNet/API/iAudioReader.cs create mode 100644 WhisperNet/API/iMediaFoundation.cs create mode 100644 WhisperNet/API/iModel.cs create mode 100644 WhisperNet/API/sCaptureParams.cs (limited to 'WhisperNet/API') diff --git a/WhisperNet/API/CaptureDeviceId.cs b/WhisperNet/API/CaptureDeviceId.cs new file mode 100644 index 0000000..9636e53 --- /dev/null +++ b/WhisperNet/API/CaptureDeviceId.cs @@ -0,0 +1,24 @@ +using Whisper.Internal; + +namespace Whisper +{ + /// Identifiers for an audio capture device + public record struct CaptureDeviceId + { + /// The display name is suitable for showing to the user, but might not be unique. + public string displayName; + + /// Endpoint ID for an audio capture device.
+ /// It uniquely identifies the device on the system, but is not a readable string.
+ public string endpoint; + + internal CaptureDeviceId( in sCaptureDevice rsi ) + { + displayName = rsi.displayName ?? ""; + endpoint = rsi.endpoint ?? throw new ApplicationException( "The device has no endpoint ID" ); + } + + /// Returns a String which represents the object instance + public override string ToString() => $"Capture device: \"{displayName}\""; + } +} \ No newline at end of file diff --git a/WhisperNet/API/Parameters.cs b/WhisperNet/API/Parameters.cs new file mode 100644 index 0000000..d2b53f9 --- /dev/null +++ b/WhisperNet/API/Parameters.cs @@ -0,0 +1,95 @@ +// Missing XML comment for publicly visible type or member +// TODO: remove this line and document them. +#pragma warning disable CS1591 + +namespace Whisper +{ + /// Available sampling strategies + public enum eSamplingStrategy: int + { + /// Always select the most probable token + Greedy, + /// TODO: not implemented yet! + BeamSearch, + }; + + [Flags] + public enum eFullParamsFlags: uint + { + None = 0, + Translate = 1, + NoContext = 2, + SingleSegment = 4, + PrintSpecial = 8, + PrintProgress = 0x10, + PrintRealtime = 0x20, + PrintTimestamps = 0x40, + + // Experimental + TokenTimestamps = 0x100, + SpeedupAudio = 0x200, + }; + + /// Transcribe parameters + public struct Parameters + { + /// Sampling strategy + public eSamplingStrategy strategy; + + /// Count of CPU worker threads to use + /// So far, the GPU model only uses CPU threads for MEL spectrograms + public int cpuThreads; + + public int n_max_text_ctx; + /// start offset in ms + public int offset_ms; + /// audio duration to process in ms + public int duration_ms; + public eFullParamsFlags flags; + + /// Set or clear the specified flag in the field of this structure + public void setFlag( eFullParamsFlags flag, bool set ) + { + if( flag != eFullParamsFlags.None ) + { + if( set ) + flags |= flag; + else + flags &= ~flag; + return; + } + throw new ArgumentException(); + } + + /// Language + public eLanguage language; + + // [EXPERIMENTAL] 
token-level timestamps + /// timestamp token probability threshold (~0.01) + public float thold_pt; + /// timestamp token sum probability threshold (~0.01) + public float thold_ptsum; + /// max segment length in characters + public int max_len; + /// max tokens per segment (0 = no limit) + public int max_tokens; + + public struct sGreedy + { + public int n_past; + } + public sGreedy greedy; + + public struct sBeamSearch + { + public int n_past; + public int beam_width; + public int n_best; + } + public sBeamSearch beamSearch; + + // [EXPERIMENTAL] speed-up techniques + /// overwrite the audio context size (0 = use default) + public int audioContextSize; + } +} \ No newline at end of file diff --git a/WhisperNet/API/SpecialTokens.cs b/WhisperNet/API/SpecialTokens.cs new file mode 100644 index 0000000..d672369 --- /dev/null +++ b/WhisperNet/API/SpecialTokens.cs @@ -0,0 +1,23 @@ +namespace Whisper +{ + /// Special tokens defined in the model + public readonly struct SpecialTokens + { + /// The end of a transcription + public readonly int TranscriptionEnd; // token_eot + /// Start of a transcription + public readonly int TranscriptionStart; // token_sot + /// Marks the start of previously decoded text supplied to the model as context. NOTE(review): presumably this is the start-of-previous-context token (token_prev) of whisper.cpp rather than a literal "previous word"; confirm against the native code. 
+ public readonly int PreviousWord; // token_prev + /// NOTE(review): token_solm presumably stands for "start of LM" rather than the start of a sentence; confirm against the native code + public readonly int SentenceStart; // token_solm + /// NOTE(review): whisper.cpp describes token_not as the "no timestamps" marker, not the word "not"; confirm against the native code + public readonly int Not; // token_not + /// NOTE(review): token_beg is presumably the first timestamp token (beginning of the timestamps range), not a "new transcription" marker; confirm + public readonly int TranscriptionBegin; // token_beg + /// Token which selects the translate task (token_translate) + public readonly int TaskTranslate; + /// Token which selects the transcribe task (token_transcribe) + public readonly int TaskTranscribe; + } +} \ No newline at end of file diff --git a/WhisperNet/API/eCaptureStatus.cs b/WhisperNet/API/eCaptureStatus.cs new file mode 100644 index 0000000..41f05fb --- /dev/null +++ b/WhisperNet/API/eCaptureStatus.cs @@ -0,0 +1,19 @@ +namespace Whisper +{ + /// Status of the voice capture + [Flags] + public enum eCaptureStatus: byte + { + /// Doing nothing + None = 0, + /// Capturing the audio + Listening = 1, + /// A voice is detected in the captured audio, recording + Voice = 2, + /// Transcribing a recorded piece of the audio + Transcribing = 4, + /// The computer is unable to transcribe the audio quickly enough,
+ /// and the capture is dropping the incoming audio samples.
+ Stalled = 0x80, + } +} \ No newline at end of file diff --git a/WhisperNet/API/eLanguage.cs b/WhisperNet/API/eLanguage.cs new file mode 100644 index 0000000..1241077 --- /dev/null +++ b/WhisperNet/API/eLanguage.cs @@ -0,0 +1,206 @@ +// This file is generated by a tool, from the `languageCodez.tsv` file in this repository +namespace Whisper +{ + /// Supported languages + public enum eLanguage: uint + { + /// Afrikaans + Afrikaans = 0x6661, + /// Albanian + Albanian = 0x7173, + /// Amharic + Amharic = 0x6D61, + /// Arabic + Arabic = 0x7261, + /// Armenian + Armenian = 0x7968, + /// Assamese + Assamese = 0x7361, + /// Azerbaijani + Azerbaijani = 0x7A61, + /// Bashkir + Bashkir = 0x6162, + /// Basque + Basque = 0x7565, + /// Belarusian + Belarusian = 0x6562, + /// Bengali + Bengali = 0x6E62, + /// Bosnian + Bosnian = 0x7362, + /// Breton + Breton = 0x7262, + /// Bulgarian + Bulgarian = 0x6762, + /// Catalan + Catalan = 0x6163, + /// Chinese + Chinese = 0x687A, + /// Croatian + Croatian = 0x7268, + /// Czech + Czech = 0x7363, + /// Danish + Danish = 0x6164, + /// Dutch + Dutch = 0x6C6E, + /// English + English = 0x6E65, + /// Estonian + Estonian = 0x7465, + /// Faroese + Faroese = 0x6F66, + /// Finnish + Finnish = 0x6966, + /// French + French = 0x7266, + /// Galician + Galician = 0x6C67, + /// Georgian + Georgian = 0x616B, + /// German + German = 0x6564, + /// Greek + Greek = 0x6C65, + /// Gujarati + Gujarati = 0x7567, + /// Haitian Creole + HaitianCreole = 0x7468, + /// Hausa + Hausa = 0x6168, + /// Hawaiian + Hawaiian = 0x776168, + /// Hebrew + Hebrew = 0x7769, + /// Hindi + Hindi = 0x6968, + /// Hungarian + Hungarian = 0x7568, + /// Icelandic + Icelandic = 0x7369, + /// Indonesian + Indonesian = 0x6469, + /// Italian + Italian = 0x7469, + /// Japanese + Japanese = 0x616A, + /// Javanese + Javanese = 0x776A, + /// Kannada + Kannada = 0x6E6B, + /// Kazakh + Kazakh = 0x6B6B, + /// Khmer + Khmer = 0x6D6B, + /// Korean + Korean = 0x6F6B, + /// Lao + Lao = 0x6F6C, + /// 
Latin + Latin = 0x616C, + /// Latvian + Latvian = 0x766C, + /// Lingala + Lingala = 0x6E6C, + /// Lithuanian + Lithuanian = 0x746C, + /// Luxembourgish + Luxembourgish = 0x626C, + /// Macedonian + Macedonian = 0x6B6D, + /// Malagasy + Malagasy = 0x676D, + /// Malay + Malay = 0x736D, + /// Malayalam + Malayalam = 0x6C6D, + /// Maltese + Maltese = 0x746D, + /// Maori + Maori = 0x696D, + /// Marathi + Marathi = 0x726D, + /// Mongolian + Mongolian = 0x6E6D, + /// Myanmar + Myanmar = 0x796D, + /// Nepali + Nepali = 0x656E, + /// Norwegian + Norwegian = 0x6F6E, + /// Nynorsk + Nynorsk = 0x6E6E, + /// Occitan + Occitan = 0x636F, + /// Pashto + Pashto = 0x7370, + /// Persian + Persian = 0x6166, + /// Polish + Polish = 0x6C70, + /// Portuguese + Portuguese = 0x7470, + /// Punjabi + Punjabi = 0x6170, + /// Romanian + Romanian = 0x6F72, + /// Russian + Russian = 0x7572, + /// Sanskrit + Sanskrit = 0x6173, + /// Serbian + Serbian = 0x7273, + /// Shona + Shona = 0x6E73, + /// Sindhi + Sindhi = 0x6473, + /// Sinhala + Sinhala = 0x6973, + /// Slovak + Slovak = 0x6B73, + /// Slovenian + Slovenian = 0x6C73, + /// Somali + Somali = 0x6F73, + /// Spanish + Spanish = 0x7365, + /// Sundanese + Sundanese = 0x7573, + /// Swahili + Swahili = 0x7773, + /// Swedish + Swedish = 0x7673, + /// Tagalog + Tagalog = 0x6C74, + /// Tajik + Tajik = 0x6774, + /// Tamil + Tamil = 0x6174, + /// Tatar + Tatar = 0x7474, + /// Telugu + Telugu = 0x6574, + /// Thai + Thai = 0x6874, + /// Tibetan + Tibetan = 0x6F62, + /// Turkish + Turkish = 0x7274, + /// Turkmen + Turkmen = 0x6B74, + /// Ukrainian + Ukrainian = 0x6B75, + /// Urdu + Urdu = 0x7275, + /// Uzbek + Uzbek = 0x7A75, + /// Vietnamese + Vietnamese = 0x6976, + /// Welsh + Welsh = 0x7963, + /// Yiddish + Yiddish = 0x6979, + /// Yoruba + Yoruba = 0x6F79, + } +} \ No newline at end of file diff --git a/WhisperNet/API/eLogLevel.cs b/WhisperNet/API/eLogLevel.cs new file mode 100644 index 0000000..ae494d4 --- /dev/null +++ b/WhisperNet/API/eLogLevel.cs @@ 
-0,0 +1,34 @@ +namespace Whisper +{ + /// Message log level + public enum eLogLevel: byte + { + /// Error message + Error = 0, + /// Warning message + Warning = 1, + /// Informational message + Info = 2, + /// Debug message + Debug = 3 + } + + /// A delegate to receive log messages from the library + public delegate void pfnLogMessage( eLogLevel level, string message ); + + /// Log destination flags + [Flags] + public enum eLoggerFlags: byte + { + /// No special flags + None = 0, + + /// In addition to calling the delegate, print messages to standard error + UseStandardError = 1, + + /// Don’t format error codes into messages + /// It’s recommended to use this flag in .NET.
+ /// The standard library already formats these messages automatically, as needed.
+ SkipFormatMessage = 2, + } +} \ No newline at end of file diff --git a/WhisperNet/API/eModelImplementation.cs b/WhisperNet/API/eModelImplementation.cs new file mode 100644 index 0000000..1b0a079 --- /dev/null +++ b/WhisperNet/API/eModelImplementation.cs @@ -0,0 +1,25 @@ +namespace Whisper +{ + /// Implementation value for the factory function + public enum eModelImplementation: uint + { + /// GPGPU implementation based on Direct3D 11.0 compute shaders + GPU = 1, + + /// A hybrid implementation which uses DirectCompute for encode, and decodes on CPU + /// + /// The build of the native DLL included into this nuget package doesn’t implement this version.
+ /// To enable, edit stdafx.h in Whisper project, change the value of BUILD_HYBRID_VERSION macro from zero to one, and build.
+ /// This implementation requires a CPU with AVX1, FMA3, F16C and BMI1 instruction set extensions. + ///
+ Hybrid = 2, + + /// A reference implementation which uses the original GGML CPU-running code. + /// + /// The build of the native DLL included into this nuget package doesn’t implement this version either.
+ /// To enable, edit stdafx.h in Whisper project, change the value of BUILD_BOTH_VERSIONS macro from zero to one, and build the project.
+ /// This implementation requires a CPU with AVX1, FMA3, and F16C instruction set extensions. + ///
+ Reference = 3, + } +} \ No newline at end of file diff --git a/WhisperNet/API/eResultFlags.cs b/WhisperNet/API/eResultFlags.cs new file mode 100644 index 0000000..1de61ab --- /dev/null +++ b/WhisperNet/API/eResultFlags.cs @@ -0,0 +1,21 @@ +namespace Whisper +{ + /// Flags for method + [Flags] + public enum eResultFlags: uint + { + /// No flags + None = 0, + + /// Return individual tokens in addition to the segments + Tokens = 1, + + /// Return timestamps + Timestamps = 2, + + /// Create a new COM object for the results. + /// Without this flag, the context returns a pointer to the COM object stored in the context.
+ /// The content of that object is replaced every time you call the method which takes these flags.
+ NewObject = 0x100, + } +} \ No newline at end of file diff --git a/WhisperNet/API/iAudioBuffer.cs b/WhisperNet/API/iAudioBuffer.cs new file mode 100644 index 0000000..1b35621 --- /dev/null +++ b/WhisperNet/API/iAudioBuffer.cs @@ -0,0 +1,27 @@ +using ComLight; +using System.Runtime.InteropServices; + +namespace Whisper +{ + /// A buffer with a chunk of audio. + /// Note the interface supports both marshaling directions.
+ /// I have not tested, but you should be able to implement this interface in C#, to supply PCM audio data to the native code
+ [ComInterface( "013583aa-c9eb-42bc-83db-633c2c317051", eMarshalDirection.BothWays )] + public interface iAudioBuffer: IDisposable + { + /// Count of samples in the buffer + int countSamples(); + + /// Unmanaged pointer to the internal buffer containing single-channel FP32 samples. + /// If you are implementing this interface in C# and your audio data is on the managed heap, use a pinned GCHandle to make sure it doesn't move.
+ /// Or better yet, move the data to an unmanaged buffer allocated with the Marshal.AllocHGlobal or Marshal.AllocCoTaskMem method.
+ IntPtr getPcmMono(); + + /// Unmanaged pointer to the internal buffer containing stereo FP32 samples. + /// When the buffer doesn’t have stereo data, the method returns IntPtr.Zero. + IntPtr getPcmStereo(); + + /// Start time of the buffer, relative to the start of the media + void getTime( out TimeSpan time ); + } +} \ No newline at end of file diff --git a/WhisperNet/API/iAudioReader.cs b/WhisperNet/API/iAudioReader.cs new file mode 100644 index 0000000..68cf916 --- /dev/null +++ b/WhisperNet/API/iAudioReader.cs @@ -0,0 +1,23 @@ +using ComLight; + +namespace Whisper +{ + /// Audio stream reader object + /// The implementation is forward-only, and these objects ain’t reusable.
+ /// To read a source file multiple times, dispose and re-create the reader.
+ [ComInterface( "35b988da-04a6-476a-a193-d8891d5dc390", eMarshalDirection.ToManaged )] + public interface iAudioReader: IDisposable + { + /// Get duration of the media file + [RetValIndex] + TimeSpan getDuration(); + } + + /// Audio capture reader object + /// This interface has no public methods callable from C#.
+ /// It’s only here to pass data between different functions implemented in C++.
+ [ComInterface( "747752c2-d9fd-40df-8847-583c781bf013", eMarshalDirection.ToManaged )] + public interface iAudioCapture: IDisposable + { + } +} \ No newline at end of file diff --git a/WhisperNet/API/iMediaFoundation.cs b/WhisperNet/API/iMediaFoundation.cs new file mode 100644 index 0000000..535f904 --- /dev/null +++ b/WhisperNet/API/iMediaFoundation.cs @@ -0,0 +1,36 @@ +using ComLight; +using System.Runtime.InteropServices; +using Whisper.Internal; + +namespace Whisper +{ + /// Exposes a small subset of MS Media Foundation framework. + /// That framework is a part of Windows OS, since Vista. + /// + [ComInterface( "fb9763a5-d77d-4b6e-aff8-f494813cebd8", eMarshalDirection.ToManaged ), CustomConventions( typeof( NativeLogger ) )] + public interface iMediaFoundation: IDisposable + { + /// Decode complete audio file into a new memory buffer. + /// + /// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.
+ /// If the path is a video file, the method will decode the first audio track. + ///
+ [RetValIndex( 2 )] + iAudioBuffer loadAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false ); + + /// Create a reader to stream the audio file from disk + /// + /// Under the hood, the method asks MF to resample and convert audio into the suitable type for the Whisper model.
+ /// If the path is a video file, the method will decode the first audio track. + ///
+ [RetValIndex( 2 )] + iAudioReader openAudioFile( [MarshalAs( UnmanagedType.LPWStr )] string path, [MarshalAs( UnmanagedType.U1 )] bool stereo = false ); + + /// List capture devices + void listCaptureDevices( [MarshalAs( UnmanagedType.FunctionPtr )] pfnFoundCaptureDevices pfn, IntPtr pv ); + + /// Open audio capture device + [RetValIndex( 2 )] + iAudioCapture openCaptureDevice( [MarshalAs( UnmanagedType.LPWStr )] string endpoint, [In] ref sCaptureParams captureParams ); + } +} \ No newline at end of file diff --git a/WhisperNet/API/iModel.cs b/WhisperNet/API/iModel.cs new file mode 100644 index 0000000..8ec6d17 --- /dev/null +++ b/WhisperNet/API/iModel.cs @@ -0,0 +1,27 @@ +using ComLight; +using System.ComponentModel; + +namespace Whisper +{ + /// A model in VRAM, loaded from a GGML file. + /// This object doesn't keep any mutable state, and can be safely used from multiple threads concurrently + [ComInterface( "abefb4c9-e8d8-46a3-8747-5afbadef1adb", eMarshalDirection.ToManaged ), CustomConventions( typeof( Internal.NativeLogger ) )] + public interface iModel: IDisposable + { + /// Create a context to transcribe audio with this model + /// Don't call this method, use instead. + [RetValIndex, EditorBrowsable( EditorBrowsableState.Never )] + Internal.iContext createContextInternal(); + + /// True if this model is multi-lingual + bool isMultilingual(); + + /// Retrieve integer IDs of the special tokens defined by the model + [RetValIndex] + SpecialTokens getSpecialTokens(); + + /// Try to resolve integer token ID into string. + /// Don't call this method, use instead. 
+ IntPtr stringFromTokenInternal( int id ); + } +} \ No newline at end of file diff --git a/WhisperNet/API/sCaptureParams.cs b/WhisperNet/API/sCaptureParams.cs new file mode 100644 index 0000000..7595a69 --- /dev/null +++ b/WhisperNet/API/sCaptureParams.cs @@ -0,0 +1,37 @@ +namespace Whisper +{ + /// Flags for the audio capture + [Flags] + public enum eCaptureFlags: uint + { + /// No special flags + None = 0, + /// When the capture device supports stereo, keep stereo PCM samples in addition to mono + Stereo = 1, + } + + /// Parameters for audio capture + public struct sCaptureParams + { + /// Minimum transcribe duration in seconds + public float minDuration; + /// Maximum transcribe duration in seconds + public float maxDuration; + /// Duration in seconds, the constructor defaults it to 0.25 (250 ms). NOTE(review): presumably the amount of silence dropped at the start of a recording; confirm against the native capture code + public float dropStartSilence; + /// Duration in seconds, the constructor defaults it to 0.333 (333 ms). NOTE(review): presumably the length of a pause which ends a spoken segment; confirm against the native capture code + public float pauseDuration; + /// Flags for the audio capture + public eCaptureFlags flags; + + /// Initialize the structure with some reasonable default values + public sCaptureParams() + { + minDuration = 7.0f; // 7 seconds + maxDuration = 11.0f; // 11 seconds + dropStartSilence = 0.25f; // 250 ms + pauseDuration = 0.333f; // 333 ms + flags = eCaptureFlags.None; + } + } +} \ No newline at end of file -- cgit v1.2.3