using System.Diagnostics;
using Whisper.Internal;
using Whisper.Internals;

namespace Whisper
{
    /// <summary>Stateful context, contains methods to transcribe audio</summary>
    public sealed class Context: IDisposable
    {
        iContext context;

        // Caching the results object here saves time spent in ComLight library creating these callable proxies
        // over and over again for the same underlying C++ object
        readonly iTranscribeResult transcribeResult;

        // Parameters passed by reference to every run; also carries the per-call callbacks and prompt tokens
        sFullParams fullParams;

        // Progress callback holder, only populated while a streamed run is in flight
        sProgressSink progressSink;

        bool disposed = false;

        // Delegates for processBuffer / processStream, created once in the constructor and reused by every run
        readonly Action<object> pfnBuffer, pfnStream;

        /// <summary>Wrap the supplied context interface</summary>
        /// <param name="context">Context interface to wrap; it is disposed together with this object</param>
        internal Context( Internal.iContext context )
        {
            this.context = context;
            transcribeResult = context.getResults( eResultFlags.None );
            fullParams = context.fullDefaultParams( eSamplingStrategy.Greedy );
            pfnBuffer = processBuffer;
            pfnStream = processStream;
            progressSink = default;
        }

        /// <summary>Dispose the wrapped context; calling this more than once is harmless</summary>
        void IDisposable.Dispose()
        {
            if( disposed )
                return;
            disposed = true;
            context?.Dispose();
            GC.SuppressFinalize( this );
        }

        /// <summary>Adjustable parameters</summary>
        public ref Parameters parameters => ref fullParams.publicParams;

        /// <summary>Throw when the wrapped context has already been disposed</summary>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        void throwIfDisposed()
        {
            if( disposed )
                throw new ObjectDisposedException( nameof( Context ) );
        }

        /// <summary>Run the model on a complete audio buffer; <paramref name="buffer"/> must be an <c>iAudioBuffer</c></summary>
        void processBuffer( object buffer )
        {
            context.runFull( ref fullParams, (iAudioBuffer)buffer );
        }

        /// <summary>Run the model on streamed audio; <paramref name="reader"/> must be an <c>iAudioReader</c></summary>
        void processStream( object reader )
        {
            context.runStreamed( ref fullParams, ref progressSink, (iAudioReader)reader );
        }

        /// <summary>Store delegates forwarding to the supplied callbacks object in <c>fullParams</c>; does nothing when it is null</summary>
        void installCallbacks( Callbacks? callbacks )
        {
            if( callbacks is null )
                return;
            Callbacks cb = callbacks;

            // TODO [very low, performance]: the following code creates 2 new GC-allocated objects on each call.
            // Possible to optimize by caching these function pointers in static readonly fields,
            // and use another [ThreadStatic] field for the callbacks object
            fullParams.newSegmentCallback = ( IntPtr ctx, int countNew, IntPtr userData ) => cb.newSegment( this, countNew );
            fullParams.encoderBeginCallback = ( IntPtr ctx, IntPtr userData ) => cb.encoderBegin( this );
        }

        /// <summary>Clear the per-call state stored in <c>fullParams</c>: callback delegates and prompt tokens</summary>
        void resetTransientParams()
        {
            // Reset these delegates.
            // Otherwise, this class will retain the callbacks object preventing it from being garbage collected.
            fullParams.newSegmentCallback = null;
            fullParams.encoderBeginCallback = null;
            // The prompt pointer is only valid while the span is pinned, never leave it behind
            fullParams.prompt_tokens = IntPtr.Zero;
            fullParams.prompt_n_tokens = 0;
        }

        /// <summary>Shared implementation of the <c>runFull</c> overloads</summary>
        /// <param name="source">Audio source, forwarded to <paramref name="pfn"/></param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="promptTokens">Optional prompt tokens; when not empty they are pinned while <paramref name="pfn"/> runs</param>
        /// <param name="pfn">Either <c>pfnBuffer</c> or <c>pfnStream</c></param>
        void runImpl( object source, Callbacks? callbacks, ReadOnlySpan<int> promptTokens, Action<object> pfn )
        {
            throwIfDisposed();
            installCallbacks( callbacks );
            try
            {
                if( promptTokens.IsEmpty )
                {
                    pfn( source );
                    return;
                }

                unsafe
                {
                    fixed( int* tokens = promptTokens )
                    {
                        fullParams.prompt_tokens = (IntPtr)tokens;
                        fullParams.prompt_n_tokens = promptTokens.Length;
                        pfn( source );
                    }
                }
            }
            finally
            {
                resetTransientParams();
            }
        }

        /// <summary>Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text</summary>
        /// <param name="buffer">Audio buffer to process</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="promptTokens">Prompt tokens passed to the model; may be empty</param>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public void runFull( iAudioBuffer buffer, Callbacks? callbacks, ReadOnlySpan<int> promptTokens )
        {
            runImpl( buffer, callbacks, promptTokens, pfnBuffer );
        }

        /// <summary>Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text</summary>
        /// <param name="buffer">Audio buffer to process</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        public void runFull( iAudioBuffer buffer, Callbacks? callbacks = null ) =>
            runFull( buffer, callbacks, ReadOnlySpan<int>.Empty );

        /// <summary>Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text</summary>
        /// <param name="buffer">Audio buffer to process</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="promptTokens">Prompt tokens passed to the model; null is treated as empty</param>
        public void runFull( iAudioBuffer buffer, Callbacks? callbacks, int[]? promptTokens ) =>
            runFull( buffer, callbacks, promptTokens ?? ReadOnlySpan<int>.Empty );

        /// <summary>Run the entire model, streaming audio from the provided reader object</summary>
        /// <param name="reader">Source of the audio</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="pfnProgress">Optional progress callback. An exception thrown by it is caught, and its HRESULT is returned from the progress sink instead</param>
        /// <param name="promptTokens">Prompt tokens passed to the model; may be empty</param>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public void runFull( iAudioReader reader, Callbacks? callbacks, Action<double>? pfnProgress, ReadOnlySpan<int> promptTokens )
        {
            throwIfDisposed();
            if( null != pfnProgress )
            {
                progressSink.pfn = delegate ( double value, IntPtr context, IntPtr pv )
                {
                    try
                    {
                        pfnProgress.Invoke( value );
                        return 0;
                    }
                    catch( Exception ex )
                    {
                        // Report the failure as a status code instead of letting the exception escape the callback
                        return ex.HResult;
                    }
                };
            }

            try
            {
                runImpl( reader, callbacks, promptTokens, pfnStream );
            }
            finally
            {
                // Same reason as for the callbacks: do not retain the user's delegate after the call
                progressSink.pfn = null;
            }
        }

        /// <summary>Run the entire model, streaming audio from the provided reader object</summary>
        /// <param name="reader">Source of the audio</param>
        /// <param name="pfnProgress">Optional progress callback</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        public void runFull( iAudioReader reader, Action<double>? pfnProgress = null, Callbacks? callbacks = null ) =>
            runFull( reader, callbacks, pfnProgress, ReadOnlySpan<int>.Empty );

        /// <summary>Run the entire model, streaming audio from the provided reader object</summary>
        /// <param name="reader">Source of the audio</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="pfnProgress">Optional progress callback</param>
        /// <param name="promptTokens">Prompt tokens passed to the model; null is treated as empty</param>
        public void runFull( iAudioReader reader, Callbacks? callbacks, Action<double>? pfnProgress, int[]? promptTokens ) =>
            runFull( reader, callbacks, pfnProgress, promptTokens ?? ReadOnlySpan<int>.Empty );

        /// <summary>Get text results out of the context</summary>
        /// <param name="flags">Result flags; <c>eResultFlags.NewObject</c> is rejected</param>
        /// <returns>A new wrapper around the results object returned by the context</returns>
        /// <exception cref="ArgumentException"><paramref name="flags"/> includes <c>eResultFlags.NewObject</c></exception>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public TranscribeResult results( eResultFlags flags = eResultFlags.None )
        {
            throwIfDisposed();
            if( flags.HasFlag( eResultFlags.NewObject ) )
                throw new ArgumentException( "eResultFlags.NewObject is not supported here, the context reuses a single cached results object", nameof( flags ) );

            iTranscribeResult res = context.getResults( flags );
            // Without the NewObject flag the context is expected to hand back the object cached by the constructor
            Debug.Assert( ReferenceEquals( res, transcribeResult ) );
            return new TranscribeResult( res );
        }

        /// <summary>Print timing data</summary>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public void timingsPrint()
        {
            throwIfDisposed();
            context.timingsPrint();
        }

        /// <summary>Reset timing data</summary>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public void timingsReset()
        {
            throwIfDisposed();
            context.timingsReset();
        }

        /// <summary>Continuously process audio from microphone or a similar capture device</summary>
        /// <remarks>It’s recommended to call this method on a background thread.</remarks>
        /// <param name="capture">Capture device to read the audio from</param>
        /// <param name="callbacks">Optional callbacks, installed for the duration of the call</param>
        /// <param name="captureCallbacks">Optional capture callbacks; when supplied, the values returned by its <c>cancel</c> and <c>status</c> methods are passed along with the call</param>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public void runCapture( iAudioCapture capture, Callbacks? callbacks, CaptureCallbacks? captureCallbacks )
        {
            throwIfDisposed();
            installCallbacks( callbacks );
            try
            {
                sCaptureCallbacks cc = default;
                if( captureCallbacks != null )
                {
                    cc.shouldCancel = captureCallbacks.cancel( this );
                    cc.captureStatus = captureCallbacks.status( this );
                }
                context.runCapture( ref fullParams, ref cc, capture );
            }
            finally
            {
                resetTransientParams();
            }
        }

        // NOTE(review): the original remarks named the specific methods and the flag which enable stereo input,
        // but these cross-references are missing from this copy of the file. Restore them once confirmed.

        /// <summary>Try to detect speaker by comparing channels of the stereo PCM data</summary>
        /// <remarks>
        /// <para>The feature requires stereo PCM data: the audio has to be loaded or captured with stereo enabled (<c>stereo=true</c>).</para>
        /// <para>It seems to work fine with a Blue Yeti microphone, after switching the microphone to the Stereo pattern.
        /// With recorded sounds however, the performance varies depending on the recording.</para>
        /// </remarks>
        /// <param name="interval">Time interval forwarded to the context; presumably the slice of the audio to analyze</param>
        /// <returns>The value reported by the context for that interval</returns>
        /// <exception cref="ObjectDisposedException">This object was disposed</exception>
        public eSpeakerChannel detectSpeaker( sTimeInterval interval )
        {
            throwIfDisposed();
            return context.detectSpeaker( ref interval );
        }
    }
}