using System.Diagnostics;
using Whisper.Internal;
using Whisper.Internals;
namespace Whisper
{
/// <summary>Stateful context, contains methods to transcribe audio</summary>
/// <remarks>
/// The run* methods temporarily store callbacks and prompt tokens in per-instance fields for the duration of the call.<br/>
/// NOTE(review): this suggests a single instance must not run several transcriptions concurrently; confirm against the native library.
/// </remarks>
public sealed class Context: IDisposable
{
	// Proxy of the native context object; all the work is forwarded there
	iContext context;
	// Caching the results object here saves time spent in ComLight library creating these callable proxies over and over again for the same underlying C++ object
	readonly iTranscribeResult transcribeResult;
	// Full set of parameters passed to the native code by reference; the public subset is exposed through the `parameters` property
	sFullParams fullParams;
	// Progress callback holder, only populated while a streamed run is in progress
	sProgressSink progressSink;
	bool disposed = false;
	// Delegates created once in the constructor, to avoid allocating a new delegate on every run
	readonly Action<object> pfnBuffer, pfnStream;

	/// <summary>Wrap the native context, and initialize default parameters for the greedy sampling strategy</summary>
	/// <param name="context">Native context proxy; this object takes ownership and disposes it</param>
	internal Context( Internal.iContext context )
	{
		this.context = context;
		transcribeResult = context.getResults( eResultFlags.None );
		fullParams = context.fullDefaultParams( eSamplingStrategy.Greedy );
		pfnBuffer = processBuffer;
		pfnStream = processStream;
		progressSink = default;
	}

	/// <summary>Release the native context; safe to call more than once</summary>
	void IDisposable.Dispose()
	{
		if( disposed )
			return;
		disposed = true;
		context?.Dispose();
		GC.SuppressFinalize( this );
	}

	/// <summary>Adjustable parameters</summary>
	/// <remarks>Returned by reference, so changes made by the caller are applied to the subsequent runs of this context</remarks>
	public ref Parameters parameters => ref fullParams.publicParams;

	/// <summary>Run the model on a complete audio buffer; <paramref name="buffer"/> must implement <see cref="iAudioBuffer"/></summary>
	void processBuffer( object buffer )
	{
		context.runFull( ref fullParams, (iAudioBuffer)buffer );
	}

	/// <summary>Run the model on streamed audio; <paramref name="reader"/> must implement <see cref="iAudioReader"/></summary>
	void processStream( object reader )
	{
		context.runStreamed( ref fullParams, ref progressSink, (iAudioReader)reader );
	}

	/// <summary>Install user-provided callbacks into the parameters structure; does nothing when <paramref name="callbacks"/> is null</summary>
	void setCallbacks( Callbacks? callbacks )
	{
		if( null == callbacks )
			return;

		// TODO [very low, performance]: the following code creates 2 new GC-allocated objects on each call.
		// Possible to optimize by caching these function pointers in static readonly fields, and use another [ThreadStatic] field for the callbacks object
		fullParams.newSegmentCallback = delegate ( IntPtr ctx, int countNew, IntPtr userData )
		{
			return callbacks.newSegment( this, countNew );
		};
		fullParams.encoderBeginCallback = delegate ( IntPtr ctx, IntPtr userData )
		{
			return callbacks.encoderBegin( this );
		};
	}

	/// <summary>Clear the per-run state from the parameters structure: callbacks, and prompt tokens</summary>
	void resetRunState()
	{
		// Reset these delegates.
		// Otherwise, this class will retain the callbacks object preventing it from being garbage collected.
		fullParams.newSegmentCallback = null;
		fullParams.encoderBeginCallback = null;
		// The prompt pointer is only valid while the span is pinned, never leave it behind
		fullParams.prompt_tokens = IntPtr.Zero;
		fullParams.prompt_n_tokens = 0;
	}

	/// <summary>Shared implementation of the runFull overloads</summary>
	/// <param name="source">Audio source object, passed through to <paramref name="pfn"/></param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="promptTokens">Optional initial prompt tokens; pinned in memory for the duration of the run</param>
	/// <param name="pfn">Either <see cref="processBuffer"/> or <see cref="processStream"/></param>
	void runImpl( object source, Callbacks? callbacks, ReadOnlySpan<int> promptTokens, Action<object> pfn )
	{
		setCallbacks( callbacks );
		try
		{
			if( promptTokens.IsEmpty )
			{
				// No prompt: prompt_tokens / prompt_n_tokens are left untouched
				pfn( source );
				return;
			}

			unsafe
			{
				// The native code reads the tokens through a raw pointer, the span must stay pinned until the run completes
				fixed( int* tokens = promptTokens )
				{
					fullParams.prompt_tokens = (IntPtr)tokens;
					fullParams.prompt_n_tokens = promptTokens.Length;
					pfn( source );
				}
			}
		}
		finally
		{
			resetRunState();
		}
	}

	/// <summary>Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text</summary>
	/// <param name="buffer">Audio buffer to transcribe</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="promptTokens">Initial prompt tokens; pass an empty span for no prompt</param>
	public void runFull( iAudioBuffer buffer, Callbacks? callbacks, ReadOnlySpan<int> promptTokens )
	{
		runImpl( buffer, callbacks, promptTokens, pfnBuffer );
	}

	/// <summary>Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text</summary>
	/// <param name="buffer">Audio buffer to transcribe</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	public void runFull( iAudioBuffer buffer, Callbacks? callbacks = null ) =>
		runFull( buffer, callbacks, ReadOnlySpan<int>.Empty );

	/// <summary>Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text</summary>
	/// <param name="buffer">Audio buffer to transcribe</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="promptTokens">Initial prompt tokens; null is treated as no prompt</param>
	public void runFull( iAudioBuffer buffer, Callbacks? callbacks, int[]? promptTokens ) =>
		runFull( buffer, callbacks, promptTokens ?? ReadOnlySpan<int>.Empty );

	/// <summary>Run the entire model, streaming audio from the provided reader object</summary>
	/// <param name="reader">Audio reader to stream the samples from</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="pfnProgress">Optional delegate to receive progress values</param>
	/// <param name="promptTokens">Initial prompt tokens; pass an empty span for no prompt</param>
	public void runFull( iAudioReader reader, Callbacks? callbacks, Action<double>? pfnProgress, ReadOnlySpan<int> promptTokens )
	{
		if( null != pfnProgress )
		{
			progressSink.pfn = delegate ( double value, IntPtr ctx, IntPtr pv )
			{
				// The delegate reports failures as a status code: exceptions thrown by the user's code are converted to their HRESULT
				try
				{
					pfnProgress.Invoke( value );
					return 0;
				}
				catch( Exception ex )
				{
					return ex.HResult;
				}
			};
		}

		try
		{
			runImpl( reader, callbacks, promptTokens, pfnStream );
		}
		finally
		{
			// Don't retain the user's delegate after the run
			progressSink.pfn = null;
		}
	}

	/// <summary>Run the entire model, streaming audio from the provided reader object</summary>
	/// <param name="reader">Audio reader to stream the samples from</param>
	/// <param name="pfnProgress">Optional delegate to receive progress values</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	public void runFull( iAudioReader reader, Action<double>? pfnProgress = null, Callbacks? callbacks = null ) =>
		runFull( reader, callbacks, pfnProgress, ReadOnlySpan<int>.Empty );

	/// <summary>Run the entire model, streaming audio from the provided reader object</summary>
	/// <param name="reader">Audio reader to stream the samples from</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="pfnProgress">Optional delegate to receive progress values</param>
	/// <param name="promptTokens">Initial prompt tokens; null is treated as no prompt</param>
	public void runFull( iAudioReader reader, Callbacks? callbacks, Action<double>? pfnProgress, int[]? promptTokens ) =>
		runFull( reader, callbacks, pfnProgress, promptTokens ?? ReadOnlySpan<int>.Empty );

	/// <summary>Get text results out of the context</summary>
	/// <param name="flags">Flags to pass to the native getResults call; must not include <see cref="eResultFlags.NewObject"/></param>
	/// <returns>A new wrapper around the results object returned by the native context</returns>
	/// <exception cref="ArgumentException"><paramref name="flags"/> includes <see cref="eResultFlags.NewObject"/></exception>
	public TranscribeResult results( eResultFlags flags = eResultFlags.None )
	{
		if( flags.HasFlag( eResultFlags.NewObject ) )
			throw new ArgumentException( "eResultFlags.NewObject is not supported by this method", nameof( flags ) );

		iTranscribeResult res = context.getResults( flags );
		// Without the NewObject flag, the call is expected to return the same proxy which was cached in the constructor
		Debug.Assert( ReferenceEquals( res, transcribeResult ) );
		return new TranscribeResult( res );
	}

	/// <summary>Print timing data</summary>
	public void timingsPrint() => context.timingsPrint();

	/// <summary>Reset timing data</summary>
	public void timingsReset() => context.timingsReset();

	/// <summary>Continuously process audio from microphone or a similar capture device</summary>
	/// <remarks>It's recommended to call this method on a background thread.</remarks>
	/// <param name="capture">Audio capture object to get the samples from</param>
	/// <param name="callbacks">Optional callbacks to receive notifications during the run</param>
	/// <param name="captureCallbacks">Optional object which supplies the cancel and status callbacks of the capture</param>
	public void runCapture( iAudioCapture capture, Callbacks? callbacks, CaptureCallbacks? captureCallbacks )
	{
		setCallbacks( callbacks );
		try
		{
			// When captureCallbacks is null, the native code receives a default-initialized structure
			sCaptureCallbacks cc = default;
			if( captureCallbacks != null )
			{
				cc.shouldCancel = captureCallbacks.cancel( this );
				cc.captureStatus = captureCallbacks.status( this );
			}
			context.runCapture( ref fullParams, ref cc, capture );
		}
		finally
		{
			resetRunState();
		}
	}

	/// <summary>Try to detect speaker by comparing channels of the stereo PCM data</summary>
	/// <remarks>
	/// <para>The feature requires stereo PCM data: make sure the audio was loaded, opened or captured with stereo enabled, e.g. <c>stereo=true</c>.</para>
	/// <para>It seems to work fine with Blue Yeti microphone, after the microphone was switched to Stereo pattern.
	/// With recorded sounds however, the performance varies depending on the recording.</para>
	/// </remarks>
	/// <param name="interval">Time interval of the audio to analyze</param>
	/// <returns>The value returned by the native detectSpeaker call for that interval</returns>
	public eSpeakerChannel detectSpeaker( sTimeInterval interval ) =>
		context.detectSpeaker( ref interval );
}
}