// Origin: commit 8c4603c73675958efc960fbd4bb599a2909d106a, "Source codes"
// (Konstantin, Mon, 16 Jan 2023 14:52:43 +0100), directory Examples/MicrophoneCS.
// The commit adds four C# files, reproduced below in this order:
//   CaptureThread.cs, CommandLineArgs.cs, MicrophoneCS.cs, TranscribeCallbacks.cs
// It also adds MicrophoneCS.csproj (Exe, net6.0-windows, x64); the XML of that file is not reproduced here.
// NOTE(review): the code uses Console, Thread, File and Path without using directives and uses
// nullable annotations, so it presumably relies on implicit usings and nullable being enabled in the project.

using System.Globalization;
using System.Reflection;
using System.Runtime.ExceptionServices;
using Whisper;

namespace MicrophoneCS
{
	// ==== Examples/MicrophoneCS/CaptureThread.cs ====

	/// <summary>Runs <c>Context.runCapture</c> on a dedicated thread, and stops it once the user presses a key.</summary>
	sealed class CaptureThread: CaptureCallbacks
	{
		readonly TranscribeCallbacks callbacks;
		readonly Thread thread;
		readonly Context context;
		readonly iAudioCapture source;

		/// <summary>Exception captured on the capture thread, if any; rethrown by <see cref="join" />.</summary>
		ExceptionDispatchInfo? edi = null;

		/// <summary>Set from the key-reading work item, returned from <see cref="shouldCancel" />.</summary>
		volatile bool shouldQuit = false;

		/// <summary>Create the callbacks object, and immediately start the capture thread.</summary>
		/// <param name="args">Parsed command-line arguments, forwarded to <see cref="TranscribeCallbacks" /></param>
		/// <param name="context">Context to run the capture on</param>
		/// <param name="source">Audio capture device to read from</param>
		public CaptureThread( CommandLineArgs args, Context context, iAudioCapture source )
		{
			callbacks = new TranscribeCallbacks( args );
			this.context = context;
			this.source = source;

			thread = new Thread( threadMain ) { Name = "Capture Thread" };
			Console.WriteLine( "Press any key to quit" );
			thread.Start();
		}

		/// <summary>Thread pool work item: block until a key is pressed, then request the capture to stop.</summary>
		static void readKeyCallback( object? state )
		{
			CaptureThread ct = ( state as CaptureThread ) ?? throw new ApplicationException();
			Console.ReadKey();
			ct.shouldQuit = true;
		}

		/// <summary>Wait for the capture thread to finish; rethrows the exception if that thread has failed.</summary>
		public void join()
		{
			// The key is read on a pool thread, so a failure of the capture thread still unblocks thread.Join() below
			ThreadPool.QueueUserWorkItem( readKeyCallback, this );
			thread.Join();
			// Rethrow on the calling thread, with the original stack trace preserved
			edi?.Throw();
		}

		/// <summary>Returns true once a key has been pressed.</summary>
		protected override bool shouldCancel( Context sender ) =>
			shouldQuit;

		/// <summary>Print the new capture status to the console.</summary>
		protected override void captureStatusChanged( Context sender, eCaptureStatus status )
		{
			Console.WriteLine( $"CaptureStatusChanged: {status}" );
		}

		/// <summary>Entry point of the capture thread; any exception is stored for <see cref="join" /> instead of crashing the process.</summary>
		void threadMain()
		{
			try
			{
				context.runCapture( source, callbacks, this );
			}
			catch( Exception ex )
			{
				edi = ExceptionDispatchInfo.Capture( ex );
			}
		}
	}

	// ==== Examples/MicrophoneCS/CommandLineArgs.cs ====

	/// <summary>Command-line arguments of the example, parsed by the constructor.</summary>
	sealed record class CommandLineArgs
	{
		public int n_threads = Environment.ProcessorCount;
		public int offset_t_ms = 0;
		// NOTE(review): offset_n, diarize, output_txt and prompt are parsed, but nothing in this example consumes them
		public int offset_n = 0;
		public int duration_ms = 0;
		public int max_context = -1;
		public int max_len = 0;

		public float word_thold = 0.01f;

		public bool speed_up = false;
		public bool translate = false;
		public bool diarize = false;
		public bool output_txt = false;
		public bool print_special = false;
		public bool print_progress = false;
		public bool print_colors = true;
		public bool no_timestamps = false;
		public int[]? prompt = null;
		public int captureDeviceIndex = 0;

		public eLanguage language = eLanguage.English;
		public string model = string.Empty;

		const bool output_wts = false;
		public bool listDevices = false;

		/// <summary>Copy the parsed options into the parameters structure of a context.</summary>
		public void apply( ref Parameters p )
		{
			p.setFlag( eFullParamsFlags.PrintRealtime, false );
			p.setFlag( eFullParamsFlags.PrintProgress, print_progress );
			p.setFlag( eFullParamsFlags.PrintTimestamps, !no_timestamps );
			p.setFlag( eFullParamsFlags.PrintSpecial, print_special );
			p.setFlag( eFullParamsFlags.Translate, translate );
			p.language = language;
			p.cpuThreads = n_threads;
			// A negative value means "not specified"; the value already in the parameters is kept
			if( max_context >= 0 )
				p.n_max_text_ctx = max_context;
			p.offset_ms = offset_t_ms;
			p.duration_ms = duration_ms;
			p.setFlag( eFullParamsFlags.TokenTimestamps, output_wts || max_len > 0 );
			p.thold_pt = word_thold;
			p.max_len = output_wts && max_len == 0 ? 60 : max_len;
			p.setFlag( eFullParamsFlags.SpeedupAudio, speed_up );
		}

		/// <summary>Compute the flags to pass to <c>Context.results</c>, based on what is going to be printed.</summary>
		public eResultFlags resultFlags()
		{
			eResultFlags flags = eResultFlags.None;
			bool wts = output_wts || max_len > 0;
			if( !no_timestamps || wts )
				flags |= eResultFlags.Timestamps;
			// The colored output is printed token by token, so it needs the tokens
			if( wts || print_colors )
				flags |= eResultFlags.Tokens;
			return flags;
		}

		static eLanguage parseLanguage( string lang ) =>
			Library.languageFromCode( lang ) ?? throw new ArgumentException( $"Unknown language code \"{lang}\"" );

		/// <summary>Consume the value which follows the option at index <paramref name="i" />, advancing the index.</summary>
		/// <exception cref="ArgumentException">The option is the last element of the array</exception>
		static string nextValue( string[] argv, ref int i )
		{
			if( i + 1 >= argv.Length )
				throw new ArgumentException( $"Missing value for the argument \"{argv[ i ]}\"" );
			i++;
			return argv[ i ];
		}

		/// <summary>Parse an integer option value using the invariant culture.</summary>
		static int parseInt( string option, string value )
		{
			if( int.TryParse( value, NumberStyles.Integer, CultureInfo.InvariantCulture, out int result ) )
				return result;
			throw new ArgumentException( $"Invalid integer value \"{value}\" for the argument \"{option}\"" );
		}

		/// <summary>Parse a floating-point option value using the invariant culture.</summary>
		static float parseFloat( string option, string value )
		{
			const NumberStyles styles = NumberStyles.Float | NumberStyles.AllowThousands;
			if( float.TryParse( value, styles, CultureInfo.InvariantCulture, out float result ) )
				return result;
			throw new ArgumentException( $"Invalid number \"{value}\" for the argument \"{option}\"" );
		}

		/// <summary>Parse the command line.</summary>
		/// <exception cref="OperationCanceledException">Help was requested; the usage has been printed already</exception>
		/// <exception cref="ArgumentException">Unknown option, missing or malformed option value, or no model specified</exception>
		/// <exception cref="FileNotFoundException">The model file does not exist</exception>
		public CommandLineArgs( string[] argv )
		{
			for( int i = 0; i < argv.Length; i++ )
			{
				string arg = argv[ i ];
				switch( arg )
				{
					case "-h":
					case "--help":
						printUsage();
						throw new OperationCanceledException();
					case "-c":
					case "--capture":
						captureDeviceIndex = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-ld":
					case "--list-devices":
						listDevices = true;
						break;
					case "-t":
					case "--threads":
						n_threads = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-ot":
					case "--offset-t":
						offset_t_ms = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-on":
					case "--offset-n":
						offset_n = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-d":
					case "--duration":
						duration_ms = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-mc":
					case "--max-context":
						max_context = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-ml":
					case "--max-len":
						max_len = parseInt( arg, nextValue( argv, ref i ) );
						break;
					case "-wt":
					case "--word-thold":
						word_thold = parseFloat( arg, nextValue( argv, ref i ) );
						break;
					case "-su":
					case "--speed-up":
						speed_up = true;
						break;
					case "-tr":
					case "--translate":
						translate = true;
						break;
					case "-di":
					case "--diarize":
						diarize = true;
						break;
					case "-otxt":
					case "--output-txt":
						output_txt = true;
						break;
					case "-ps":
					case "--print-special":
						print_special = true;
						break;
					case "-nc":
					case "--no-colors":
						print_colors = false;
						break;
					case "-pp":
					case "--print-progress":
						print_progress = true;
						break;
					case "-nt":
					case "--no-timestamps":
						no_timestamps = true;
						break;
					case "-l":
					case "--language":
						language = parseLanguage( nextValue( argv, ref i ) );
						break;
					case "--prompt":
						prompt = parsePrompt( nextValue( argv, ref i ) );
						break;
					case "-m":
					case "--model":
						model = nextValue( argv, ref i );
						break;
					default:
						throw new ArgumentException( $"Unknown argument: \"{arg}\"" );
				}
			}

			if( string.IsNullOrWhiteSpace( model ) )
				throw new ArgumentException( "The model file is not provided in the arguments" );
			if( !File.Exists( model ) )
				throw new FileNotFoundException( "Model not found", model );
		}

		static string cstr( bool b ) => b.ToString();

		/// <summary>Parse the initial prompt; only an empty prompt is supported so far.</summary>
		static int[]? parsePrompt( string str )
		{
			if( string.IsNullOrWhiteSpace( str ) )
				return null;
			// TODO: expose whisper_tokenize function, as a method of iModel COM interface
			throw new NotImplementedException();
		}

		/// <summary>Print the usage to the console; the values in square brackets are the current values of the fields.</summary>
		void printUsage()
		{
			Console.WriteLine();

			Console.WriteLine( "usage: {0} [options]", Path.GetFileName( Assembly.GetExecutingAssembly().Location ) );
			Console.WriteLine();
			Console.WriteLine( "options:" );
			Console.WriteLine( "  -h,       --help             [default] show this help message and exit" );
			Console.WriteLine( "  -c N,     --capture N        [{0,-7:D}] index of the audio capture device to use", captureDeviceIndex );
			Console.WriteLine( "  -ld,      --list-devices     [{0,-7}] list audio capture devices and exit", cstr( listDevices ) );
			Console.WriteLine( "  -t N,     --threads N        [{0,-7:D}] number of threads to use during computation", n_threads );
			Console.WriteLine( "  -ot N,    --offset-t N       [{0,-7:D}] time offset in milliseconds", offset_t_ms );
			Console.WriteLine( "  -on N,    --offset-n N       [{0,-7:D}] segment index offset", offset_n );
			Console.WriteLine( "  -d N,     --duration N       [{0,-7:D}] duration of audio to process in milliseconds", duration_ms );
			Console.WriteLine( "  -mc N,    --max-context N    [{0,-7:D}] maximum number of text context tokens to store", max_context );
			Console.WriteLine( "  -ml N,    --max-len N        [{0,-7:D}] maximum segment length in characters", max_len );
			Console.WriteLine( "  -wt N,    --word-thold N     [{0,-7:F2}] word timestamp probability threshold", word_thold );
			Console.WriteLine( "  -su,      --speed-up         [{0,-7}] speed up audio by x2 (reduced accuracy)", cstr( speed_up ) );
			Console.WriteLine( "  -tr,      --translate        [{0,-7}] translate from source language to english", cstr( translate ) );
			Console.WriteLine( "  -di,      --diarize          [{0,-7}] stereo audio diarization", cstr( diarize ) );
			Console.WriteLine( "  -otxt,    --output-txt       [{0,-7}] output result in a text file", cstr( output_txt ) );
			Console.WriteLine( "  -ps,      --print-special    [{0,-7}] print special tokens", cstr( print_special ) );
			Console.WriteLine( "  -nc,      --no-colors        [{0,-7}] do not print colors", cstr( !print_colors ) );
			Console.WriteLine( "  -pp,      --print-progress   [{0,-7}] print progress", cstr( print_progress ) );
			Console.WriteLine( "  -nt,      --no-timestamps    [{0,-7}] do not print timestamps", cstr( no_timestamps ) );
			Console.WriteLine( "  -l LANG,  --language LANG    [{0,-7}] spoken language", language.getCode() );
			Console.WriteLine( "            --prompt PROMPT    [       ] initial prompt" );
			Console.WriteLine( "  -m FNAME, --model FNAME      [{0,-7}] model path", model );
		}
	}

	// ==== Examples/MicrophoneCS/MicrophoneCS.cs ====

	static class Program
	{
		/// <summary>Entry point: parse the arguments, open the capture device, load the model, and transcribe until a key is pressed.</summary>
		/// <returns>0 on success, 1 when help was requested, otherwise the HResult of the exception which caused the failure</returns>
		static int Main( string[] args )
		{
			try
			{
				CommandLineArgs cla;
				try
				{
					cla = new CommandLineArgs( args );
				}
				catch( OperationCanceledException )
				{
					// Help was requested, the constructor has printed the usage already
					return 1;
				}

				const eLoggerFlags loggerFlags = eLoggerFlags.UseStandardError | eLoggerFlags.SkipFormatMessage;
				Library.setLogSink( eLogLevel.Debug, loggerFlags );

				using iMediaFoundation mf = Library.initMediaFoundation();
				CaptureDeviceId[]? devices = mf.listCaptureDevices();
				// An empty list is handled the same way as a missing one;
				// otherwise the range check below would report the meaningless range [ 0 .. -1 ]
				if( devices is null || devices.Length == 0 )
					throw new ApplicationException( "This computer has no audio capture devices" );

				if( cla.listDevices )
				{
					for( int i = 0; i < devices.Length; i++ )
						Console.WriteLine( "#{0}: {1}", i, devices[ i ].displayName );
					return 0;
				}

				int deviceIndex = cla.captureDeviceIndex;
				if( deviceIndex < 0 || deviceIndex >= devices.Length )
					throw new ApplicationException( $"Capture device index is out of range; the valid range is [ 0 .. {devices.Length - 1} ]" );

				using iAudioCapture captureDev = mf.openCaptureDevice( devices[ deviceIndex ] );

				using iModel model = Library.loadModel( cla.model );
				using Context context = model.createContext();
				cla.apply( ref context.parameters );

				// The constructor starts the capture; join() returns after a key press or a failure of the capture thread
				CaptureThread thread = new CaptureThread( cla, context, captureDev );
				thread.join();

				context.timingsPrint();
				return 0;
			}
			catch( Exception ex )
			{
				Console.WriteLine( ex.ToString() );
				return ex.HResult;
			}
		}
	}

	// ==== Examples/MicrophoneCS/TranscribeCallbacks.cs ====

	/// <summary>Implementation of Callbacks abstract class, to print these segments as soon as they’re produced by the library.</summary>
	sealed class TranscribeCallbacks: Callbacks
	{
		readonly CommandLineArgs args;
		readonly eResultFlags resultFlags;

		/// <summary>Escape sequence which resets the terminal color.</summary>
		const string resetColor = "\x1B[0m";

		/// <summary>Store the arguments, compute the result flags, and switch the console output to UTF-8.</summary>
		public TranscribeCallbacks( CommandLineArgs args )
		{
			this.args = args;
			resultFlags = args.resultFlags();
			Console.OutputEncoding = System.Text.Encoding.UTF8;
		}

		// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
		// Lowest is red, middle is yellow, highest is green.
		static readonly string[] k_colors = new string[]
		{
			"\x1B[38;5;196m", "\x1B[38;5;202m", "\x1B[38;5;208m", "\x1B[38;5;214m", "\x1B[38;5;220m",
			"\x1B[38;5;226m", "\x1B[38;5;190m", "\x1B[38;5;154m", "\x1B[38;5;118m", "\x1B[38;5;82m"
		};

		/// <summary>Map the probability of a token to an index in <see cref="k_colors" />.</summary>
		static int colorIndex( in sToken tok )
		{
			float p = tok.probability;
			// The probability is cubed before scaling, and the result is clamped to the valid index range
			float p3 = p * p * p;
			int col = (int)( p3 * k_colors.Length );
			return Math.Clamp( col, 0, k_colors.Length - 1 );
		}

		/// <summary>Format a timestamp as hh:mm:ss.fff</summary>
		public static string printTime( TimeSpan ts ) =>
			ts.ToString( "hh':'mm':'ss'.'fff", CultureInfo.InvariantCulture );

		/// <summary>Format a timestamp as hh:mm:ss,fff</summary>
		public static string printTimeWithComma( TimeSpan ts ) =>
			ts.ToString( "hh':'mm':'ss','fff", CultureInfo.InvariantCulture );

		/// <summary>Write the tokens of a segment, each one colored by its probability; special tokens are skipped unless requested.</summary>
		void printTokens( TranscribeResult res, sSegment seg )
		{
			foreach( sToken tok in res.getTokens( seg ) )
			{
				if( !args.print_special && tok.hasFlag( eTokenFlags.Special ) )
					continue;
				Console.Write( "{0}{1}{2}", k_colors[ colorIndex( tok ) ], tok.text, resetColor );
			}
		}

		/// <summary>Print the last <paramref name="countNew" /> segments of the results.</summary>
		protected override void onNewSegment( Context sender, int countNew )
		{
			TranscribeResult res = sender.results( resultFlags );

			int s0 = res.segments.Length - countNew;
			// Empty line before the very first segment
			if( s0 == 0 )
				Console.WriteLine();

			bool colors = args.print_colors && AnsiCodes.enabled;

			for( int i = s0; i < res.segments.Length; i++ )
			{
				sSegment seg = res.segments[ i ];

				if( args.no_timestamps )
				{
					// Without timestamps the text is printed as a continuous stream, with no line breaks
					if( colors )
						printTokens( res, seg );
					else
						Console.Write( seg.text );
					Console.Out.Flush();
					continue;
				}

				// TODO: stereo diarization is not implemented, args.diarize is ignored.
				// The disabled reference code summed the absolute sample values of both channels over the time
				// range of the segment, and set the label to "(speaker 0)" or "(speaker 1)" when one channel
				// was more than 1.1 times louder than the other one, "(speaker ?)" otherwise.
				string speaker = "";

				if( colors )
				{
					// The speaker label belongs to the segment, so it is written once, not once per token
					Console.Write( "[{0} --> {1}]  {2}", printTime( seg.time.begin ), printTime( seg.time.end ), speaker );
					printTokens( res, seg );
					Console.WriteLine();
				}
				else
					Console.WriteLine( "[{0} --> {1}]  {2}{3}", printTime( seg.time.begin ), printTime( seg.time.end ), speaker, seg.text );
			}
		}
	}
}