diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /Examples/TranscribeCS | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'Examples/TranscribeCS')
| -rw-r--r-- | Examples/TranscribeCS/AnsiCodes.cs | 68 | ||||
| -rw-r--r-- | Examples/TranscribeCS/CommandLineArgs.cs | 155 | ||||
| -rw-r--r-- | Examples/TranscribeCS/Transcribe.cs | 114 | ||||
| -rw-r--r-- | Examples/TranscribeCS/TranscribeCS.cs | 102 | ||||
| -rw-r--r-- | Examples/TranscribeCS/TranscribeCS.csproj | 19 |
5 files changed, 458 insertions, 0 deletions
// Examples/TranscribeCS/AnsiCodes.cs
using System.Runtime.InteropServices;

/// <summary>Turns on ANSI escape sequence processing for the console attached to standard output.</summary>
/// <remarks>
/// The work happens once, in the static constructor; the outcome is published in <see cref="enabled"/>.
/// The feature requires Windows 10 or newer.
/// </remarks>
static class AnsiCodes
{
	const string dll = "kernel32.dll";

	/// <summary>Identifier of the standard output device, for <see cref="GetStdHandle"/>.</summary>
	const int STD_OUTPUT_HANDLE = -11;

	/// <summary>Console mode bits. Input and output flags share the same numeric values, hence the duplicates.</summary>
	[Flags]
	enum ConsoleModes: uint
	{
		// Input flags
		ENABLE_PROCESSED_INPUT = 0x0001,
		ENABLE_LINE_INPUT = 0x0002,
		ENABLE_ECHO_INPUT = 0x0004,
		ENABLE_WINDOW_INPUT = 0x0008,
		ENABLE_MOUSE_INPUT = 0x0010,
		ENABLE_INSERT_MODE = 0x0020,
		ENABLE_QUICK_EDIT_MODE = 0x0040,
		ENABLE_EXTENDED_FLAGS = 0x0080,
		ENABLE_AUTO_POSITION = 0x0100,
		ENABLE_VIRTUAL_TERMINAL_INPUT = 0x0200,

		// Output flags
		ENABLE_PROCESSED_OUTPUT = 0x0001,
		ENABLE_WRAP_AT_EOL_OUTPUT = 0x0002,
		ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004,
		DISABLE_NEWLINE_AUTO_RETURN = 0x0008,
		ENABLE_LVB_GRID_WORLDWIDE = 0x0010
	}

	[DllImport( dll, SetLastError = true )]
	static extern IntPtr GetStdHandle( int nStdHandle );

	[DllImport( dll, SetLastError = true )]
	static extern bool GetConsoleMode( IntPtr hConsoleHandle, out ConsoleModes mode );

	[DllImport( dll, SetLastError = true )]
	static extern bool SetConsoleMode( IntPtr hConsoleHandle, ConsoleModes mode );

	/// <summary>True when virtual terminal processing was already on, or has been switched on successfully.</summary>
	public static readonly bool enabled;

	static AnsiCodes()
	{
		enabled = tryEnableVirtualTerminal();
	}

	/// <summary>Query the console mode of the standard output handle, and set the virtual terminal flag when it is missing.</summary>
	/// <returns>False when the handle is invalid, or when any of the console API calls fails.</returns>
	static bool tryEnableVirtualTerminal()
	{
		IntPtr stdOut = GetStdHandle( STD_OUTPUT_HANDLE );
		if( stdOut == new IntPtr( -1 ) )
			return false;   // INVALID_HANDLE_VALUE

		if( !GetConsoleMode( stdOut, out ConsoleModes current ) )
			return false;

		const ConsoleModes virtualTerminal = ConsoleModes.ENABLE_VIRTUAL_TERMINAL_PROCESSING;
		if( ( current & virtualTerminal ) == virtualTerminal )
			return true;

		return SetConsoleMode( stdOut, current | virtualTerminal );
	}
}
// Examples/TranscribeCS/CommandLineArgs.cs
using System.Globalization;
using System.Reflection;
using Whisper;

namespace TranscribeCS
{
	/// <summary>Options of the TranscribeCS example, parsed from the command line.</summary>
	/// <remarks>
	/// The constructor parses and validates the arguments.
	/// <see cref="apply"/> and <see cref="resultFlags"/> translate the options into values consumed by the Whisper library.
	/// </remarks>
	sealed record class CommandLineArgs
	{
		/// <summary>Number of threads to use during computation</summary>
		public int n_threads = Environment.ProcessorCount;
		/// <summary>Time offset in milliseconds</summary>
		public int offset_t_ms = 0;
		/// <summary>Segment index offset</summary>
		public int offset_n = 0;
		/// <summary>Duration of audio to process in milliseconds</summary>
		public int duration_ms = 0;
		/// <summary>Maximum number of text context tokens to store; negative values are not applied to the parameters</summary>
		public int max_context = -1;
		/// <summary>Maximum segment length in characters</summary>
		public int max_len = 0;

		/// <summary>Word timestamp probability threshold</summary>
		public float word_thold = 0.01f;

		public bool speed_up = false;
		public bool translate = false;
		public bool diarize = false;
		public bool output_txt = false;
		public bool output_vtt = false;
		public bool output_srt = false;
		public bool print_special = false;
		public bool print_progress = false;
		public bool print_colors = true;
		public bool no_timestamps = false;
		/// <summary>Initial prompt tokens; stays null because <see cref="parsePrompt"/> is not implemented yet</summary>
		public int[]? prompt = null;

		public eLanguage language = eLanguage.English;
		public string model = string.Empty;
		public readonly List<string> fileNames = new List<string>();

		const bool output_wts = false;

		/// <summary>Copy the options into the parameters structure of the library.</summary>
		public void apply( ref Parameters p )
		{
			p.setFlag( eFullParamsFlags.PrintRealtime, false );
			p.setFlag( eFullParamsFlags.PrintProgress, print_progress );
			p.setFlag( eFullParamsFlags.PrintTimestamps, !no_timestamps );
			p.setFlag( eFullParamsFlags.PrintSpecial, print_special );
			p.setFlag( eFullParamsFlags.Translate, translate );
			p.language = language;
			p.cpuThreads = n_threads;
			// A negative value means "not specified", the value already in the parameters is kept
			if( max_context >= 0 )
				p.n_max_text_ctx = max_context;
			p.offset_ms = offset_t_ms;
			p.duration_ms = duration_ms;
			p.setFlag( eFullParamsFlags.TokenTimestamps, output_wts || max_len > 0 );
			p.thold_pt = word_thold;
			p.max_len = output_wts && max_len == 0 ? 60 : max_len;
			p.setFlag( eFullParamsFlags.SpeedupAudio, speed_up );
		}

		/// <summary>Compute the flags to request from the context when fetching results for console output.</summary>
		public eResultFlags resultFlags()
		{
			eResultFlags flags = eResultFlags.None;
			bool wts = output_wts || max_len > 0;
			if( !no_timestamps || wts )
				flags |= eResultFlags.Timestamps;
			// Colored output prints individual tokens, so it needs them in the results
			if( wts || print_colors )
				flags |= eResultFlags.Tokens;
			return flags;
		}

		static eLanguage parseLanguage( string lang ) =>
			Library.languageFromCode( lang ) ?? throw new ArgumentException( $"Unknown language code \"{lang}\"" );

		/// <summary>Consume the value which follows the option at <c>argv[ i ]</c>, and advance the index to that value.</summary>
		/// <exception cref="ArgumentException">The option is the last argument, there is no value after it</exception>
		static string nextValue( string[] argv, ref int i )
		{
			if( i + 1 >= argv.Length )
				throw new ArgumentException( $"Missing value for argument \"{argv[ i ]}\"" );
			i++;
			return argv[ i ];
		}

		/// <summary>Consume the value of the option at <c>argv[ i ]</c> and parse it as an integer, independently of the current culture.</summary>
		static int nextInt( string[] argv, ref int i ) =>
			int.Parse( nextValue( argv, ref i ), CultureInfo.InvariantCulture );

		/// <summary>Parse and validate the command line.</summary>
		/// <param name="argv">Command-line arguments, without the executable name</param>
		/// <exception cref="OperationCanceledException">Help was requested; the usage text has been printed already</exception>
		/// <exception cref="ArgumentException">Unknown, empty or incomplete argument, missing model, or no input files</exception>
		/// <exception cref="FileNotFoundException">The model file does not exist</exception>
		public CommandLineArgs( string[] argv )
		{
			for( int i = 0; i < argv.Length; i++ )
			{
				string arg = argv[ i ];
				// An empty string has no first character to inspect
				if( string.IsNullOrEmpty( arg ) )
					throw new ArgumentException( $"Argument #{i + 1} is empty" );

				if( arg[ 0 ] != '-' )
				{
					fileNames.Add( arg );
					continue;
				}
				if( arg == "-h" || arg == "--help" )
				{
					printUsage();
					throw new OperationCanceledException();
				}
				else if( arg == "-t" || arg == "--threads" ) n_threads = nextInt( argv, ref i );
				else if( arg == "-ot" || arg == "--offset-t" ) offset_t_ms = nextInt( argv, ref i );
				else if( arg == "-on" || arg == "--offset-n" ) offset_n = nextInt( argv, ref i );
				else if( arg == "-d" || arg == "--duration" ) duration_ms = nextInt( argv, ref i );
				else if( arg == "-mc" || arg == "--max-context" ) max_context = nextInt( argv, ref i );
				else if( arg == "-ml" || arg == "--max-len" ) max_len = nextInt( argv, ref i );
				else if( arg == "-wt" || arg == "--word-thold" ) word_thold = float.Parse( nextValue( argv, ref i ), CultureInfo.InvariantCulture );
				else if( arg == "-su" || arg == "--speed-up" ) speed_up = true;
				else if( arg == "-tr" || arg == "--translate" ) translate = true;
				else if( arg == "-di" || arg == "--diarize" ) diarize = true;
				else if( arg == "-otxt" || arg == "--output-txt" ) output_txt = true;
				else if( arg == "-ovtt" || arg == "--output-vtt" ) output_vtt = true;
				else if( arg == "-osrt" || arg == "--output-srt" ) output_srt = true;
				else if( arg == "-ps" || arg == "--print-special" ) print_special = true;
				else if( arg == "-nc" || arg == "--no-colors" ) print_colors = false;
				else if( arg == "-pp" || arg == "--print-progress" ) print_progress = true;
				else if( arg == "-nt" || arg == "--no-timestamps" ) no_timestamps = true;
				else if( arg == "-l" || arg == "--language" ) language = parseLanguage( nextValue( argv, ref i ) );
				else if( arg == "--prompt" ) prompt = parsePrompt( nextValue( argv, ref i ) );
				else if( arg == "-m" || arg == "--model" ) model = nextValue( argv, ref i );
				else if( arg == "-f" || arg == "--file" ) fileNames.Add( nextValue( argv, ref i ) );
				else
					throw new ArgumentException( $"Unknown argument: \"{arg}\"" );
			}
			if( string.IsNullOrWhiteSpace( model ) )
				throw new ArgumentException( "The model file is not provided in the arguments" );
			if( !File.Exists( model ) )
				throw new FileNotFoundException( "Model not found", model );
			if( fileNames.Count <= 0 )
				throw new ArgumentException( "Please supply at least 1 input audio file to process" );
		}

		static string cstr( bool b ) => b.ToString();

		/// <summary>Convert the initial prompt into tokens.</summary>
		/// <returns>null for an empty or whitespace-only prompt</returns>
		/// <exception cref="NotImplementedException">Any other prompt; tokenization is not available yet</exception>
		static int[]? parsePrompt( string str )
		{
			if( string.IsNullOrWhiteSpace( str ) )
				return null;
			// TODO: expose whisper_tokenize function, as a method of iModel COM interface
			throw new NotImplementedException();
		}

		/// <summary>Print the usage text to the console. The values in square brackets are the current values of the options.</summary>
		void printUsage()
		{
			Console.WriteLine();

			Console.WriteLine( "usage: {0} [options] file0.mp3 file1.wma ...", Path.GetFileName( Assembly.GetExecutingAssembly().Location ) );
			Console.WriteLine();
			Console.WriteLine( "options:" );
			Console.WriteLine( " -h, --help [default] show this help message and exit" );
			Console.WriteLine( " -t N, --threads N [{0,-7:D}] number of threads to use during computation", n_threads );
			Console.WriteLine( " -ot N, --offset-t N [{0,-7:D}] time offset in milliseconds", offset_t_ms );
			Console.WriteLine( " -on N, --offset-n N [{0,-7:D}] segment index offset", offset_n );
			Console.WriteLine( " -d N, --duration N [{0,-7:D}] duration of audio to process in milliseconds", duration_ms );
			Console.WriteLine( " -mc N, --max-context N [{0,-7:D}] maximum number of text context tokens to store", max_context );
			Console.WriteLine( " -ml N, --max-len N [{0,-7:D}] maximum segment length in characters", max_len );
			Console.WriteLine( " -wt N, --word-thold N [{0,-7:F2}] word timestamp probability threshold", word_thold );
			Console.WriteLine( " -su, --speed-up [{0,-7}] speed up audio by x2 (reduced accuracy)", cstr( speed_up ) );
			Console.WriteLine( " -tr, --translate [{0,-7}] translate from source language to english", cstr( translate ) );
			Console.WriteLine( " -di, --diarize [{0,-7}] stereo audio diarization", cstr( diarize ) );
			Console.WriteLine( " -otxt, --output-txt [{0,-7}] output result in a text file", cstr( output_txt ) );
			Console.WriteLine( " -ovtt, --output-vtt [{0,-7}] output result in a vtt file", cstr( output_vtt ) );
			Console.WriteLine( " -osrt, --output-srt [{0,-7}] output result in a srt file", cstr( output_srt ) );
			Console.WriteLine( " -ps, --print-special [{0,-7}] print special tokens", cstr( print_special ) );
			Console.WriteLine( " -nc, --no-colors [{0,-7}] do not print colors", cstr( !print_colors ) );
			// This switch is accepted by the parser, so it is listed here as well
			Console.WriteLine( " -pp, --print-progress [{0,-7}] print progress", cstr( print_progress ) );
			Console.WriteLine( " -nt, --no-timestamps [{0,-7}] do not print timestamps", cstr( no_timestamps ) );
			Console.WriteLine( " -l LANG, --language LANG [{0,-7}] spoken language", language.getCode() );
			Console.WriteLine( " --prompt PROMPT [ ] initial prompt" );
			Console.WriteLine( " -m FNAME, --model FNAME [{0,-7}] model path", model );
			Console.WriteLine( " -f FNAME, --file FNAME [{0,-7}] path of the input audio file", "" );
		}
	}
}
// Examples/TranscribeCS/Transcribe.cs
using System.Globalization;
using Whisper;

namespace TranscribeCS
{
	/// <summary>Implementation of Callbacks abstract class, which prints new segments to the console from the <see cref="onNewSegment"/> callback.</summary>
	sealed class Transcribe: Callbacks
	{
		readonly CommandLineArgs args;
		readonly eResultFlags resultFlags;

		/// <summary>Remember the options, and switch the console output encoding to UTF-8.</summary>
		public Transcribe( CommandLineArgs args )
		{
			this.args = args;
			resultFlags = args.resultFlags();
			Console.OutputEncoding = System.Text.Encoding.UTF8;
		}

		// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
		// Lowest is red, middle is yellow, highest is green.
		readonly string[] k_colors = new string[]
		{
			"\x1B[38;5;196m", "\x1B[38;5;202m", "\x1B[38;5;208m", "\x1B[38;5;214m", "\x1B[38;5;220m",
			"\x1B[38;5;226m", "\x1B[38;5;190m", "\x1B[38;5;154m", "\x1B[38;5;118m", "\x1B[38;5;82m"
		};

		/// <summary>Map the probability of the token into an index in <see cref="k_colors"/>.</summary>
		int colorIndex( in sToken tok )
		{
			float p = tok.probability;
			// The cube of the probability is what gets scaled to the palette size
			float p3 = p * p * p;
			int col = (int)( p3 * k_colors.Length );
			col = Math.Clamp( col, 0, k_colors.Length - 1 );
			return col;
		}

		/// <summary>Format the time as HH:MM:SS, the separator, then 3 digits of milliseconds.</summary>
		/// <remarks>
		/// The hours field includes whole days, so values of 24 hours and longer do not wrap around;
		/// the "hh" specifier of TimeSpan only prints the hours component.
		/// Same as custom TimeSpan format strings, the output carries no sign.
		/// </remarks>
		static string formatTime( TimeSpan ts, char fractionSeparator )
		{
			int hours = Math.Abs( ts.Days ) * 24 + Math.Abs( ts.Hours );
			return string.Format( CultureInfo.InvariantCulture, "{0:00}:{1:00}:{2:00}{3}{4:000}",
				hours, Math.Abs( ts.Minutes ), Math.Abs( ts.Seconds ), fractionSeparator, Math.Abs( ts.Milliseconds ) );
		}

		/// <summary>Format the time as <c>HH:MM:SS.mmm</c></summary>
		public static string printTime( TimeSpan ts ) =>
			formatTime( ts, '.' );

		/// <summary>Format the time as <c>HH:MM:SS,mmm</c></summary>
		public static string printTimeWithComma( TimeSpan ts ) =>
			formatTime( ts, ',' );

		/// <summary>Write the tokens of the segment, each one in the color selected by its probability.</summary>
		/// <remarks>Tokens flagged as special are skipped, unless the print_special option is set.</remarks>
		void writeColoredTokens( TranscribeResult res, sSegment seg )
		{
			foreach( sToken tok in res.getTokens( seg ) )
			{
				if( !args.print_special && tok.hasFlag( eTokenFlags.Special ) )
					continue;
				Console.Write( "{0}{1}{2}", k_colors[ colorIndex( tok ) ], tok.text, "\x1B[0m" );
			}
		}

		/// <summary>Print the last <paramref name="countNew"/> segments of the results to the console.</summary>
		protected override void onNewSegment( Context sender, int countNew )
		{
			TranscribeResult res = sender.results( resultFlags );

			int s0 = res.segments.Length - countNew;
			// An empty line before the very first segment
			if( s0 == 0 )
				Console.WriteLine();

			bool colors = args.print_colors && AnsiCodes.enabled;

			for( int i = s0; i < res.segments.Length; i++ )
			{
				sSegment seg = res.segments[ i ];

				if( args.no_timestamps )
				{
					// Without timestamps, the text is written without line breaks
					if( colors )
						writeColoredTokens( res, seg );
					else
						Console.Write( seg.text );
					Console.Out.Flush();
					continue;
				}

				// Diarization is not implemented yet, the label stays empty
				string speaker = "";
#if false
				if( args.diarize && pcmf32s.size() == 2 )
				{
					const size_t n_samples = pcmf32s[ 0 ].size();
					const int64_t is0 = SourceAudio::sampleFromTimestamp( seg.time.begin, n_samples );
					const int64_t is1 = SourceAudio::sampleFromTimestamp( seg.time.end, n_samples );

					double energy0 = 0.0f;
					double energy1 = 0.0f;

					for( int64_t j = is0; j < is1; j++ )
					{
						energy0 += fabs( pcmf32s[ 0 ][ j ] );
						energy1 += fabs( pcmf32s[ 1 ][ j ] );
					}

					if( energy0 > 1.1 * energy1 )
						speaker = "(speaker 0)";
					else if( energy1 > 1.1 * energy0 )
						speaker = "(speaker 1)";
					else
						speaker = "(speaker ?)";

					//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
				}
#endif
				if( colors )
				{
					Console.Write( "[{0} --> {1}] ", printTime( seg.time.begin ), printTime( seg.time.end ) );
					// The speaker label belongs to the segment: write it once, not before every token
					Console.Write( speaker );
					writeColoredTokens( res, seg );
					Console.WriteLine();
				}
				else
					Console.WriteLine( "[{0} --> {1}] {2}{3}", printTime( seg.time.begin ), printTime( seg.time.end ), speaker, seg.text );
			}
		}
	}
}
// Examples/TranscribeCS/TranscribeCS.cs
using Whisper;

namespace TranscribeCS
{
	/// <summary>Entry point of the example: transcribes the audio files listed on the command line, and optionally writes text, SubRip and WebVTT files next to them.</summary>
	static class Program
	{
		/// <summary>When true, audio files are opened as readers; otherwise they are loaded into buffers first.</summary>
		static readonly bool streamAudio = true;

		/// <returns>0 on success, 1 when the arguments parser cancelled the run, HResult of the exception on failure</returns>
		static int Main( string[] args )
		{
			try
			{
				CommandLineArgs? options = parseArguments( args );
				if( options is null )
					return 1;

				run( options );
				return 0;
			}
			catch( Exception ex )
			{
				Console.WriteLine( ex.Message );
				return ex.HResult;
			}
		}

		/// <summary>Parse the command line.</summary>
		/// <returns>null when the parser threw OperationCanceledException; other exceptions propagate to the caller</returns>
		static CommandLineArgs? parseArguments( string[] args )
		{
			try
			{
				return new CommandLineArgs( args );
			}
			catch( OperationCanceledException )
			{
				return null;
			}
		}

		/// <summary>Set up logging, load the model, process every input file, then print the timings.</summary>
		static void run( CommandLineArgs cla )
		{
			Library.setLogSink( eLogLevel.Debug, eLoggerFlags.UseStandardError | eLoggerFlags.SkipFormatMessage );

			using( iModel model = Library.loadModel( cla.model ) )
			using( Context context = model.createContext() )
			{
				cla.apply( ref context.parameters );

				using( iMediaFoundation mf = Library.initMediaFoundation() )
				{
					Transcribe callbacks = new Transcribe( cla );

					foreach( string audioFile in cla.fileNames )
					{
						transcribeFile( context, mf, callbacks, cla, audioFile );

						// When asked to, produce these text files
						if( cla.output_txt )
							writeTextFile( context, audioFile );
						if( cla.output_srt )
							writeSubRip( context, audioFile, cla );
						if( cla.output_vtt )
							writeWebVTT( context, audioFile );
					}

					context.timingsPrint();
				}
			}
		}

		/// <summary>Run the model on a single audio file; the audio source is disposed before the method returns.</summary>
		static void transcribeFile( Context context, iMediaFoundation mf, Transcribe callbacks, CommandLineArgs cla, string audioFile )
		{
			if( !streamAudio )
			{
				using iAudioBuffer buffer = mf.loadAudioFile( audioFile );
				context.runFull( buffer, callbacks, cla.prompt );
				return;
			}

			using iAudioReader reader = mf.openAudioFile( audioFile );
			context.runFull( reader, callbacks, null, cla.prompt );
		}

		/// <summary>Write the text of the segments, one per line, into a file with the path of the audio and .txt extension.</summary>
		static void writeTextFile( Context context, string audioPath )
		{
			string path = Path.ChangeExtension( audioPath, ".txt" );
			using( var writer = File.CreateText( path ) )
			{
				var segments = context.results().segments;
				for( int index = 0; index < segments.Length; index++ )
					writer.WriteLine( segments[ index ].text );
			}
		}

		/// <summary>Write the segments in SubRip format, into a file with the path of the audio and .srt extension.</summary>
		/// <remarks>Cue numbers start at 1 plus the offset_n option; times use comma before the milliseconds.</remarks>
		static void writeSubRip( Context context, string audioPath, CommandLineArgs cliArgs )
		{
			string path = Path.ChangeExtension( audioPath, ".srt" );
			using( var writer = File.CreateText( path ) )
			{
				var segments = context.results( eResultFlags.Timestamps ).segments;
				for( int index = 0; index < segments.Length; index++ )
				{
					writer.WriteLine( index + 1 + cliArgs.offset_n );

					sSegment current = segments[ index ];
					string from = Transcribe.printTimeWithComma( current.time.begin );
					string to = Transcribe.printTimeWithComma( current.time.end );
					writer.WriteLine( $"{from} --> {to}" );
					writer.WriteLine( current.text );
					writer.WriteLine();
				}
			}
		}

		/// <summary>Write the segments in WebVTT format, into a file with the path of the audio and .vtt extension.</summary>
		static void writeWebVTT( Context context, string audioPath )
		{
			string path = Path.ChangeExtension( audioPath, ".vtt" );
			using( var writer = File.CreateText( path ) )
			{
				writer.WriteLine( "WEBVTT" );
				writer.WriteLine();

				var segments = context.results( eResultFlags.Timestamps ).segments;
				for( int index = 0; index < segments.Length; index++ )
				{
					sSegment current = segments[ index ];
					string from = Transcribe.printTime( current.time.begin );
					string to = Transcribe.printTime( current.time.end );
					writer.WriteLine( $"{from} --> {to}" );
					writer.WriteLine( current.text );
					writer.WriteLine();
				}
			}
		}
	}
}
<!-- Examples/TranscribeCS/TranscribeCS.csproj -->
<Project Sdk="Microsoft.NET.Sdk">

	<!-- Console executable for .NET 6 on Windows, x64 only -->
	<PropertyGroup>
		<OutputType>Exe</OutputType>
		<TargetFramework>net6.0-windows</TargetFramework>
		<Platforms>x64</Platforms>
		<!-- Keep the framework name out of the output path -->
		<AppendTargetFrameworkToOutputPath>false</AppendTargetFrameworkToOutputPath>
		<!-- Language and compiler options -->
		<ImplicitUsings>enable</ImplicitUsings>
		<Nullable>enable</Nullable>
		<CheckForOverflowUnderflow>true</CheckForOverflowUnderflow>
	</PropertyGroup>

	<!-- Whisper.dll from the x64 build output of the current configuration, copied to the output directory when newer -->
	<ItemGroup>
		<Content Include="..\..\x64\$(Configuration)\Whisper.dll" Link="Whisper.dll">
			<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
		</Content>
	</ItemGroup>

	<!-- The .NET wrapper library used by the example -->
	<ItemGroup>
		<ProjectReference Include="..\..\WhisperNet\WhisperNet.csproj" />
	</ItemGroup>

</Project>
\ No newline at end of file |
