summary refs log tree commit diff stats
path: root/Examples/TranscribeCS
diff options
context:
space:
mode:
author Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
committer Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
commit 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree 714dc6fc9a1672d5fd7f89676b97e10959662abc /Examples/TranscribeCS
parent 990a8d0dbaefc996244097397259e92758b15cce (diff)
Source codes
Diffstat (limited to 'Examples/TranscribeCS')
-rw-r--r-- Examples/TranscribeCS/AnsiCodes.cs 68
-rw-r--r-- Examples/TranscribeCS/CommandLineArgs.cs 155
-rw-r--r-- Examples/TranscribeCS/Transcribe.cs 114
-rw-r--r-- Examples/TranscribeCS/TranscribeCS.cs 102
-rw-r--r-- Examples/TranscribeCS/TranscribeCS.csproj 19
5 files changed, 458 insertions, 0 deletions
diff --git a/Examples/TranscribeCS/AnsiCodes.cs b/Examples/TranscribeCS/AnsiCodes.cs
new file mode 100644
index 0000000..be04ce3
--- /dev/null
+++ b/Examples/TranscribeCS/AnsiCodes.cs
@@ -0,0 +1,68 @@
+using System.Runtime.InteropServices;
+
+/// <summary>Turns on ANSI escape sequence processing for the console behind the standard output handle.</summary>
+/// <remarks>The feature requires Windows 10 or newer</remarks>
+static class AnsiCodes
+{
+    /// <summary>True when the console mode of the standard output has the virtual terminal processing flag set,
+    /// either because it was already there or because the static constructor managed to set it.</summary>
+    public static readonly bool enabled = false;
+
+    // The probing runs once, the first time the class is touched
+    static AnsiCodes()
+    {
+        enabled = tryEnable();
+    }
+
+    /// <summary>Query the console mode of the standard output, and add the virtual terminal flag when it is missing.</summary>
+    /// <returns>False when the handle is invalid, the mode cannot be queried, or the mode cannot be changed</returns>
+    static bool tryEnable()
+    {
+        IntPtr stdOut = GetStdHandle( STD_OUTPUT_HANDLE );
+        // GetStdHandle reports a failure with the value -1
+        if( stdOut == new IntPtr( -1 ) )
+            return false;
+
+        if( !GetConsoleMode( stdOut, out ConsoleModes current ) )
+            return false;
+
+        const ConsoleModes vt = ConsoleModes.ENABLE_VIRTUAL_TERMINAL_PROCESSING;
+        if( ( current & vt ) == vt )
+            return true;
+
+        return SetConsoleMode( stdOut, current | vt );
+    }
+
+    const string dll = "kernel32.dll";
+
+    /// <summary>Argument of <see cref="GetStdHandle" /> which selects the standard output</summary>
+    const int STD_OUTPUT_HANDLE = -11;
+
+    [DllImport( dll, SetLastError = true )]
+    static extern IntPtr GetStdHandle( int nStdHandle );
+
+    [DllImport( dll, SetLastError = true )]
+    static extern bool GetConsoleMode( IntPtr hConsoleHandle, out ConsoleModes mode );
+
+    [DllImport( dll, SetLastError = true )]
+    static extern bool SetConsoleMode( IntPtr hConsoleHandle, ConsoleModes mode );
+
+    /// <summary>Bits of the console mode. Input and output flags share the same numeric values,
+    /// which one applies depends on the kind of the handle.</summary>
+    [Flags]
+    enum ConsoleModes: uint
+    {
+        // Input flags
+        ENABLE_PROCESSED_INPUT = 0x0001,
+        ENABLE_LINE_INPUT = 0x0002,
+        ENABLE_ECHO_INPUT = 0x0004,
+        ENABLE_WINDOW_INPUT = 0x0008,
+        ENABLE_MOUSE_INPUT = 0x0010,
+        ENABLE_INSERT_MODE = 0x0020,
+        ENABLE_QUICK_EDIT_MODE = 0x0040,
+        ENABLE_EXTENDED_FLAGS = 0x0080,
+        ENABLE_AUTO_POSITION = 0x0100,
+        ENABLE_VIRTUAL_TERMINAL_INPUT = 0x0200,
+
+        // Output flags
+        ENABLE_PROCESSED_OUTPUT = 0x0001,
+        ENABLE_WRAP_AT_EOL_OUTPUT = 0x0002,
+        ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004,
+        DISABLE_NEWLINE_AUTO_RETURN = 0x0008,
+        ENABLE_LVB_GRID_WORLDWIDE = 0x0010
+    }
+} \ No newline at end of file
diff --git a/Examples/TranscribeCS/CommandLineArgs.cs b/Examples/TranscribeCS/CommandLineArgs.cs
new file mode 100644
index 0000000..4f9fb74
--- /dev/null
+++ b/Examples/TranscribeCS/CommandLineArgs.cs
@@ -0,0 +1,155 @@
+using System.Globalization;
+using System.Reflection;
+using Whisper;
+
+namespace TranscribeCS
+{
+    /// <summary>Options of the TranscribeCS example; the constructor fills them from the command-line arguments.</summary>
+    sealed record class CommandLineArgs
+    {
+        /// <summary>Number of threads to use during computation</summary>
+        public int n_threads = Environment.ProcessorCount;
+        /// <summary>Time offset in milliseconds</summary>
+        public int offset_t_ms = 0;
+        /// <summary>Segment index offset</summary>
+        public int offset_n = 0;
+        /// <summary>Duration of audio to process in milliseconds</summary>
+        public int duration_ms = 0;
+        /// <summary>Maximum number of text context tokens to store; negative values leave the context parameter untouched</summary>
+        public int max_context = -1;
+        /// <summary>Maximum segment length in characters</summary>
+        public int max_len = 0;
+
+        /// <summary>Word timestamp probability threshold</summary>
+        public float word_thold = 0.01f;
+
+        public bool speed_up = false;
+        public bool translate = false;
+        public bool diarize = false;
+        public bool output_txt = false;
+        public bool output_vtt = false;
+        public bool output_srt = false;
+        public bool print_special = false;
+        public bool print_progress = false;
+        public bool print_colors = true;
+        public bool no_timestamps = false;
+        /// <summary>Initial prompt; stays null because parsing of non-empty prompts is not implemented</summary>
+        public int[]? prompt = null;
+
+        public eLanguage language = eLanguage.English;
+        /// <summary>Path of the model file</summary>
+        public string model = string.Empty;
+        /// <summary>Input audio files, in the order they were supplied</summary>
+        public readonly List<string> fileNames = new List<string>();
+
+        // There is no command-line switch for this one, it is always off
+        const bool output_wts = false;
+
+        /// <summary>Copy the options into the supplied parameters structure</summary>
+        public void apply( ref Parameters p )
+        {
+            p.setFlag( eFullParamsFlags.PrintRealtime, false );
+            p.setFlag( eFullParamsFlags.PrintProgress, print_progress );
+            p.setFlag( eFullParamsFlags.PrintTimestamps, !no_timestamps );
+            p.setFlag( eFullParamsFlags.PrintSpecial, print_special );
+            p.setFlag( eFullParamsFlags.Translate, translate );
+            p.language = language;
+            p.cpuThreads = n_threads;
+            // A negative value means the option was not specified
+            if( max_context >= 0 )
+                p.n_max_text_ctx = max_context;
+            p.offset_ms = offset_t_ms;
+            p.duration_ms = duration_ms;
+            p.setFlag( eFullParamsFlags.TokenTimestamps, output_wts || max_len > 0 );
+            p.thold_pt = word_thold;
+            p.max_len = output_wts && max_len == 0 ? 60 : max_len;
+            p.setFlag( eFullParamsFlags.SpeedupAudio, speed_up );
+        }
+
+        /// <summary>Flags describing which pieces of the results are needed to print the output selected by these options</summary>
+        public eResultFlags resultFlags()
+        {
+            eResultFlags flags = eResultFlags.None;
+            bool wts = output_wts || max_len > 0;
+            if( !no_timestamps || wts )
+                flags |= eResultFlags.Timestamps;
+            // Colored output is printed token by token
+            if( wts || print_colors )
+                flags |= eResultFlags.Tokens;
+            return flags;
+        }
+
+        static eLanguage parseLanguage( string lang ) =>
+            Library.languageFromCode( lang ) ?? throw new ArgumentException( $"Unknown language code \"{lang}\"" );
+
+        /// <summary>Consume the value which follows the option at index <paramref name="i" />, and advance the index to that value.</summary>
+        /// <exception cref="ArgumentException">The option is the last argument, so there is no value to read</exception>
+        static string optionValue( string[] argv, ref int i )
+        {
+            if( i + 1 >= argv.Length )
+                throw new ArgumentException( $"Missing value for argument \"{argv[ i ]}\"" );
+            return argv[ ++i ];
+        }
+
+        /// <summary>Same as <see cref="optionValue" />, parsed as an integer with the invariant culture</summary>
+        static int intValue( string[] argv, ref int i ) =>
+            int.Parse( optionValue( argv, ref i ), CultureInfo.InvariantCulture );
+
+        /// <summary>Parse the command-line arguments</summary>
+        /// <param name="argv">Arguments received by the entry point of the program</param>
+        /// <exception cref="OperationCanceledException">The help was requested; the usage text has been printed</exception>
+        /// <exception cref="ArgumentException">Empty or unknown argument, an option without its value,
+        /// no model specified, or no input files specified</exception>
+        /// <exception cref="FileNotFoundException">The model file does not exist</exception>
+        /// <exception cref="FormatException">A numeric option has a value which is not a number</exception>
+        /// <exception cref="NotImplementedException">A non-empty prompt was supplied</exception>
+        public CommandLineArgs( string[] argv )
+        {
+            for( int i = 0; i < argv.Length; i++ )
+            {
+                string arg = argv[ i ];
+                // An empty string has no first character to test; report it instead of failing with an index error
+                if( string.IsNullOrEmpty( arg ) )
+                    throw new ArgumentException( "Empty command-line argument" );
+
+                // Anything which does not start with a dash is an input file
+                if( arg[ 0 ] != '-' )
+                {
+                    fileNames.Add( arg );
+                    continue;
+                }
+
+                switch( arg )
+                {
+                    case "-h" or "--help":
+                        printUsage();
+                        throw new OperationCanceledException();
+                    case "-t" or "--threads": n_threads = intValue( argv, ref i ); break;
+                    case "-ot" or "--offset-t": offset_t_ms = intValue( argv, ref i ); break;
+                    case "-on" or "--offset-n": offset_n = intValue( argv, ref i ); break;
+                    case "-d" or "--duration": duration_ms = intValue( argv, ref i ); break;
+                    case "-mc" or "--max-context": max_context = intValue( argv, ref i ); break;
+                    case "-ml" or "--max-len": max_len = intValue( argv, ref i ); break;
+                    case "-wt" or "--word-thold": word_thold = float.Parse( optionValue( argv, ref i ), CultureInfo.InvariantCulture ); break;
+                    case "-su" or "--speed-up": speed_up = true; break;
+                    case "-tr" or "--translate": translate = true; break;
+                    case "-di" or "--diarize": diarize = true; break;
+                    case "-otxt" or "--output-txt": output_txt = true; break;
+                    case "-ovtt" or "--output-vtt": output_vtt = true; break;
+                    case "-osrt" or "--output-srt": output_srt = true; break;
+                    case "-ps" or "--print-special": print_special = true; break;
+                    case "-nc" or "--no-colors": print_colors = false; break;
+                    case "-pp" or "--print-progress": print_progress = true; break;
+                    case "-nt" or "--no-timestamps": no_timestamps = true; break;
+                    case "-l" or "--language": language = parseLanguage( optionValue( argv, ref i ) ); break;
+                    case "--prompt": prompt = parsePrompt( optionValue( argv, ref i ) ); break;
+                    case "-m" or "--model": model = optionValue( argv, ref i ); break;
+                    case "-f" or "--file": fileNames.Add( optionValue( argv, ref i ) ); break;
+                    default:
+                        throw new ArgumentException( $"Unknown argument: \"{arg}\"" );
+                }
+            }
+
+            if( string.IsNullOrWhiteSpace( model ) )
+                throw new ArgumentException( "The model file is not provided in the arguments" );
+            if( !File.Exists( model ) )
+                throw new FileNotFoundException( "Model not found", model );
+            if( fileNames.Count <= 0 )
+                throw new ArgumentException( "Please supply at least 1 input audio file to process" );
+        }
+
+        static string cstr( bool b ) => b.ToString();
+
+        /// <summary>Convert the prompt text into tokens; only empty prompts are supported so far</summary>
+        static int[]? parsePrompt( string str )
+        {
+            if( string.IsNullOrWhiteSpace( str ) )
+                return null;
+            // TODO: expose whisper_tokenize function, as a method of iModel COM interface
+            throw new NotImplementedException();
+        }
+
+        /// <summary>Print the usage text. The values in square brackets are the current values of the fields,
+        /// i.e. the defaults unless an earlier argument has changed them.</summary>
+        void printUsage()
+        {
+            Console.WriteLine();
+
+            Console.WriteLine( "usage: {0} [options] file0.mp3 file1.wma ...", Path.GetFileName( Assembly.GetExecutingAssembly().Location ) );
+            Console.WriteLine();
+            Console.WriteLine( "options:" );
+            Console.WriteLine( " -h, --help [default] show this help message and exit" );
+            Console.WriteLine( " -t N, --threads N [{0,-7:D}] number of threads to use during computation", n_threads );
+            Console.WriteLine( " -ot N, --offset-t N [{0,-7:D}] time offset in milliseconds", offset_t_ms );
+            Console.WriteLine( " -on N, --offset-n N [{0,-7:D}] segment index offset", offset_n );
+            Console.WriteLine( " -d N, --duration N [{0,-7:D}] duration of audio to process in milliseconds", duration_ms );
+            Console.WriteLine( " -mc N, --max-context N [{0,-7:D}] maximum number of text context tokens to store", max_context );
+            Console.WriteLine( " -ml N, --max-len N [{0,-7:D}] maximum segment length in characters", max_len );
+            Console.WriteLine( " -wt N, --word-thold N [{0,-7:F2}] word timestamp probability threshold", word_thold );
+            Console.WriteLine( " -su, --speed-up [{0,-7}] speed up audio by x2 (reduced accuracy)", cstr( speed_up ) );
+            Console.WriteLine( " -tr, --translate [{0,-7}] translate from source language to english", cstr( translate ) );
+            Console.WriteLine( " -di, --diarize [{0,-7}] stereo audio diarization", cstr( diarize ) );
+            Console.WriteLine( " -otxt, --output-txt [{0,-7}] output result in a text file", cstr( output_txt ) );
+            Console.WriteLine( " -ovtt, --output-vtt [{0,-7}] output result in a vtt file", cstr( output_vtt ) );
+            Console.WriteLine( " -osrt, --output-srt [{0,-7}] output result in a srt file", cstr( output_srt ) );
+            Console.WriteLine( " -ps, --print-special [{0,-7}] print special tokens", cstr( print_special ) );
+            Console.WriteLine( " -nc, --no-colors [{0,-7}] do not print colors", cstr( !print_colors ) );
+            // The parser accepts this switch, so the help has to list it
+            Console.WriteLine( " -pp, --print-progress [{0,-7}] print progress", cstr( print_progress ) );
+            Console.WriteLine( " -nt, --no-timestamps [{0,-7}] do not print timestamps", cstr( no_timestamps ) );
+            Console.WriteLine( " -l LANG, --language LANG [{0,-7}] spoken language", language.getCode() );
+            Console.WriteLine( " --prompt PROMPT [ ] initial prompt" );
+            Console.WriteLine( " -m FNAME, --model FNAME [{0,-7}] model path", model );
+            Console.WriteLine( " -f FNAME, --file FNAME [{0,-7}] path of the input audio file", "" );
+        }
+    }
+} \ No newline at end of file
diff --git a/Examples/TranscribeCS/Transcribe.cs b/Examples/TranscribeCS/Transcribe.cs
new file mode 100644
index 0000000..6a1e500
--- /dev/null
+++ b/Examples/TranscribeCS/Transcribe.cs
@@ -0,0 +1,114 @@
+using System.Globalization;
+using Whisper;
+
+namespace TranscribeCS
+{
+    /// <summary>Implementation of Callbacks abstract class, to print these segments as soon as they're produced by the library.</summary>
+    sealed class Transcribe: Callbacks
+    {
+        readonly CommandLineArgs args;
+        readonly eResultFlags resultFlags;
+
+        /// <summary>Create the callbacks object, and switch the console output to UTF-8</summary>
+        public Transcribe( CommandLineArgs args )
+        {
+            this.args = args;
+            resultFlags = args.resultFlags();
+            Console.OutputEncoding = System.Text.Encoding.UTF8;
+        }
+
+        // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
+        // Lowest is red, middle is yellow, highest is green.
+        static readonly string[] k_colors = new string[]
+        {
+            "\x1B[38;5;196m", "\x1B[38;5;202m", "\x1B[38;5;208m", "\x1B[38;5;214m", "\x1B[38;5;220m",
+            "\x1B[38;5;226m", "\x1B[38;5;190m", "\x1B[38;5;154m", "\x1B[38;5;118m", "\x1B[38;5;82m"
+        };
+
+        /// <summary>Index in the color map for a token: the cube of the probability, scaled to the size of the map and clamped to its bounds</summary>
+        static int colorIndex( in sToken tok )
+        {
+            float p = tok.probability;
+            float p3 = p * p * p;
+            int col = (int)( p3 * k_colors.Length );
+            col = Math.Clamp( col, 0, k_colors.Length - 1 );
+            return col;
+        }
+
+        /// <summary>Format a time as hours:minutes:seconds.milliseconds</summary>
+        public static string printTime( TimeSpan ts ) =>
+            formatTime( ts, "':'mm':'ss'.'fff" );
+
+        /// <summary>Format a time as hours:minutes:seconds,milliseconds</summary>
+        public static string printTimeWithComma( TimeSpan ts ) =>
+            formatTime( ts, "':'mm':'ss','fff" );
+
+        /// <summary>Print the total count of hours with at least 2 digits, followed by the rest of the time formatted with <paramref name="tailFormat" /></summary>
+        static string formatTime( TimeSpan ts, string tailFormat )
+        {
+            // The "hh" format specifier only prints the 0-23 hours component and drops whole days,
+            // that's why the hours are computed from the ticks. The sign is ignored, same as custom TimeSpan formats do.
+            long totalHours = Math.Abs( ts.Ticks / TimeSpan.TicksPerHour );
+            return totalHours.ToString( "00", CultureInfo.InvariantCulture ) + ts.ToString( tailFormat, CultureInfo.InvariantCulture );
+        }
+
+        /// <summary>Write the tokens of the segment, each one in the color selected by its probability.</summary>
+        /// <remarks>Special tokens are skipped unless the print_special option is set</remarks>
+        void writeColoredTokens( TranscribeResult res, sSegment seg )
+        {
+            foreach( sToken tok in res.getTokens( seg ) )
+            {
+                if( !args.print_special && tok.hasFlag( eTokenFlags.Special ) )
+                    continue;
+                Console.Write( "{0}{1}{2}", k_colors[ colorIndex( tok ) ], tok.text, "\x1B[0m" );
+            }
+        }
+
+        /// <summary>Print the segments which were just added to the results</summary>
+        /// <param name="sender">Context to get the results from</param>
+        /// <param name="countNew">Count of new segments; they are the last ones in the results</param>
+        protected override void onNewSegment( Context sender, int countNew )
+        {
+            TranscribeResult res = sender.results( resultFlags );
+            bool colors = args.print_colors && AnsiCodes.enabled;
+
+            int s0 = res.segments.Length - countNew;
+            // An empty line before the very first segment
+            if( s0 == 0 )
+                Console.WriteLine();
+
+            for( int i = s0; i < res.segments.Length; i++ )
+            {
+                sSegment seg = res.segments[ i ];
+
+                if( args.no_timestamps )
+                {
+                    // Without timestamps the text is a continuous stream, no line breaks between segments
+                    if( colors )
+                        writeColoredTokens( res, seg );
+                    else
+                        Console.Write( seg.text );
+                    Console.Out.Flush();
+                    continue;
+                }
+
+                string speaker = "";
+#if false
+                if( args.diarize && pcmf32s.size() == 2 )
+                {
+                    const size_t n_samples = pcmf32s[ 0 ].size();
+                    const int64_t is0 = SourceAudio::sampleFromTimestamp( seg.time.begin, n_samples );
+                    const int64_t is1 = SourceAudio::sampleFromTimestamp( seg.time.end, n_samples );
+
+                    double energy0 = 0.0f;
+                    double energy1 = 0.0f;
+
+                    for( int64_t j = is0; j < is1; j++ )
+                    {
+                        energy0 += fabs( pcmf32s[ 0 ][ j ] );
+                        energy1 += fabs( pcmf32s[ 1 ][ j ] );
+                    }
+
+                    if( energy0 > 1.1 * energy1 )
+                        speaker = "(speaker 0)";
+                    else if( energy1 > 1.1 * energy0 )
+                        speaker = "(speaker 1)";
+                    else
+                        speaker = "(speaker ?)";
+
+                    //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
+                }
+#endif
+                if( colors )
+                {
+                    // The speaker label belongs to the segment, print it once and not in front of every token
+                    Console.Write( "[{0} --> {1}] {2}", printTime( seg.time.begin ), printTime( seg.time.end ), speaker );
+                    writeColoredTokens( res, seg );
+                    Console.WriteLine();
+                }
+                else
+                    Console.WriteLine( "[{0} --> {1}] {2}{3}", printTime( seg.time.begin ), printTime( seg.time.end ), speaker, seg.text );
+            }
+        }
+    }
+} \ No newline at end of file
diff --git a/Examples/TranscribeCS/TranscribeCS.cs b/Examples/TranscribeCS/TranscribeCS.cs
new file mode 100644
index 0000000..9b828e3
--- /dev/null
+++ b/Examples/TranscribeCS/TranscribeCS.cs
@@ -0,0 +1,102 @@
+using Whisper;
+
+namespace TranscribeCS
+{
+    /// <summary>Entry point of the example: loads the model, transcribes every input file, and optionally saves the text next to the audio.</summary>
+    static class Program
+    {
+        // True to decode the file while it is being transcribed, false to load the complete audio first
+        static readonly bool streamAudio = true;
+
+        /// <summary>Parse the arguments; returns null when the parser has cancelled the operation, which is what it does after printing the help</summary>
+        static CommandLineArgs? parseArguments( string[] args )
+        {
+            try
+            {
+                return new CommandLineArgs( args );
+            }
+            catch( OperationCanceledException )
+            {
+                return null;
+            }
+        }
+
+        /// <summary>Run the model on a single audio file, with the callbacks printing the segments</summary>
+        static void transcribeFile( Context context, iMediaFoundation mf, Transcribe transcribe, string audioFile, int[]? prompt )
+        {
+            if( streamAudio )
+            {
+                using iAudioReader reader = mf.openAudioFile( audioFile );
+                context.runFull( reader, transcribe, null, prompt );
+                return;
+            }
+
+            using iAudioBuffer buffer = mf.loadAudioFile( audioFile );
+            context.runFull( buffer, transcribe, prompt );
+        }
+
+        /// <summary>Entry point of the program</summary>
+        /// <returns>0 on success, 1 when the argument parser cancelled the run, HResult of the exception when anything failed</returns>
+        static int Main( string[] args )
+        {
+            try
+            {
+                CommandLineArgs? cla = parseArguments( args );
+                if( cla is null )
+                    return 1;
+
+                Library.setLogSink( eLogLevel.Debug, eLoggerFlags.UseStandardError | eLoggerFlags.SkipFormatMessage );
+
+                using iModel model = Library.loadModel( cla.model );
+                using Context context = model.createContext();
+                cla.apply( ref context.parameters );
+                using iMediaFoundation mf = Library.initMediaFoundation();
+                Transcribe transcribe = new Transcribe( cla );
+
+                foreach( string audioFile in cla.fileNames )
+                {
+                    transcribeFile( context, mf, transcribe, audioFile, cla.prompt );
+
+                    // When asked to, produce these text files
+                    if( cla.output_txt )
+                        writeTextFile( context, audioFile );
+                    if( cla.output_srt )
+                        writeSubRip( context, audioFile, cla );
+                    if( cla.output_vtt )
+                        writeWebVTT( context, audioFile );
+                }
+
+                context.timingsPrint();
+                return 0;
+            }
+            catch( Exception ex )
+            {
+                Console.WriteLine( ex.Message );
+                return ex.HResult;
+            }
+        }
+
+        /// <summary>Save the text of the segments into a *.txt file next to the audio, a line per segment</summary>
+        static void writeTextFile( Context context, string audioPath )
+        {
+            string path = Path.ChangeExtension( audioPath, ".txt" );
+            using StreamWriter writer = File.CreateText( path );
+            foreach( sSegment seg in context.results().segments )
+                writer.WriteLine( seg.text );
+        }
+
+        /// <summary>Save the segments into a *.srt file next to the audio.</summary>
+        /// <remarks>The numbering starts at 1 plus the segment index offset from the command line</remarks>
+        static void writeSubRip( Context context, string audioPath, CommandLineArgs cliArgs )
+        {
+            string path = Path.ChangeExtension( audioPath, ".srt" );
+            using StreamWriter writer = File.CreateText( path );
+
+            int index = 0;
+            foreach( sSegment seg in context.results( eResultFlags.Timestamps ).segments )
+            {
+                writer.WriteLine( index + 1 + cliArgs.offset_n );
+                index++;
+
+                // This format has a comma in front of the milliseconds
+                string begin = Transcribe.printTimeWithComma( seg.time.begin );
+                string end = Transcribe.printTimeWithComma( seg.time.end );
+                writer.WriteLine( begin + " --> " + end );
+                writer.WriteLine( seg.text );
+                writer.WriteLine();
+            }
+        }
+
+        /// <summary>Save the segments into a *.vtt file next to the audio</summary>
+        static void writeWebVTT( Context context, string audioPath )
+        {
+            string path = Path.ChangeExtension( audioPath, ".vtt" );
+            using StreamWriter writer = File.CreateText( path );
+
+            // Header of the file, followed by an empty line
+            writer.WriteLine( "WEBVTT" );
+            writer.WriteLine();
+
+            var segments = context.results( eResultFlags.Timestamps ).segments;
+            for( int i = 0; i < segments.Length; i++ )
+            {
+                sSegment seg = segments[ i ];
+                string begin = Transcribe.printTime( seg.time.begin );
+                string end = Transcribe.printTime( seg.time.end );
+                writer.WriteLine( begin + " --> " + end );
+                writer.WriteLine( seg.text );
+                writer.WriteLine();
+            }
+        }
+    }
+} \ No newline at end of file
diff --git a/Examples/TranscribeCS/TranscribeCS.csproj b/Examples/TranscribeCS/TranscribeCS.csproj
new file mode 100644
index 0000000..e9b8d0f
--- /dev/null
+++ b/Examples/TranscribeCS/TranscribeCS.csproj
@@ -0,0 +1,19 @@
+<Project Sdk="Microsoft.NET.Sdk"> <!-- Project file of the TranscribeCS example -->
+ <PropertyGroup>
+ <OutputType>Exe</OutputType> <!-- The output is an executable -->
+ <TargetFramework>net6.0-windows</TargetFramework> <!-- .NET 6, Windows only -->
+ <ImplicitUsings>enable</ImplicitUsings> <!-- That's why the sources don't import System, System.IO, etc. -->
+ <Nullable>enable</Nullable> <!-- Nullable reference types are on -->
+ <CheckForOverflowUnderflow>true</CheckForOverflowUnderflow> <!-- Integer arithmetic is overflow-checked by default -->
+ <AppendTargetFrameworkToOutputPath>false</AppendTargetFrameworkToOutputPath> <!-- No target framework subfolder in the output path -->
+ <Platforms>x64</Platforms> <!-- x64 is the only platform of this project -->
+ </PropertyGroup>
+ <ItemGroup>
+ <Content Include="..\..\x64\$(Configuration)\Whisper.dll" Link="Whisper.dll"> <!-- Whisper.dll from the x64 folder two levels up, matching the current configuration -->
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> <!-- Copied to the output directory when newer -->
+ </Content>
+ </ItemGroup>
+ <ItemGroup>
+ <ProjectReference Include="..\..\WhisperNet\WhisperNet.csproj" /> <!-- Presumably the source of the Whisper namespace used by the example; confirm in that project -->
+ </ItemGroup>
+</Project> \ No newline at end of file