Examples/main/params.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

#include "params.h"
#include <algorithm>
#include <stdexcept>
#include <thread>
#include "miscUtils.h"

// Sets the default number of worker threads: 2 in debug builds,
// otherwise the detected hardware thread count limited to the range [ 1 .. 4 ].
whisper_params::whisper_params()
{
#ifdef _DEBUG
	n_threads = 2;
#else
	// hardware_concurrency() is allowed to return 0 when the count can't be determined;
	// without the lower bound that would produce a default of 0 threads.
	const unsigned int detected = std::thread::hardware_concurrency();
	n_threads = std::max( 1u, std::min( 4u, detected ) );
#endif
}

namespace
{
	// Returns the literal "true" or "false" for the given boolean.
	// The result points to a string literal, the caller must not free or modify it.
	const char* cstr( bool b )
	{
		if( b )
			return "true";
		return "false";
	}
}

// Writes the command-line help text to stderr.
// argv[ 0 ] is printed as the program name; argc is not used.
// The value in square brackets next to each option is taken from `params`,
// so the text shows whatever that object currently holds.
void whisper_print_usage( int argc, wchar_t** argv, const whisper_params& params )
{
	FILE* const out = stderr;
	const wchar_t* const exeName = argv[ 0 ];

	// Header
	fputs( "\n", out );
	fprintf( out, "usage: %S [options] file0.wav file1.wav ...\n", exeName );
	fputs( "\n", out );
	fputs( "options:\n", out );
	fputs( "  -h,       --help          [default] show this help message and exit\n", out );

	// Options which take a numeric value
	fprintf( out, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads );
	fprintf( out, "  -p N,     --processors N  [%-7d] number of processors to use during computation\n", params.n_processors );
	fprintf( out, "  -ot N,    --offset-t N    [%-7d] time offset in milliseconds\n", params.offset_t_ms );
	fprintf( out, "  -on N,    --offset-n N    [%-7d] segment index offset\n", params.offset_n );
	fprintf( out, "  -d  N,    --duration N    [%-7d] duration of audio to process in milliseconds\n", params.duration_ms );
	fprintf( out, "  -mc N,    --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context );
	fprintf( out, "  -ml N,    --max-len N     [%-7d] maximum segment length in characters\n", params.max_len );
	fprintf( out, "  -wt N,    --word-thold N  [%-7.2f] word timestamp probability threshold\n", params.word_thold );

	// Boolean switches
	fprintf( out, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n", cstr( params.speed_up ) );
	fprintf( out, "  -tr,      --translate     [%-7s] translate from source language to english\n", cstr( params.translate ) );
	fprintf( out, "  -di,      --diarize       [%-7s] stereo audio diarization\n", cstr( params.diarize ) );
	fprintf( out, "  -otxt,    --output-txt    [%-7s] output result in a text file\n", cstr( params.output_txt ) );
	fprintf( out, "  -ovtt,    --output-vtt    [%-7s] output result in a vtt file\n", cstr( params.output_vtt ) );
	fprintf( out, "  -osrt,    --output-srt    [%-7s] output result in a srt file\n", cstr( params.output_srt ) );
	fprintf( out, "  -owts,    --output-words  [%-7s] output script for generating karaoke video\n", cstr( params.output_wts ) );
	fprintf( out, "  -ps,      --print-special [%-7s] print special tokens\n", cstr( params.print_special ) );
	// The switch is "no colors" while the field is "print colors", hence the negation
	fprintf( out, "  -nc,      --no-colors     [%-7s] do not print colors\n", cstr( !params.print_colors ) );
	fprintf( out, "  -nt,      --no-timestamps [%-7s] do not print timestamps\n", cstr( params.no_timestamps ) );

	// Options which take a string value; the language is printed with %s and the model path with %S
	fprintf( out, "  -l LANG,  --language LANG [%-7s] spoken language\n", params.language.c_str() );
	fprintf( out, "  -m FNAME, --model FNAME   [%-7S] model path\n", params.model.c_str() );
	fprintf( out, "  -f FNAME, --file FNAME    [%-7s] path of the input audio file\n", "" );
	fputs( "\n", out );
}

bool whisper_params::parse( int argc, wchar_t* argv[] )
{
	for( int i = 1; i < argc; i++ )
	{
		std::wstring arg = argv[ i ];

		if( arg[ 0 ] != '-' )
		{
			fname_inp.push_back( arg );
			continue;
		}

		if( arg == L"-h" || arg == L"--help" )
		{
			whisper_print_usage( argc, argv, *this );
			return false;
		}

		else if( arg == L"-t" || arg == L"--threads" ) { n_threads = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-p" || arg == L"--processors" ) { n_processors = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-ot" || arg == L"--offset-t" ) { offset_t_ms = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-on" || arg == L"--offset-n" ) { offset_n = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-d" || arg == L"--duration" ) { duration_ms = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-mc" || arg == L"--max-context" ) { max_context = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-ml" || arg == L"--max-len" ) { max_len = std::stoul( argv[ ++i ] ); }
		else if( arg == L"-wt" || arg == L"--word-thold" ) { word_thold = std::stof( argv[ ++i ] ); }
		else if( arg == L"-su" || arg == L"--speed-up" ) { speed_up = true; }
		else if( arg == L"-tr" || arg == L"--translate" ) { translate = true; }
		else if( arg == L"-di" || arg == L"--diarize" ) { diarize = true; }
		else if( arg == L"-otxt" || arg == L"--output-txt" ) { output_txt = true; }
		else if( arg == L"-ovtt" || arg == L"--output-vtt" ) { output_vtt = true; }
		else if( arg == L"-osrt" || arg == L"--output-srt" ) { output_srt = true; }
		else if( arg == L"-owts" || arg == L"--output-words" ) { output_wts = true; }
		else if( arg == L"-ps" || arg == L"--print-special" ) { print_special = true; }
		else if( arg == L"-nc" || arg == L"--no-colors" ) { print_colors = false; }
		else if( arg == L"-nt" || arg == L"--no-timestamps" ) { no_timestamps = true; }
		else if( arg == L"-l" || arg == L"--language" ) { language = utf8( argv[ ++i ] ); }
		else if( arg == L"-m" || arg == L"--model" ) { model = argv[ ++i ]; }
		else if( arg == L"-f" || arg == L"--file" ) { fname_inp.push_back( argv[ ++i ] ); }
		else
		{
			fprintf( stderr, "error: unknown argument: %S\n", arg.c_str() );
			whisper_print_usage( argc, argv, *this );
			return false;
		}
	}
	return true;
}