1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
using System.Globalization;
using System.Text.RegularExpressions;
namespace CompressShaders
{
/// <summary>Code generator which reads the tab-separated language table <c>languageCodez.tsv</c>,
/// and writes two source files from it: <c>languageCodez.inl</c> with one <c>Lang{ … },</c> entry per language,
/// and <c>eLanguage.cs</c> with the <c>Whisper.eLanguage</c> enum.</summary>
static class LanguageCodes
{
	/// <summary>One parsed row of the TSV table</summary>
	record struct Row
	{
		/// <summary>First column of the table, verbatim</summary>
		public string keySource;
		/// <summary><see cref="keySource" /> packed into an integer by <see cref="makeKey" /></summary>
		public uint keyValue;
		/// <summary>Second column of the table, parsed as an integer</summary>
		public int code;
		/// <summary>Third column of the table, verbatim</summary>
		public string name;
	}

	/// <summary>Pack an ASCII string of up to 4 characters into a 32-bit integer.</summary>
	/// <remarks>The first character goes to the lowest byte; bytes past the end of the string stay zero.</remarks>
	/// <exception cref="ArgumentException">The string is longer than 4 characters, or contains a character outside of the 7-bit ASCII range</exception>
	static uint makeKey( string str )
	{
		// 4 characters * 8 bits = 32 bits, the size of the result
		const int maxLength = 4;
		if( str.Length > maxLength )
			throw new ArgumentException( $"Language key \"{str}\" is longer than {maxLength} characters", nameof( str ) );

		uint key = 0;
		int shift = 0;
		foreach( char c in str )
		{
			if( c >= 0x80 )
				throw new ArgumentException( $"Language key \"{str}\" contains a non-ASCII character", nameof( str ) );
			key |= (uint)c << shift;
			shift += 8;
		}
		return key;
	}

	/// <summary>Lazily parse the TSV file; empty and whitespace-only lines are skipped.</summary>
	/// <remarks>The file is opened when the enumeration starts, and closed when it completes or is disposed.</remarks>
	/// <exception cref="InvalidDataException">A row has less than 3 fields, or the second field is not an integer</exception>
	static IEnumerable<Row> load( string path )
	{
		using var stm = File.OpenText( path );
		int lineNumber = 0;
		while( true )
		{
			string? line = stm.ReadLine();
			if( null == line )
				break;
			lineNumber++;
			if( string.IsNullOrWhiteSpace( line ) )
				continue;

			string[] fields = line.Split( '\t' );
			if( fields.Length < 3 )
				throw new InvalidDataException( $"{path}({lineNumber}): expected 3 tab-separated fields, found {fields.Length}" );

			// The table is machine-readable data, parse the number regardless of the culture of the build machine
			if( !int.TryParse( fields[ 1 ], NumberStyles.Integer, CultureInfo.InvariantCulture, out int code ) )
				throw new InvalidDataException( $"{path}({lineNumber}): \"{fields[ 1 ]}\" is not a valid integer" );

			yield return new Row()
			{
				keySource = fields[ 0 ],
				keyValue = makeKey( fields[ 0 ] ),
				code = code,
				name = fields[ 2 ]
			};
		}
	}

	/// <summary>Write the C++ include file: a header comment, then one <c>Lang{ key, code, "name" },</c> line per row.</summary>
	static void writeCpp( string inl, Row[] data )
	{
		// TODO [very low]: sort them by the key here, then in C++ use binary search instead of the hash map
		using var stm = File.CreateText( inl );
		stm.WriteLine( "// This file is generated by a tool, from the `languageCodez.tsv` file in this repository" );
		foreach( Row row in data )
		{
			// Invariant culture, so the generated source doesn't depend on the regional settings of the build machine
			string line = string.Format( CultureInfo.InvariantCulture,
				"Lang{{ 0x{0:X}, {1}, \"{2}\" }},", row.keyValue, row.code, row.name );
			stm.WriteLine( line );
		}
	}

	/// <summary>Fixed culture for <see cref="titleCase" />, to keep the generated identifiers independent of the current culture</summary>
	static readonly CultureInfo ci = new CultureInfo( "en-US", false );

	/// <summary>Convert the string to lower case, then to title case, both with the en-US culture</summary>
	static string titleCase( this string name ) =>
		ci.TextInfo.ToTitleCase( name.ToLower( ci ) );

	/// <summary>Write the C# source file with the <c>Whisper.eLanguage</c> enum, one member per row.</summary>
	/// <remarks>Enum members are named with the title-cased language name with all whitespace removed, their values are the packed keys.</remarks>
	static void writeCs( string cs, Row[] data )
	{
		using var stm = File.CreateText( cs );
		stm.WriteLine( @"// This file is generated by a tool, from the `languageCodez.tsv` file in this repository
namespace Whisper
{
/// <summary>Supported languages</summary>
/// <remarks>The values of this enum are zero-padded ASCII strings.<br/>
/// It seems OpenAI tried to implement ISO 639-1, but they used the version of the standard from 1988.</remarks>
public enum eLanguage: uint
{" );
		foreach( Row row in data )
		{
			string tc = row.name.titleCase();
			stm.WriteLine( " /// <summary>{0}</summary>", tc );
			// Identifiers can't contain spaces, "Haitian Creole" becomes "HaitianCreole"
			tc = Regex.Replace( tc, @"\s+", string.Empty );
			string line = string.Format( CultureInfo.InvariantCulture,
				" {0} = 0x{1:X}, // \"{2}\"", tc, row.keyValue, row.keySource );
			stm.WriteLine( line );
		}
		stm.Write( @" }
}" );
	}

	/// <summary>Load the table, sort the rows by language name, and write both generated files.</summary>
	static void produce( string tsv, string inl, string cs )
	{
		// Ordinal comparison keeps the order of the generated code the same on every machine
		Row[] data = load( tsv ).OrderBy( r => r.name, StringComparer.Ordinal ).ToArray();
		writeCpp( inl, data );
		writeCs( cs, data );
	}

	/// <summary>Regenerate <c>Whisper/Whisper/languageCodez.inl</c> and <c>WhisperNet/API/eLanguage.cs</c>
	/// from <c>Whisper/Whisper/languageCodez.tsv</c>; all three paths are relative to the solution root.</summary>
	/// <param name="solutionRoot">Root directory of the solution</param>
	public static void produce( string solutionRoot )
	{
		// Separate path segments instead of hard-coded backslashes, so the tool also works on non-Windows hosts
		string tsv = Path.Combine( solutionRoot, "Whisper", "Whisper", "languageCodez.tsv" );
		string inl = Path.Combine( solutionRoot, "Whisper", "Whisper", "languageCodez.inl" );
		string cs = Path.Combine( solutionRoot, "WhisperNet", "API", "eLanguage.cs" );
		produce( tsv, inl, cs );
	}
}
}
|