summaryrefslogtreecommitdiffstats
path: root/Tools/CompressShaders/LanguageCodes.cs
blob: 0843fc72c24b4af6745a499bcd3b260fe821bf8e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
using System.Globalization;
using System.Text.RegularExpressions;

namespace CompressShaders
{
	/// <summary>Code generator which reads the tab-separated table of language codes,
	/// and writes two source files from it: a C++ include file and a C# enum.</summary>
	static class LanguageCodes
	{
		/// <summary>One parsed row of the TSV table</summary>
		record struct Row
		{
			/// <summary>Key string exactly as written in the first column</summary>
			public string keySource;
			/// <summary>The key packed into an integer by <see cref="makeKey" /></summary>
			public uint keyValue;
			/// <summary>Integer parsed from the second column</summary>
			public int code;
			/// <summary>Language name from the third column</summary>
			public string name;
		}

		/// <summary>Pack up to 4 ASCII characters into a 32-bit integer; the first character goes to the lowest byte.</summary>
		/// <remarks>When the string is shorter than 4 characters, the remaining high bytes stay zero.</remarks>
		/// <exception cref="ArgumentException">The string is longer than 4 characters, or contains a character outside of the 7-bit ASCII range</exception>
		static uint makeKey( string str )
		{
			if( str.Length > 4 )
				throw new ArgumentException( $"The key \"{str}\" is longer than 4 characters", nameof( str ) );
			uint k = 0;
			int shift = 0;
			foreach( char c in str )
			{
				// Every character must fit into a single byte of the key
				if( c >= 0x80 )
					throw new ArgumentException( $"The key \"{str}\" contains a non-ASCII character", nameof( str ) );
				k |= ( (uint)c << shift );
				shift += 8;
			}
			return k;
		}

		/// <summary>Lazily parse the TSV file; blank lines are skipped.</summary>
		/// <param name="path">Path to the tab-separated file with 3 columns: key, integer code, name</param>
		/// <exception cref="FormatException">A line has fewer than 3 fields, or the second field is not an integer</exception>
		static IEnumerable<Row> load( string path )
		{
			using var stm = File.OpenText( path );
			int lineNumber = 0;
			while( true )
			{
				string? line = stm.ReadLine();
				if( null == line )
					break;
				lineNumber++;
				if( string.IsNullOrWhiteSpace( line ) )
					continue;

				string[] fields = line.Split( '\t' );
				// Report malformed rows with the file and line, instead of failing with an index out of range
				if( fields.Length < 3 )
					throw new FormatException( $"{path}({lineNumber}): expected 3 tab-separated fields, found {fields.Length}" );
				// The table is machine-readable data, parse the number regardless of the culture of the build machine
				if( !int.TryParse( fields[ 1 ], NumberStyles.Integer, CultureInfo.InvariantCulture, out int code ) )
					throw new FormatException( $"{path}({lineNumber}): \"{fields[ 1 ]}\" is not a valid integer" );

				yield return new Row()
				{
					keySource = fields[ 0 ],
					keyValue = makeKey( fields[ 0 ] ),
					code = code,
					name = fields[ 2 ]
				};
			}
		}

		/// <summary>Write the C++ include file, one <c>Lang{ key, code, "name" },</c> initializer per row.</summary>
		/// <param name="inl">Path of the output file, overwritten if it exists</param>
		/// <param name="data">Rows to write, in the output order</param>
		static void writeCpp( string inl, Row[] data )
		{
			// TODO [very low]: sort them by the key here, then in C++ use binary search instead of the hash map
			using var stm = File.CreateText( inl );
			stm.WriteLine( "// This file is generated by a tool, from the `languageCodez.tsv` file in this repository" );
			foreach( Row row in data )
			{
				// Invariant culture, the output is source code and must not depend on the regional settings
				string line = string.Format( CultureInfo.InvariantCulture, "Lang{{ 0x{0:X}, {1}, \"{2}\" }},", row.keyValue, row.code, row.name );
				stm.WriteLine( line );
			}
		}

		/// <summary>Fixed culture for the case conversion, so the generated identifiers are the same on every machine</summary>
		static readonly CultureInfo ci = new CultureInfo( "en-US", false );

		/// <summary>Convert the string to lower case, then to title case, both using the en-US culture</summary>
		static string titleCase( this string name ) =>
			ci.TextInfo.ToTitleCase( name.ToLower( ci ) );

		/// <summary>Write the C# source file with the <c>eLanguage</c> enum, one member per row.</summary>
		/// <remarks>The member name is the title-cased language name with all whitespace removed;
		/// the member value is the packed key.</remarks>
		/// <param name="cs">Path of the output file, overwritten if it exists</param>
		/// <param name="data">Rows to write, in the output order</param>
		static void writeCs( string cs, Row[] data )
		{
			using var stm = File.CreateText( cs );
			stm.WriteLine( @"// This file is generated by a tool, from the `languageCodez.tsv` file in this repository
namespace Whisper
{
	/// <summary>Supported languages</summary>
	/// <remarks>The values of this enum are zero-padded ASCII strings.<br/>
	/// It seems OpenAI tried to implement ISO 639-1, but they used the version of the standard from 1988.</remarks>
	public enum eLanguage: uint
	{" );

			foreach( Row row in data )
			{
				// Human-readable name goes to the documentation comment of the enum member
				string tc = row.name.titleCase();
				stm.WriteLine( "		/// <summary>{0}</summary>", tc );
				// Same name without the whitespace becomes the identifier
				tc = Regex.Replace( tc, @"\s+", string.Empty );
				string member = string.Format( CultureInfo.InvariantCulture, "		{0} = 0x{1:X},  // \"{2}\"", tc, row.keyValue, row.keySource );
				stm.WriteLine( member );
			}
			stm.Write( @"	}
}" );
		}

		/// <summary>Load the table, sort it by the language name, and write both output files.</summary>
		static void produce( string tsv, string inl, string cs )
		{
			// Ordinal comparison: the default comparer is culture-sensitive, it would make the order of the generated code depend on the build machine
			Row[] data = load( tsv ).OrderBy( r => r.name, StringComparer.Ordinal ).ToArray();
			writeCpp( inl, data );
			writeCs( cs, data );
		}

		/// <summary>Regenerate <c>languageCodez.inl</c> and <c>eLanguage.cs</c> from <c>languageCodez.tsv</c>.</summary>
		/// <param name="solutionRoot">Root directory of the solution; all 3 files are located relative to it</param>
		/// <exception cref="FormatException">The TSV file contains a malformed line</exception>
		/// <exception cref="ArgumentException">A key in the TSV file is longer than 4 characters, or is not ASCII</exception>
		public static void produce( string solutionRoot )
		{
			// Separate path segments instead of embedded backslashes; same result on Windows, and works on other platforms too
			string tsv = Path.Combine( solutionRoot, "Whisper", "Whisper", "languageCodez.tsv" );
			string inl = Path.Combine( solutionRoot, "Whisper", "Whisper", "languageCodez.inl" );
			string cs = Path.Combine( solutionRoot, "WhisperNet", "API", "eLanguage.cs" );
			produce( tsv, inl, cs );
		}
	}
}