1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
# This file provides mappings between language codes used by different
# third-party libraries.
# Whisper language name -> NLLB (FLORES-200) language code.
#
# Keys are the lowercase English language names as reported by Whisper;
# values are NLLB codes of the form "<iso639-3>_<script>".
# Every value in this table must also be a key of `nllb_to_ss` below.
whisper_to_nllb = {
    "catalan": "cat_Latn",
    "czech": "ces_Latn",
    "danish": "dan_Latn",
    "dutch": "nld_Latn",
    "english": "eng_Latn",
    "finnish": "fin_Latn",
    "french": "fra_Latn",
    "german": "deu_Latn",
    "greek": "ell_Grek",
    "hungarian": "hun_Latn",
    "icelandic": "isl_Latn",
    "italian": "ita_Latn",
    "latvian": "lvs_Latn",  # standard Latvian
    "lithuanian": "lit_Latn",
    "norwegian": "nob_Latn",  # Norwegian Bokmal
    "polish": "pol_Latn",
    "portuguese": "por_Latn",
    "romanian": "ron_Latn",
    "russian": "rus_Cyrl",
    "slovak": "slk_Latn",
    "slovenian": "slv_Latn",
    "spanish": "spa_Latn",
    "swedish": "swe_Latn",
    "turkish": "tur_Latn",
    # Legacy aliases: earlier revisions of this table used these spellings,
    # which are not the names Whisper emits. Kept so existing callers that
    # look them up do not break.
    "portugese": "por_Latn",
    "slovene": "slv_Latn",
}

# NLLB (FLORES-200) language code -> sentence_splitter (SS) language code.
#
# Values are the two-letter codes passed to sentence_splitter.
nllb_to_ss = {
    "cat_Latn": "ca",  # Catalan
    "ces_Latn": "cs",  # Czech
    "dan_Latn": "da",  # Danish
    "nld_Latn": "nl",  # Dutch
    "eng_Latn": "en",  # English
    "fin_Latn": "fi",  # Finnish
    "fra_Latn": "fr",  # French
    "deu_Latn": "de",  # German
    "ell_Grek": "el",  # Greek
    "hun_Latn": "hu",  # Hungarian
    "isl_Latn": "is",  # Icelandic
    "ita_Latn": "it",  # Italian
    "lvs_Latn": "lv",  # Latvian
    "lit_Latn": "lt",  # Lithuanian
    "nob_Latn": "no",  # Norwegian (Bokmal)
    "pol_Latn": "pl",  # Polish
    "por_Latn": "pt",  # Portuguese
    "ron_Latn": "ro",  # Romanian
    "rus_Cyrl": "ru",  # Russian
    "slk_Latn": "sk",  # Slovak
    "slv_Latn": "sl",  # Slovenian
    "spa_Latn": "es",  # Spanish
    "swe_Latn": "sv",  # Swedish
    "tur_Latn": "tr",  # Turkish
    # Legacy alias: earlier revisions misspelled the Catalan code as
    # "cat_Ltn". Kept so callers holding the old value still resolve.
    "cat_Ltn": "ca",
}
|