summaryrefslogtreecommitdiffstats
path: root/Scripts/lang_compat.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2023-05-30 19:01:56 -0700
committeryum <yum.food.vr@gmail.com>2023-05-30 19:13:25 -0700
commit0bda49279ec80187d49a922ff2a47141ffb2fd8f (patch)
treebd2521c2b2cdca422eb94e1dbef7c85e8dcefe4b /Scripts/lang_compat.py
parent84f09e1fdf15644d1ea5f955889581932e4f6a8e (diff)
Finish translation for Western European language speakers [v0.12.0]
NLLB needs its input to be split up into sentences. I use the sentence_splitter Python package to do this. It supports ~20 Western European languages, but notably, no Asian languages.
* Sort spoken language list. English is still at the top.
* Remove 'Translation source' dropdown. Infer this from the spoken language.
* Add lang_compat.py to map language codes between the various libraries (whisper, nllb, sentence_splitter).
* Fix bug where old text would appear in textbox when you first bring it up.
Diffstat (limited to 'Scripts/lang_compat.py')
-rw-r--r--Scripts/lang_compat.py58
1 files changed, 58 insertions, 0 deletions
diff --git a/Scripts/lang_compat.py b/Scripts/lang_compat.py
new file mode 100644
index 0000000..af35921
--- /dev/null
+++ b/Scripts/lang_compat.py
@@ -0,0 +1,58 @@
+# This file provides mappings between language codes used by different
+# third-party libraries.
+
+# Whisper to NLLB: lowercase English language name -> NLLB code ("<language>_<script>"). Each value must be a key of nllb_to_ss.
+whisper_to_nllb = {
+ "catalan": "cat_Latn", # catalan (was misspelled "cat_Ltn"; every other Latin-script code here ends in "_Latn")
+ "czech": "ces_Latn", # czech
+ "danish": "dan_Latn", # danish
+ "dutch": "nld_Latn", # dutch
+ "english": "eng_Latn", # english
+ "finnish": "fin_Latn", # finnish
+ "french": "fra_Latn", # french
+ "german": "deu_Latn", # german
+ "greek": "ell_Grek", # greek
+ "hungarian": "hun_Latn", # hungarian
+ "icelandic": "isl_Latn", # icelandic
+ "italian": "ita_Latn", # italian
+ "latvian": "lvs_Latn", # latvian
+ "lithuanian": "lit_Latn", # lithuanian
+ "norwegian": "nob_Latn", # norwegian (bokmal)
+ "polish": "pol_Latn", # polish
+ "portugese": "por_Latn", # portuguese; NOTE(review): key is spelled "portugese", Whisper's name appears to be "portuguese" - confirm against callers
+ "romanian": "ron_Latn", # romanian
+ "russian": "rus_Cyrl", # russian
+ "slovak": "slk_Latn", # slovak
+ "slovene": "slv_Latn", # slovene; NOTE(review): Whisper appears to call this language "slovenian" - confirm against callers
+ "spanish": "spa_Latn", # spanish
+ "swedish": "swe_Latn", # swedish
+ "turkish": "tur_Latn", # turkish
+ }
+
+# NLLB to sentence_splitter (SS): NLLB code -> two-letter SS language code. Has one entry per value of whisper_to_nllb.
+nllb_to_ss = {
+ "cat_Latn": "ca", # catalan (key was misspelled "cat_Ltn"; must match the whisper_to_nllb value)
+ "ces_Latn": "cs", # czech
+ "dan_Latn": "da", # danish
+ "nld_Latn": "nl", # dutch
+ "eng_Latn": "en", # english
+ "fin_Latn": "fi", # finnish
+ "fra_Latn": "fr", # french
+ "deu_Latn": "de", # german
+ "ell_Grek": "el", # greek
+ "hun_Latn": "hu", # hungarian
+ "isl_Latn": "is", # icelandic
+ "ita_Latn": "it", # italian
+ "lvs_Latn": "lv", # latvian
+ "lit_Latn": "lt", # lithuanian
+ "nob_Latn": "no", # norwegian (bokmal)
+ "pol_Latn": "pl", # polish
+ "por_Latn": "pt", # portuguese
+ "ron_Latn": "ro", # romanian
+ "rus_Cyrl": "ru", # russian
+ "slk_Latn": "sk", # slovak
+ "slv_Latn": "sl", # slovene
+ "spa_Latn": "es", # spanish
+ "swe_Latn": "sv", # swedish
+ "tur_Latn": "tr", # turkish
+ }