diff options
| author | yum <yum.food.vr@gmail.com> | 2023-05-30 19:01:56 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2023-05-30 19:13:25 -0700 |
| commit | 0bda49279ec80187d49a922ff2a47141ffb2fd8f (patch) | |
| tree | bd2521c2b2cdca422eb94e1dbef7c85e8dcefe4b /Scripts/lang_compat.py | |
| parent | 84f09e1fdf15644d1ea5f955889581932e4f6a8e (diff) | |
Finish translation for Western European language speakers (tag: v0.12.0)
NLLB needs its input to be split up into sentences. I use the
sentence_splitter Python package to do this. It supports ~20 Western
European languages, but notably, no Asian languages.
* Sort spoken language list. English is still at the top.
* Remove 'Translation source' dropdown. Infer this from the spoken
language.
* Add lang_compat.py to map language codes between the various libraries
(whisper, nllb, sentence_splitter).
* Fix bug where old text would appear in textbox when you first bring it
up.
Diffstat (limited to 'Scripts/lang_compat.py')
| -rw-r--r-- | Scripts/lang_compat.py | 58 |
1 files changed, 58 insertions, 0 deletions
# Scripts/lang_compat.py (new file in this commit)
#
# This file provides mappings between language codes used by different
# third-party libraries (whisper, NLLB, sentence_splitter).
#
# Every value in `whisper_to_nllb` is also a key of `nllb_to_ss`, so a
# spoken-language name can be chained through both tables:
#
#     nllb_to_ss[whisper_to_nllb["german"]]  ->  "de"

# Whisper language name -> NLLB language code (<iso639-3>_<script>).
whisper_to_nllb = {
    "catalan": "cat_Latn",  # was misspelled "cat_Ltn", which is not an NLLB code
    "czech": "ces_Latn",
    "danish": "dan_Latn",
    "dutch": "nld_Latn",
    "english": "eng_Latn",
    "finnish": "fin_Latn",
    "french": "fra_Latn",
    "german": "deu_Latn",
    "greek": "ell_Grek",
    "hungarian": "hun_Latn",
    "icelandic": "isl_Latn",
    "italian": "ita_Latn",
    "latvian": "lvs_Latn",
    "lithuanian": "lit_Latn",
    "norwegian": "nob_Latn",  # norwegian (bokmal)
    "polish": "pol_Latn",
    "portuguese": "por_Latn",
    # Legacy misspelling, kept so existing lookups keep working.
    "portugese": "por_Latn",
    "romanian": "ron_Latn",
    "russian": "rus_Cyrl",
    "slovak": "slk_Latn",
    "slovenian": "slv_Latn",
    # Legacy alias, kept so existing lookups keep working.
    "slovene": "slv_Latn",
    "spanish": "spa_Latn",
    "swedish": "swe_Latn",
    "turkish": "tur_Latn",
}

# NLLB language code -> sentence_splitter (SS) language code.
nllb_to_ss = {
    "cat_Latn": "ca",  # catalan
    "ces_Latn": "cs",  # czech
    "dan_Latn": "da",  # danish
    "nld_Latn": "nl",  # dutch
    "eng_Latn": "en",  # english
    "fin_Latn": "fi",  # finnish
    "fra_Latn": "fr",  # french
    "deu_Latn": "de",  # german
    "ell_Grek": "el",  # greek
    "hun_Latn": "hu",  # hungarian
    "isl_Latn": "is",  # icelandic
    "ita_Latn": "it",  # italian
    "lvs_Latn": "lv",  # latvian
    "lit_Latn": "lt",  # lithuanian
    "nob_Latn": "no",  # norwegian (bokmal)
    "pol_Latn": "pl",  # polish
    "por_Latn": "pt",  # portuguese
    "ron_Latn": "ro",  # romanian
    "rus_Cyrl": "ru",  # russian
    "slk_Latn": "sk",  # slovak
    "slv_Latn": "sl",  # slovenian
    "spa_Latn": "es",  # spanish
    "swe_Latn": "sv",  # swedish
    "tur_Latn": "tr",  # turkish
}
