|
| 1 | +from collections import defaultdict |
| 2 | +from typing import Dict |
| 3 | + |
| 4 | +from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT |
| 5 | +from i6_experiments.common.datasets.tedlium2_v2.corpus import get_corpus_object_dict |
| 6 | +from i6_experiments.common.datasets.tedlium2_v2.lexicon import ( |
| 7 | + get_g2p_augmented_bliss_lexicon, |
| 8 | +) |
| 9 | +from i6_experiments.common.setups.rasr.util import RasrDataInput |
| 10 | + |
| 11 | +from i6_experiments.common.setups.rasr.config.lex_config import ( |
| 12 | + LexiconRasrConfig, |
| 13 | +) |
| 14 | +from i6_experiments.common.setups.rasr.config.lm_config import ArpaLmRasrConfig |
| 15 | +from i6_experiments.common.baselines.tedlium2_v2.lm.ngram_config import run_tedlium2_ngram_lm |
| 16 | + |
| 17 | + |
def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dict[str, Dict[str, RasrDataInput]]:
    """
    Assemble one RasrDataInput per corpus returned by ``get_corpus_object_dict``.

    Every corpus gets the same G2P-augmented bliss lexicon configuration. The
    "dev" and "test" corpora additionally receive the interpolated
    "dev-pruned" 4-gram ARPA LM config; all other corpora get ``lm=None``.

    :param add_unknown_phoneme_and_mapping: forwarded unchanged to both the
        lexicon creation and the n-gram LM pipeline
    :return: nested mapping ``{corpus_name: {corpus_name: RasrDataInput}}``,
        i.e. outer and inner key are the same corpus name. The returned object
        is a ``defaultdict(dict)``.
    """
    corpora = get_corpus_object_dict(audio_format="wav", output_prefix="corpora")

    bliss_lexicon = get_g2p_augmented_bliss_lexicon(
        add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping,
        output_prefix="lexicon",
    )
    # NOTE(review): the second positional argument is passed as False exactly as
    # before; its meaning is defined by LexiconRasrConfig - confirm there.
    lexicon_config = LexiconRasrConfig(bliss_lexicon, False)

    ngram_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping)
    four_gram = ngram_system.interpolated_lms["dev-pruned"]["4gram"]
    lm_config = ArpaLmRasrConfig(lm_path=four_gram.ngram_lm)

    data_inputs = defaultdict(dict)

    for corpus_name, corpus_object in corpora.items():
        # Only the evaluation corpora carry a language model.
        if corpus_name == "dev" or corpus_name == "test":
            lm_dict = lm_config.get_dict()
        else:
            lm_dict = None

        entry = RasrDataInput(
            corpus_object=corpus_object,
            lexicon=lexicon_config.get_dict(),
            concurrent=CONCURRENT[corpus_name],
            lm=lm_dict,
        )
        data_inputs[corpus_name][corpus_name] = entry

    return data_inputs
0 commit comments