diff --git a/common/baselines/tedlium2_v2/__init__.py b/common/baselines/tedlium2_v2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/baselines/tedlium2_v2/data.py b/common/baselines/tedlium2_v2/data.py new file mode 100644 index 000000000..ee09aec98 --- /dev/null +++ b/common/baselines/tedlium2_v2/data.py @@ -0,0 +1,42 @@ +from collections import defaultdict +from typing import Dict + +from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT +from i6_experiments.common.datasets.tedlium2_v2.corpus import get_corpus_object_dict +from i6_experiments.common.datasets.tedlium2_v2.lexicon import ( + get_g2p_augmented_bliss_lexicon, +) +from i6_experiments.common.setups.rasr.util import RasrDataInput + +from i6_experiments.common.setups.rasr.config.lex_config import ( + LexiconRasrConfig, +) +from i6_experiments.common.setups.rasr.config.lm_config import ArpaLmRasrConfig +from i6_experiments.common.baselines.tedlium2_v2.lm.ngram_config import run_tedlium2_ngram_lm + + +def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dict[str, Dict[str, RasrDataInput]]: + corpus_object_dict = get_corpus_object_dict(audio_format="wav", output_prefix="corpora") + + train_lexicon = LexiconRasrConfig( + get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping, output_prefix="lexicon" + ), + False, + ) + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + comb_lm = ArpaLmRasrConfig(lm_path=lm.ngram_lm) + + rasr_data_input_dict = defaultdict(dict) + + for name, crp_obj in corpus_object_dict.items(): + rasr_data_input_dict[name][name] = RasrDataInput( + corpus_object=crp_obj, + lexicon=train_lexicon.get_dict(), + concurrent=CONCURRENT[name], + lm=comb_lm.get_dict() if name == "dev" or name == "test" else None, + ) + + return rasr_data_input_dict diff --git 
a/common/baselines/tedlium2_v2/default_tools.py b/common/baselines/tedlium2_v2/default_tools.py new file mode 100644 index 000000000..42390b0c8 --- /dev/null +++ b/common/baselines/tedlium2_v2/default_tools.py @@ -0,0 +1,27 @@ +""" +List of default tools and software to be defined as default independent from hashing +by setting one explicit hash. + +In order to use different software paths without hash changes, just use the same explicit hash string as given here. + +If you want a stronger guarantee that you get the intended results, please consider using the explicit software +version listed here. Nevertheless, the most recent "head" should be safe to be used as well. +""" +from sisyphus import tk +from i6_experiments.common.tools.audio import compile_ffmpeg_binary +from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode +from i6_experiments.common.tools.sctk import compile_sctk + +RASR_BINARY_PATH = compile_rasr_binaries_i6mode( + branch="apptainer_tf_2_8", configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] +) # use most recent RASR +# RASR_BINARY_PATH = tk.Path("/work/asr4/rossenbach/neon_test/rasr_versions/rasr_no_tf/arch/linux-x86_64-standard/") +assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline" +RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" + + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" + +SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/") +SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH" diff --git a/common/baselines/tedlium2_v2/gmm/__init__.py b/common/baselines/tedlium2_v2/gmm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/baselines/tedlium2_v2/gmm/baseline_args.py b/common/baselines/tedlium2_v2/gmm/baseline_args.py new file mode 100644 index 000000000..b227cccd1 --- /dev/null +++ 
b/common/baselines/tedlium2_v2/gmm/baseline_args.py @@ -0,0 +1,423 @@ +from i6_core.features.filterbank import filter_width_from_channels +from i6_core import cart + +from i6_experiments.common.setups.rasr import util +from i6_experiments.common.datasets.tedlium2.cart import CartQuestions +from i6_experiments.common.baselines.librispeech.default_tools import SCTK_BINARY_PATH + + +def get_init_args(): + samples_options = { + "audio_format": "wav", + "dc_detection": False, + } + + am_args = { + "state_tying": "monophone", + "states_per_phone": 3, + "state_repetitions": 1, + "across_word_model": True, + "early_recombination": False, + "tdp_scale": 1.0, + "tdp_transition": (3.0, 0.0, "infinity", 0.0), + "tdp_silence": (0.0, 3.0, "infinity", 20.0), + "tying_type": "global", + "nonword_phones": "", + "tdp_nonword": ( + 0.0, + 3.0, + "infinity", + 6.0, + ), # only used when tying_type = global-and-nonword + } + + costa_args = {"eval_recordings": True, "eval_lm": False} + + feature_extraction_args = { + "mfcc": { + "num_deriv": 2, + "num_features": None, # confusing name: number of max features, above number -> clipped + "mfcc_options": { + "warping_function": "mel", + # to be compatible with our old magic number, we have to use 20 features + "filter_width": filter_width_from_channels(channels=20, warping_function="mel", f_max=8000), + "normalize": True, + "normalization_options": None, + "without_samples": False, + "samples_options": samples_options, + "cepstrum_options": { + "normalize": False, + "outputs": 16, # this is the actual output feature dimension + "add_epsilon": True, # when there is no dc-detection we can have log(0) otherwise + "epsilon": 1e-10, + }, + "fft_options": None, + "add_features_output": True, + }, + }, + "energy": { + "energy_options": { + "without_samples": False, + "samples_options": samples_options, + "fft_options": None, + } + }, + } + + scorer_args = {"sctk_binary_path": SCTK_BINARY_PATH} + + return util.RasrInitArgs( + costa_args=costa_args, 
+ am_args=am_args, + feature_extraction_args=feature_extraction_args, + scorer_args=scorer_args, + ) + + +def get_monophone_args(): + linear_alignment_args = { + "minimum_segment_length": 0, + "maximum_segment_length": 6000, + "iterations": 5, + "penalty": 0, + "minimum_speech_proportion": 0.7, + "save_alignment": False, + "keep_accumulators": False, + "extra_merge_args": None, + "extra_config": None, + "extra_post_config": None, + "use_corrected_applicator": False, + } + + monophone_training_args = { + "name": "mono", + "feature_flow": "mfcc+deriv+norm", + "feature_energy_flow_key": "energy,mfcc+deriv+norm", + "align_iter": 35, + "splits": 10, + "accs_per_split": 2, + "dump_alignment_score_report": True, + "use_corrected_applicator": False, + } + + monophone_recognition_args = { + # GmmSystem.recognition() args: + "iters": [10], + "lm_scales": [10], + "optimize_am_lm_scale": True, + # meta.System.recog() args: + "feature_flow": "mfcc+deriv+norm", + "pronunciation_scales": [1.0], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam-pruning": 15.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "parallelize_conversion": False, + "lattice_to_ctm_kwargs": {}, + "rtf": 20, + "mem": 4, + "use_gpu": False, + } + + return util.GmmMonophoneArgs(linear_alignment_args, monophone_training_args, monophone_recognition_args) + + +def get_cart_args( + max_leaves: int = 9001, + min_obs: int = 1000, + hmm_states: int = 3, + feature_flow: str = "mfcc+deriv+norm", +): + """ + + :param max_leaves: + :param min_obs: + :param hmm_states: + :param feature_flow: + :return: + """ + + cart_questions_class = CartQuestions( + max_leaves=max_leaves, + min_obs=min_obs, + ) + + cart_questions = cart.PythonCartQuestions( + phonemes=cart_questions_class.phonemes_boundary_silence, + steps=cart_questions_class.steps, + 
max_leaves=max_leaves, + hmm_states=hmm_states, + ) + + cart_lda_args = { + "name": "cart_mono", + "alignment": "train_mono", + "initial_flow_key": feature_flow, + "context_flow_key": feature_flow.split("+")[0], + "context_size": 9, + "num_dim": 48, + "num_iter": 2, + "eigenvalue_args": {}, + "generalized_eigenvalue_args": {"all": {"verification_tolerance": 1e16}}, + } + + return util.GmmCartArgs( + cart_questions=cart_questions, + cart_lda_args=cart_lda_args, + ) + + +def get_triphone_args(): + triphone_training_args = { + "name": "tri", + "initial_alignment": "train_mono", + "feature_flow": "mfcc+context+lda", + "splits": 10, + "accs_per_split": 2, + "align_extra_rqmt": {"mem": 8}, + "accumulate_extra_rqmt": {"mem": 8}, + "split_extra_rqmt": {"mem": 8}, + "use_corrected_applicator": False, + } + + triphone_recognition_args = { + "iters": [8, 10], + "feature_flow": "mfcc+context+lda", + "pronunciation_scales": [1.0], + "lm_scales": [25], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam_pruning": 15.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": False, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 20, + "mem": 4, + "parallelize_conversion": True, + } + + sdm_args = { + "name": "sdm.tri", + "alignment": "train_tri", + "feature_flow_key": "mfcc+context+lda", + } + + return util.GmmTriphoneArgs( + training_args=triphone_training_args, + recognition_args=triphone_recognition_args, + sdm_args=sdm_args, + ) + + +def get_vtln_args(): + vtln_training_args = { + "feature_flow": { + "name": "uncached_mfcc+context+lda", + "lda_matrix_key": "cart_mono", + "base_flow_key": "uncached_mfcc", + "context_size": 9, + }, + "warp_mix": { + "name": "tri", + "alignment": "train_tri", + "feature_scorer": 
"estimate_mixtures_sdm.tri", + "splits": 8, + "accs_per_split": 2, + }, + "train": { + "name": "vtln", + "initial_alignment_key": "train_tri", + "splits": 10, + "accs_per_split": 2, + "feature_flow": "mfcc+context+lda+vtln", + "align_extra_rqmt": {"mem": 8}, + "accumulate_extra_rqmt": {"mem": 8}, + "split_extra_rqmt": {"mem": 8}, + "use_corrected_applicator": False, + }, + } + + vtln_recognition_args = { + "iters": [8, 10], + "feature_flow": "uncached_mfcc+context+lda+vtln", + "pronunciation_scales": [1.0], + "lm_scales": [25], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam_pruning": 15.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": False, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 20, + "mem": 4, + "parallelize_conversion": True, + } + + sdm_args = { + "name": "sdm.vtln", + "alignment": "train_vtln", + "feature_flow_key": "mfcc+context+lda+vtln", + } + + return util.GmmVtlnArgs( + training_args=vtln_training_args, + recognition_args=vtln_recognition_args, + sdm_args=sdm_args, + ) + + +def get_sat_args(): + sat_training_args = { + "name": "sat", + "mixtures": "estimate_mixtures_sdm.tri", + "alignment": "train_tri", + "feature_cache": "mfcc", + "feature_flow_key": "mfcc+context+lda", + "cache_regex": "^mfcc.*$", + "splits": 10, + "accs_per_split": 2, + "align_extra_rqmt": {"mem": 8}, + "accumulate_extra_rqmt": {"mem": 8}, + "split_extra_rqmt": {"mem": 8}, + "use_corrected_applicator": False, + } + + sat_recognition_args = { + "prev_ctm": util.PrevCtm( + prev_step_key="tri", + pronunciation_scale=1.0, + lm_scale=25, + iteration=10, + optimized_lm=True, + ), + "feature_cache": "mfcc", + "cache_regex": "^mfcc.*$", + "cmllr_mixtures": "estimate_mixtures_sdm.tri", + "iters": [8, 10], + "feature_flow": 
"uncached_mfcc+context+lda", + "pronunciation_scales": [1.0], + "lm_scales": [25], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam_pruning": 15.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": False, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 20, + "mem": 4, + "parallelize_conversion": True, + } + + sdm_args = { + "name": "sdm.sat", + "alignment": "train_sat", + "feature_flow_key": "mfcc+context+lda+cmllr", + } + + return util.GmmSatArgs( + training_args=sat_training_args, + recognition_args=sat_recognition_args, + sdm_args=sdm_args, + ) + + +def get_vtln_sat_args(): + vtln_sat_training_args = { + "name": "vtln+sat", + "mixtures": "estimate_mixtures_sdm.vtln", + "alignment": "train_vtln", + "feature_cache": "mfcc+context+lda+vtln", + "feature_flow_key": "mfcc+context+lda+vtln", + "cache_regex": "^.*\\+vtln$", + "splits": 10, + "accs_per_split": 2, + "align_extra_rqmt": {"mem": 8}, + "accumulate_extra_rqmt": {"mem": 8}, + "split_extra_rqmt": {"mem": 8}, + "use_corrected_applicator": False, + } + + vtln_sat_recognition_args = { + "prev_ctm": util.PrevCtm( + prev_step_key="vtln", + pronunciation_scale=1.0, + lm_scale=25, + iteration=10, + optimized_lm=True, + ), + "feature_cache": "mfcc", + "cache_regex": "^mfcc.*$", + "cmllr_mixtures": "estimate_mixtures_sdm.vtln", + "iters": [8, 10], + "feature_flow": "uncached_mfcc+context+lda+vtln", + "pronunciation_scales": [1.0], + "lm_scales": [25], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam_pruning": 15.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + 
"fill_empty_segments": False, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 20, + "mem": 4, + "parallelize_conversion": True, + } + + sdm_args = { + "name": "sdm.vtln+sat", + "alignment": "train_vtln+sat", + "feature_flow_key": "mfcc+context+lda+vtln+cmllr", + } + + return util.GmmVtlnSatArgs( + training_args=vtln_sat_training_args, + recognition_args=vtln_sat_recognition_args, + sdm_args=sdm_args, + ) diff --git a/common/baselines/tedlium2_v2/gmm/baseline_config.py b/common/baselines/tedlium2_v2/gmm/baseline_config.py new file mode 100644 index 000000000..fc462fc28 --- /dev/null +++ b/common/baselines/tedlium2_v2/gmm/baseline_config.py @@ -0,0 +1,55 @@ +from sisyphus import gs + +from i6_experiments.common.setups.rasr import gmm_system +from i6_experiments.common.setups.rasr.util import RasrSteps, OutputArgs + +from i6_experiments.common.baselines.tedlium2_v2.gmm import baseline_args +from i6_experiments.common.baselines.tedlium2_v2.data import get_corpus_data_inputs + +from ..default_tools import RASR_BINARY_PATH + + +def run_tedlium2_common_baseline( + alias_prefix="baselines/tedlium2/gmm/common_baseline", +): + stored_alias_subdir = gs.ALIAS_AND_OUTPUT_SUBDIR + gs.ALIAS_AND_OUTPUT_SUBDIR = alias_prefix + + rasr_init_args = baseline_args.get_init_args() + mono_args = baseline_args.get_monophone_args() + cart_args = baseline_args.get_cart_args() + tri_args = baseline_args.get_triphone_args() + vtln_args = baseline_args.get_vtln_args() + sat_args = baseline_args.get_sat_args() + vtln_sat_args = baseline_args.get_vtln_sat_args() + + final_output_args = OutputArgs("final") + final_output_args.define_corpus_type("train", "train") + final_output_args.define_corpus_type("dev", "dev") + final_output_args.define_corpus_type("test", "test") + # final_output_args.add_feature_to_extract("gt") + + steps = RasrSteps() + steps.add_step("extract", rasr_init_args.feature_extraction_args) + steps.add_step("mono", mono_args) + 
steps.add_step("cart", cart_args) + steps.add_step("tri", tri_args) + steps.add_step("vtln", vtln_args) + steps.add_step("sat", sat_args) + steps.add_step("vtln+sat", vtln_sat_args) + steps.add_step("output", final_output_args) + + corpus_data = get_corpus_data_inputs() + + system = gmm_system.GmmSystem(rasr_binary_path=RASR_BINARY_PATH) + system.init_system( + rasr_init_args=rasr_init_args, + train_data=corpus_data["train"], + dev_data=corpus_data["dev"], + test_data={}, # corpus_data["test"], + ) + system.run(steps) + + gs.ALIAS_AND_OUTPUT_SUBDIR = stored_alias_subdir + + return system diff --git a/common/baselines/tedlium2_v2/lm/__init__.py b/common/baselines/tedlium2_v2/lm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/baselines/tedlium2_v2/lm/ngram_config.py b/common/baselines/tedlium2_v2/lm/ngram_config.py new file mode 100644 index 000000000..f6b3b3c82 --- /dev/null +++ b/common/baselines/tedlium2_v2/lm/ngram_config.py @@ -0,0 +1,62 @@ +from sisyphus import gs + +from i6_core.corpus.convert import CorpusToTxtJob +from i6_core.lexicon.conversion import LexiconToWordListJob + +from i6_experiments.common.datasets.tedlium2_v2.corpus import get_bliss_corpus_dict +from i6_experiments.common.datasets.tedlium2_v2.lexicon import ( + get_g2p_augmented_bliss_lexicon, +) +from i6_experiments.common.datasets.tedlium2_v2.textual_data import get_text_data_dict +from i6_experiments.common.baselines.tedlium2_v2.default_tools import SRILM_PATH + +from i6_experiments.common.setups.lm.srilm_system import SriLmSystem + + +def run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping: bool = False, alias_prefix="baselines/tedlium2/lm/ngram"): + stored_alias_subdir = gs.ALIAS_AND_OUTPUT_SUBDIR + gs.ALIAS_AND_OUTPUT_SUBDIR = alias_prefix + + dev_data = CorpusToTxtJob(get_bliss_corpus_dict(audio_format="wav", output_prefix="corpora")["dev"]).out_txt + test_data = CorpusToTxtJob(get_bliss_corpus_dict(audio_format="wav", 
output_prefix="corpora")["test"]).out_txt + + train_data_dict = get_text_data_dict() + dev_data_dict = {"dev": dev_data} + test_data_dict = { + "dev": dev_data, + "test": test_data, + } + + vocab = LexiconToWordListJob( + get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping, output_prefix="lexicon" + ) + ).out_word_list + + ngram_system = SriLmSystem( + name="tedlium2", + train_data=train_data_dict, + dev_data=dev_data_dict, + eval_data=test_data_dict, + ngram_order=[3, 4, 5], + vocab=vocab, + ngram_args=[ + "-gt1min 1", + "-gt2min 1", + "-gt3min 1", + "-gt4min 1", + "-gt5min 1", + "-gt6min 1", + "-interpolate", + "-kndiscount", + ], + perplexity_args="-debug 2", + srilm_path=SRILM_PATH, + ngram_rqmt=None, + perplexity_rqmt=None, + mail_address=gs.MAIL_ADDRESS, + ) + ngram_system.run_training() + + gs.ALIAS_AND_OUTPUT_SUBDIR = stored_alias_subdir + return ngram_system diff --git a/common/datasets/tedlium2_v2/corpus.py b/common/datasets/tedlium2_v2/corpus.py new file mode 100644 index 000000000..f74a7acbf --- /dev/null +++ b/common/datasets/tedlium2_v2/corpus.py @@ -0,0 +1,136 @@ +import os +from functools import lru_cache +from typing import Dict, Optional, Any + +from sisyphus import tk + +from i6_core.audio.encoding import BlissChangeEncodingJob + +from i6_core.meta import CorpusObject + +from ..tedlium2.constants import DURATIONS +from .download import download_data_dict + + +@lru_cache() +def get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + creates a dictionary of all corpora in the TedLiumV2 dataset in the bliss xml format + + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
+ :param output_prefix: + :return: + """ + assert audio_format in ["flac", "ogg", "wav", "sph", "nist"] + + output_prefix = os.path.join(output_prefix, "Ted-Lium-2") + + bliss_corpus_dict = download_data_dict(output_prefix=output_prefix).bliss_nist + + audio_format_options = { + "wav": { + "output_format": "wav", + "codec": "pcm_s16le", + }, + "ogg": {"output_format": "ogg", "codec": "libvorbis"}, + "flac": {"output_format": "flac", "codec": "flac"}, + } + + converted_bliss_corpus_dict = {} + if audio_format not in ["sph", "nist"]: + for corpus_name, sph_corpus in bliss_corpus_dict.items(): + bliss_change_encoding_job = BlissChangeEncodingJob( + corpus_file=sph_corpus, + sample_rate=16000, + recover_duration=False, + **audio_format_options[audio_format], + ) + bliss_change_encoding_job.add_alias( + os.path.join( + output_prefix, + "%s_conversion" % audio_format, + corpus_name, + ) + ) + converted_bliss_corpus_dict[corpus_name] = bliss_change_encoding_job.out_corpus + else: + converted_bliss_corpus_dict = bliss_corpus_dict + + return converted_bliss_corpus_dict + + +@lru_cache() +def get_corpus_object_dict(audio_format: str = "flac", output_prefix: str = "datasets") -> Dict[str, CorpusObject]: + """ + creates a dict of all corpora in the TedLiumV2 dataset as a `meta.CorpusObject` + + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
+ :param output_prefix: + :return: + """ + bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) + + corpus_object_dict = {} + + for corpus_name, bliss_corpus in bliss_corpus_dict.items(): + corpus_object = CorpusObject() + corpus_object.corpus_file = bliss_corpus + corpus_object.audio_format = audio_format + corpus_object.audio_dir = None + corpus_object.duration = DURATIONS[corpus_name] + + corpus_object_dict[corpus_name] = corpus_object + + return corpus_object_dict + + +@lru_cache() +def get_stm_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + fetches the STM files for TedLiumV2 dataset + + :param output_prefix: + :return: + """ + return download_data_dict(output_prefix=output_prefix).stm + + +def get_ogg_zip_dict( + subdir_prefix: str = "datasets", + returnn_python_exe: Optional[tk.Path] = None, + returnn_root: Optional[tk.Path] = None, + bliss_to_ogg_job_rqmt: Optional[Dict[str, Any]] = None, + extra_args: Optional[Dict[str, Dict[str, Any]]] = None, +) -> Dict[str, tk.Path]: + """ + Get a dictionary containing the paths to the ogg_zip for each corpus part. + + No outputs will be registered. 
+ + :param subdir_prefix: dir name prefix for aliases and outputs + :param returnn_python_exe: path to returnn python executable + :param returnn_root: python to returnn root + :param bliss_to_ogg_job_rqmt: rqmt for bliss to ogg job + :param extra_args: extra args for each dataset for bliss to ogg job + :return: dictionary with ogg zip paths for each corpus (train, dev, test) + """ + from i6_core.returnn.oggzip import BlissToOggZipJob + + ogg_zip_dict = {} + bliss_corpus_dict = get_bliss_corpus_dict(audio_format="wav", output_prefix=subdir_prefix) + if extra_args is None: + extra_args = {} + for name, bliss_corpus in bliss_corpus_dict.items(): + ogg_zip_job = BlissToOggZipJob( + bliss_corpus, + no_conversion=False, # cannot be used for corpus with multiple segments per recording + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + **extra_args.get(name, {}), + ) + if bliss_to_ogg_job_rqmt: + ogg_zip_job.rqmt = bliss_to_ogg_job_rqmt + ogg_zip_job.add_alias(os.path.join(subdir_prefix, "Ted-Lium-2", "%s_ogg_zip_job" % name)) + ogg_zip_dict[name] = ogg_zip_job.out_ogg_zip + + return ogg_zip_dict diff --git a/common/datasets/tedlium2_v2/download.py b/common/datasets/tedlium2_v2/download.py new file mode 100644 index 000000000..948224ae7 --- /dev/null +++ b/common/datasets/tedlium2_v2/download.py @@ -0,0 +1,48 @@ +import os +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Dict + +from sisyphus import tk + +from i6_core.datasets.tedlium2 import ( + DownloadTEDLIUM2CorpusJob, + CreateTEDLIUM2BlissCorpusJobV2, +) + + +@dataclass(frozen=True) +class TedLium2Data: + """Class for storing the TedLium2 data""" + + data_dir: Dict[str, tk.Path] + lm_dir: tk.Path + vocab: tk.Path + bliss_nist: Dict[str, tk.Path] + stm: Dict[str, tk.Path] + + +@lru_cache() +def download_data_dict(output_prefix: str = "datasets") -> TedLium2Data: + """ + downloads the TedLiumV2 dataset and performs the initial data processing steps + Uses 
the fixed job CreateTEDLIUM2BlissCorpusJobV2 from: https://github.com/rwth-i6/i6_core/pull/490 + + :param output_prefix: + :return: + """ + download_tedlium2_job = DownloadTEDLIUM2CorpusJob() + download_tedlium2_job.add_alias(os.path.join(output_prefix, "download", "raw_corpus_job")) + + bliss_corpus_tedlium2_job = CreateTEDLIUM2BlissCorpusJobV2(download_tedlium2_job.out_corpus_folders) + bliss_corpus_tedlium2_job.add_alias(os.path.join(output_prefix, "create_bliss", "bliss_corpus_job")) + + tl2_data = TedLium2Data( + data_dir=download_tedlium2_job.out_corpus_folders, + lm_dir=download_tedlium2_job.out_lm_folder, + vocab=download_tedlium2_job.out_vocab_dict, + bliss_nist=bliss_corpus_tedlium2_job.out_corpus_files, + stm=bliss_corpus_tedlium2_job.out_stm_files, + ) + + return tl2_data diff --git a/common/datasets/tedlium2_v2/export.py b/common/datasets/tedlium2_v2/export.py new file mode 100644 index 000000000..1919fa8c0 --- /dev/null +++ b/common/datasets/tedlium2_v2/export.py @@ -0,0 +1,96 @@ +import os + +from sisyphus import tk + +from .corpus import get_bliss_corpus_dict, get_stm_dict +from .lexicon import get_bliss_lexicon, get_g2p_augmented_bliss_lexicon +from .textual_data import get_text_data_dict + +TEDLIUM_PREFIX = "Ted-Lium-2" + + +def _export_datasets(output_prefix: str = "datasets"): + """ + exports all datasets for TedLiumV2 with all available audio formats + + :param output_prefix: + :return: + """ + for audio_format in ["flac", "ogg", "wav", "nist", "sph"]: + bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) + for name, bliss_corpus in bliss_corpus_dict.items(): + tk.register_output( + os.path.join( + output_prefix, + TEDLIUM_PREFIX, + "corpus", + f"{name}-{audio_format}.xml.gz", + ), + bliss_corpus, + ) + + +def _export_stms(output_prefix: str = "datasets"): + """ + exports all STMs for TedLiumV2 + + :param output_prefix: + :return: + """ + stm_dict = get_stm_dict(output_prefix=output_prefix) + for 
name, stm_file in stm_dict.items(): + tk.register_output( + os.path.join( + output_prefix, + TEDLIUM_PREFIX, + "stm", + f"{name}.txt", + ), + stm_file, + ) + + +def _export_text_data(output_prefix: str = "datasets"): + """ + exports all the textual data for TedLiumV2 dataset + + :param output_prefix: + :return: + """ + txt_data_dict = get_text_data_dict(output_prefix=output_prefix) + for k, v in txt_data_dict.items(): + tk.register_output(os.path.join(output_prefix, TEDLIUM_PREFIX, "text_data", f"{k}.gz"), v) + + +def _export_lexicon(output_prefix: str = "datasets"): + """ + exports the lexicon for TedLiumV2 + + :param output_prefix: + :return: + """ + lexicon_output_prefix = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon") + + bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix) + tk.register_output(os.path.join(lexicon_output_prefix, "tedlium2.lexicon.xml.gz"), bliss_lexicon) + + g2p_bliss_lexicon = get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, output_prefix=output_prefix + ) + tk.register_output( + os.path.join(lexicon_output_prefix, "tedlium2.lexicon_with_g2p.xml.gz"), + g2p_bliss_lexicon, + ) + + +def export_all(output_prefix: str = "datasets"): + """ + exports everything for TedLiumV2 + + :param output_prefix: + :return: + """ + _export_datasets(output_prefix=output_prefix) + _export_stms(output_prefix=output_prefix) + _export_text_data(output_prefix=output_prefix) + _export_lexicon(output_prefix=output_prefix) diff --git a/common/datasets/tedlium2_v2/lexicon.py b/common/datasets/tedlium2_v2/lexicon.py new file mode 100644 index 000000000..4d8366155 --- /dev/null +++ b/common/datasets/tedlium2_v2/lexicon.py @@ -0,0 +1,171 @@ +import os +from functools import lru_cache +from sisyphus import tk + +from i6_core.lexicon import LexiconFromTextFileJob +from i6_core.lexicon.modification import WriteLexiconJob, MergeLexiconJob +from i6_core.lib import lexicon +from i6_experiments.common.helpers.g2p import 
G2PBasedOovAugmenter + +from ..tedlium2.constants import SILENCE_PHONEME, UNKNOWN_PHONEME +from .corpus import get_bliss_corpus_dict +from .download import download_data_dict + + +@lru_cache() +def _get_special_lemma_lexicon( + add_unknown_phoneme_and_mapping: bool = False, + add_silence: bool = True, +) -> lexicon.Lexicon: + """ + creates the special lemma used in RASR + + :param add_unknown_phoneme_and_mapping: adds [unknown] as label with [UNK] as phoneme and as LM token + :param add_silence: adds [silence] label with [SILENCE] phoneme, + use False for CTC/RNN-T setups without silence modelling. + :return: + """ + lex = lexicon.Lexicon() + if add_silence: + lex.add_lemma( + lexicon.Lemma( + orth=["[silence]", ""], + phon=[SILENCE_PHONEME], + synt=[], + special="silence", + eval=[[]], + ) + ) + if add_unknown_phoneme_and_mapping: + lex.add_lemma( + lexicon.Lemma( + orth=["[unknown]"], + phon=[UNKNOWN_PHONEME], + synt=[""], + special="unknown", + eval=[[]], + ) + ) + else: + lex.add_lemma( + lexicon.Lemma( + orth=["[unknown]"], + synt=[""], + special="unknown", + eval=[[]], + ) + ) + + lex.add_lemma( + lexicon.Lemma( + orth=["[sentence-begin]"], + synt=[""], + special="sentence-begin", + eval=[[]], + ) + ) + lex.add_lemma( + lexicon.Lemma( + orth=["[sentence-end]"], + synt=[""], + special="sentence-end", + eval=[[]], + ) + ) + if add_silence: + lex.add_phoneme(SILENCE_PHONEME, variation="none") + if add_unknown_phoneme_and_mapping: + lex.add_phoneme(UNKNOWN_PHONEME, variation="none") + + return lex + + +@lru_cache() +def _get_raw_bliss_lexicon( + output_prefix: str, +) -> tk.Path: + """ + downloads the vocabulary file from the TedLiumV2 dataset and creates a bliss lexicon + + :param output_prefix: + :return: + """ + vocab = download_data_dict(output_prefix=output_prefix).vocab + + convert_lexicon_job = LexiconFromTextFileJob( + text_file=vocab, + compressed=True, + ) + convert_lexicon_job.add_alias(os.path.join(output_prefix, "convert_text_to_bliss_lexicon_job")) 
+ + return convert_lexicon_job.out_bliss_lexicon + + +@lru_cache() +def get_bliss_lexicon( + add_unknown_phoneme_and_mapping: bool = True, + add_silence: bool = True, + output_prefix: str = "datasets", +) -> tk.Path: + """ + merges the lexicon with special RASR tokens with the lexicon created from the downloaded TedLiumV2 vocabulary + + :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma + :param add_silence: include silence lemma and phoneme + :param output_prefix: + :return: + """ + static_lexicon = _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping, add_silence) + static_lexicon_job = WriteLexiconJob(static_lexicon, sort_phonemes=True, sort_lemmata=False) + static_lexicon_job.add_alias(os.path.join(output_prefix, "static_lexicon_job")) + + raw_tedlium2_lexicon = _get_raw_bliss_lexicon(output_prefix=output_prefix) + + merge_lexicon_job = MergeLexiconJob( + bliss_lexica=[ + static_lexicon_job.out_bliss_lexicon, + raw_tedlium2_lexicon, + ], + sort_phonemes=True, + sort_lemmata=True, + compressed=True, + ) + merge_lexicon_job.add_alias(os.path.join(output_prefix, "merge_lexicon_job")) + + return merge_lexicon_job.out_bliss_lexicon + + +@lru_cache() +def get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping: bool = False, + add_silence: bool = True, + audio_format: str = "wav", + output_prefix: str = "datasets", +) -> tk.Path: + """ + augment the kernel lexicon with unknown words from the training corpus + + :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma + :param add_silence: include silence lemma and phoneme + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
+ :param output_prefix: + :return: + """ + original_bliss_lexicon = get_bliss_lexicon( + add_unknown_phoneme_and_mapping, add_silence=add_silence, output_prefix=output_prefix + ) + corpus_name = "train" + bliss_corpus = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)[corpus_name] + + g2p_augmenter = G2PBasedOovAugmenter( + original_bliss_lexicon=original_bliss_lexicon, + train_lexicon=original_bliss_lexicon, + ) + augmented_bliss_lexicon = g2p_augmenter.get_g2p_augmented_bliss_lexicon( + bliss_corpus=bliss_corpus, + corpus_name=corpus_name, + alias_path=os.path.join(output_prefix, "g2p"), + casing="lower", + ) + + return augmented_bliss_lexicon diff --git a/common/datasets/tedlium2_v2/textual_data.py b/common/datasets/tedlium2_v2/textual_data.py new file mode 100644 index 000000000..553489a0d --- /dev/null +++ b/common/datasets/tedlium2_v2/textual_data.py @@ -0,0 +1,39 @@ +from functools import lru_cache +from typing import Dict + +from sisyphus import tk + +from i6_core.corpus import CorpusToTxtJob +from i6_core.text import ConcatenateJob + +from .corpus import get_bliss_corpus_dict + +from .download import download_data_dict + + +@lru_cache() +def get_text_data_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + gather all the textual data provided within the TedLiumV2 dataset + + :param output_prefix: + :return: + """ + lm_dir = download_data_dict(output_prefix=output_prefix).lm_dir + + text_corpora = [ + "commoncrawl-9pc", + "europarl-v7-6pc", + "giga-fren-4pc", + "news-18pc", + "news-commentary-v8-9pc", + "yandex-1m-31pc", + ] + + txt_dict = {name: lm_dir.join_right("%s.en.gz" % name) for name in text_corpora} + txt_dict["audio-transcriptions"] = CorpusToTxtJob( + get_bliss_corpus_dict(audio_format="wav", output_prefix="corpora")["train"] + ).out_txt + txt_dict["background-data"] = ConcatenateJob(list(txt_dict.values())).out + + return txt_dict diff --git 
a/common/datasets/tedlium2_v2/vocab.py b/common/datasets/tedlium2_v2/vocab.py new file mode 100644 index 000000000..14d4455f5 --- /dev/null +++ b/common/datasets/tedlium2_v2/vocab.py @@ -0,0 +1,51 @@ +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import ( + get_returnn_subword_nmt, + get_bpe_settings, + BPESettings, +) +from .corpus import get_bliss_corpus_dict + + +def get_subword_nmt_bpe(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: + """ + Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. + + :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! + :param unk_label: unknown label symbol + :param subdir_prefix: dir name prefix for aliases and outputs + """ + subword_nmt_repo = get_returnn_subword_nmt(output_prefix=subdir_prefix) + train_corpus = get_bliss_corpus_dict()["train"] + bpe_settings = get_bpe_settings( + train_corpus, + bpe_size=bpe_size, + unk_label=unk_label, + output_prefix=subdir_prefix, + subword_nmt_repo_path=subword_nmt_repo, + ) + return bpe_settings + + +def get_subword_nmt_bpe_v2(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: + """ + Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. + + V2: Uses subword-nmt version corrected for Apptainer related bug, adds hash overwrite for repo + + :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! 
+ :param unk_label: unknown label symbol + :param subdir_prefix: dir name prefix for aliases and outputs + """ + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=subdir_prefix + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + train_corpus = get_bliss_corpus_dict()["train"] + bpe_settings = get_bpe_settings( + train_corpus, + bpe_size=bpe_size, + unk_label=unk_label, + output_prefix=subdir_prefix, + subword_nmt_repo_path=subword_nmt_repo, + ) + return bpe_settings