From ce3fb26dc8f4c0eab10145a90974dd7c4b1a792d Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:06:00 +0200 Subject: [PATCH 01/26] initial hybrid baseline commit --- common/baselines/tedlium2/data.py | 10 + common/baselines/tedlium2/default_tools.py | 36 +- .../baselines/tedlium2/gmm/baseline_args.py | 14 +- .../baselines/tedlium2/gmm/baseline_config.py | 1 + common/baselines/tedlium2/hybrid/__init__.py | 0 .../tedlium2/hybrid/baseline_args.py | 51 ++ .../tedlium2/hybrid/baseline_config.py | 76 ++ common/baselines/tedlium2/hybrid/data.py | 270 ++++++ .../tedlium2/hybrid/nn_config/experiment.py | 17 + .../tedlium2/hybrid/nn_config/helper.py | 86 ++ .../tedlium2/hybrid/nn_config/nn_args.py | 91 ++ .../tedlium2/hybrid/nn_config/nn_setup.py | 854 ++++++++++++++++++ .../hybrid/nn_config/spec_augment_mask.py | 131 +++ common/setups/rasr/gmm_system.py | 37 +- common/setups/rasr/hybrid_decoder.py | 250 ++++- common/setups/rasr/hybrid_system.py | 113 ++- common/setups/rasr/nn_system.py | 56 +- common/setups/rasr/rasr_system.py | 13 +- common/setups/rasr/util/nn/__init__.py | 4 + common/setups/rasr/util/nn/common.py | 53 ++ common/setups/rasr/util/nn/data.py | 615 +++++++++++++ common/setups/rasr/util/nn/decode.py | 88 ++ common/setups/rasr/util/nn/training.py | 49 + common/tools/sctk.py | 13 +- 24 files changed, 2840 insertions(+), 88 deletions(-) create mode 100644 common/baselines/tedlium2/hybrid/__init__.py create mode 100644 common/baselines/tedlium2/hybrid/baseline_args.py create mode 100644 common/baselines/tedlium2/hybrid/baseline_config.py create mode 100644 common/baselines/tedlium2/hybrid/data.py create mode 100644 common/baselines/tedlium2/hybrid/nn_config/experiment.py create mode 100644 common/baselines/tedlium2/hybrid/nn_config/helper.py create mode 100644 common/baselines/tedlium2/hybrid/nn_config/nn_args.py create mode 100644 common/baselines/tedlium2/hybrid/nn_config/nn_setup.py create mode 100644 
common/baselines/tedlium2/hybrid/nn_config/spec_augment_mask.py create mode 100644 common/setups/rasr/util/nn/__init__.py create mode 100644 common/setups/rasr/util/nn/common.py create mode 100644 common/setups/rasr/util/nn/data.py create mode 100644 common/setups/rasr/util/nn/decode.py create mode 100644 common/setups/rasr/util/nn/training.py diff --git a/common/baselines/tedlium2/data.py b/common/baselines/tedlium2/data.py index 92daa180c..e75475746 100644 --- a/common/baselines/tedlium2/data.py +++ b/common/baselines/tedlium2/data.py @@ -1,6 +1,7 @@ from collections import defaultdict from typing import Dict +from sisyphus import tk from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT from i6_experiments.common.datasets.tedlium2.corpus import get_corpus_object_dict from i6_experiments.common.datasets.tedlium2.lexicon import ( @@ -28,6 +29,9 @@ def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dic lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping) lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] comb_lm = ArpaLmRasrConfig(lm_path=lm.ngram_lm) + kaldi_small_lm = ArpaLmRasrConfig( + lm_path=tk.Path("/work/asr3/zhou/kaldi/egs/tedlium/s5_r2/data/local/local_lm/data/arpa/4gram_small.arpa.gz") + ) rasr_data_input_dict = defaultdict(dict) @@ -38,5 +42,11 @@ def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dic concurrent=CONCURRENT[name], lm=comb_lm.get_dict() if name == "dev" or name == "test" else None, ) + rasr_data_input_dict["dev"]["dev_kaldi_small_4_gram"] = RasrDataInput( + corpus_object=crp_obj, + lexicon=train_lexicon.get_dict(), + concurrent=CONCURRENT[name], + lm=kaldi_small_lm.get_dict(), + ) return rasr_data_input_dict diff --git a/common/baselines/tedlium2/default_tools.py b/common/baselines/tedlium2/default_tools.py index 347f9ffbe..0e99b9b55 100644 --- a/common/baselines/tedlium2/default_tools.py +++ 
b/common/baselines/tedlium2/default_tools.py @@ -11,17 +11,39 @@ from i6_experiments.common.tools.audio import compile_ffmpeg_binary from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode from i6_experiments.common.tools.sctk import compile_sctk +from i6_core.tools.git import CloneGitRepositoryJob -RASR_BINARY_PATH = compile_rasr_binaries_i6mode( - branch="apptainer_tf_2_8", configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] -) # use most recent RASR -# RASR_BINARY_PATH = tk.Path("/work/asr4/rossenbach/neon_test/rasr_versions/rasr_no_tf/arch/linux-x86_64-standard/") -assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline" -# RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" +PACKAGE = __package__ +# RASR_BINARY_PATH = compile_rasr_binaries_i6mode( +# branch="apptainer_tf_2_8", configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] +# ) # use most recent RASR +# RASR_BINARY_PATH = compile_rasr_binaries_i6mode( +# branch="apptainer_tf_2_8", +# configure_options=["--apptainer-setup=2023-05-08_tensorflow-2.8_v1"], +# commit="5e7adf5034dbafac90caf0e50b5bfd6410c98d5e", +# ) # use most recent RASR +# assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline" +RASR_BINARY_PATH = tk.Path( + "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/onnx_extended_rasr/arch/linux-x86_64-standard" +) +RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" -SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +SCTK_BINARY_PATH = compile_sctk() # use last published version SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" +SCTK_BINARY_PATH2 = compile_sctk(alias="wei_u16_sctk") # use last published version, HACK to have u16 compiled + SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/") SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH" + +RETURNN_EXE 
= tk.Path( + "/usr/bin/python3", + hash_overwrite="GENERIC_RETURNN_LAUNCHER", +) + +RETURNN_RC_ROOT = CloneGitRepositoryJob( + "https://github.com/rwth-i6/returnn", + commit="d7689b945b2fe781b3c79fbef9d82f018c7b11e8", +).out_repository +RETURNN_RC_ROOT.hash_overwrite = "TEDLIUM2_DEFAULT_RETURNN_RC_ROOT" diff --git a/common/baselines/tedlium2/gmm/baseline_args.py b/common/baselines/tedlium2/gmm/baseline_args.py index b227cccd1..927d3d466 100644 --- a/common/baselines/tedlium2/gmm/baseline_args.py +++ b/common/baselines/tedlium2/gmm/baseline_args.py @@ -3,7 +3,7 @@ from i6_experiments.common.setups.rasr import util from i6_experiments.common.datasets.tedlium2.cart import CartQuestions -from i6_experiments.common.baselines.librispeech.default_tools import SCTK_BINARY_PATH +from i6_experiments.common.baselines.tedlium2.default_tools import SCTK_BINARY_PATH def get_init_args(): @@ -320,10 +320,10 @@ def get_sat_args(): "feature_cache": "mfcc", "cache_regex": "^mfcc.*$", "cmllr_mixtures": "estimate_mixtures_sdm.tri", - "iters": [8, 10], + "iters": [8, 9, 10], "feature_flow": "uncached_mfcc+context+lda", - "pronunciation_scales": [1.0], - "lm_scales": [25], + "pronunciation_scales": [1.0, 0.0], + "lm_scales": [25, 8.0, 20], "lm_lookahead": True, "lookahead_options": None, "create_lattice": True, @@ -385,10 +385,10 @@ def get_vtln_sat_args(): "feature_cache": "mfcc", "cache_regex": "^mfcc.*$", "cmllr_mixtures": "estimate_mixtures_sdm.vtln", - "iters": [8, 10], + "iters": [8, 9, 10], "feature_flow": "uncached_mfcc+context+lda+vtln", - "pronunciation_scales": [1.0], - "lm_scales": [25], + "pronunciation_scales": [1.0, 0.0], + "lm_scales": [25, 8.0, 20], "lm_lookahead": True, "lookahead_options": None, "create_lattice": True, diff --git a/common/baselines/tedlium2/gmm/baseline_config.py b/common/baselines/tedlium2/gmm/baseline_config.py index ebe5e4256..257363d82 100644 --- a/common/baselines/tedlium2/gmm/baseline_config.py +++ 
b/common/baselines/tedlium2/gmm/baseline_config.py @@ -27,6 +27,7 @@ def run_tedlium2_common_baseline( final_output_args.define_corpus_type("train", "train") final_output_args.define_corpus_type("dev", "dev") final_output_args.define_corpus_type("test", "test") + final_output_args.define_corpus_type("dev_kaldi_small_4_gram", "test") # final_output_args.add_feature_to_extract("gt") steps = RasrSteps() diff --git a/common/baselines/tedlium2/hybrid/__init__.py b/common/baselines/tedlium2/hybrid/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/baselines/tedlium2/hybrid/baseline_args.py b/common/baselines/tedlium2/hybrid/baseline_args.py new file mode 100644 index 000000000..75b2ac679 --- /dev/null +++ b/common/baselines/tedlium2/hybrid/baseline_args.py @@ -0,0 +1,51 @@ +from i6_core.features import filter_width_from_channels + + +def get_gammatone_feature_extraction_args(): + return { + "gt_options": { + "minfreq": 100, + "maxfreq": 7500, + "channels": 50, + "tempint_type": "hanning", + "tempint_shift": 0.01, + "tempint_length": 0.025, + "flush_before_gap": True, + "do_specint": False, + "specint_type": "hanning", + "specint_shift": 4, + "specint_length": 9, + "normalize": True, + "preemphasis": True, + "legacy_scaling": False, + "without_samples": False, + "samples_options": { + "audio_format": "wav", + "dc_detection": False, + }, + "normalization_options": {}, + } + } + + +def get_log_mel_feature_extraction_args(): + + return { + "fb": { + "filterbank_options": { + "warping_function": "mel", + "filter_width": filter_width_from_channels(channels=80, warping_function="mel", f_max=8000), + "normalize": True, + "normalization_options": None, + "without_samples": False, + "samples_options": { + "audio_format": "wav", + "dc_detection": False, + }, + "fft_options": None, + "add_features_output": True, + "apply_log": True, + "add_epsilon": True, + } + } + } diff --git a/common/baselines/tedlium2/hybrid/baseline_config.py 
b/common/baselines/tedlium2/hybrid/baseline_config.py new file mode 100644 index 000000000..4146a8036 --- /dev/null +++ b/common/baselines/tedlium2/hybrid/baseline_config.py @@ -0,0 +1,76 @@ +import copy +from sisyphus import gs, tk + +from i6_core.features import FilterbankJob + +from i6_experiments.common.setups.rasr.util import RasrSteps +from i6_experiments.common.setups.rasr.hybrid_system import HybridSystem +from i6_experiments.common.baselines.tedlium2.default_tools import RETURNN_RC_ROOT, SCTK_BINARY_PATH2 + +from .data import get_corpus_data_inputs +from .baseline_args import get_log_mel_feature_extraction_args +from i6_experiments.common.baselines.tedlium2.hybrid.nn_config.nn_args import get_nn_args as get_nn_args2 + + +def run_gmm_system(): + from i6_experiments.common.baselines.tedlium2.gmm.baseline_config import ( + run_tedlium2_common_baseline, + ) + + system = run_tedlium2_common_baseline() + return system + + +def run_tedlium2_hybrid_baseline(): + gs.ALIAS_AND_OUTPUT_SUBDIR = "baselines/tedlium2/hybrid/baseline" + + gmm_system = run_gmm_system() + rasr_init_args = copy.deepcopy(gmm_system.rasr_init_args) + rasr_init_args.scorer_args["sctk_binary_path"] = SCTK_BINARY_PATH2 # Hack to have a U16 compiled SCTK + rasr_init_args.feature_extraction_args = get_log_mel_feature_extraction_args() + ( + nn_train_data_inputs, + nn_cv_data_inputs, + nn_devtrain_data_inputs, + nn_dev_data_inputs, + nn_test_data_inputs, + ) = get_corpus_data_inputs( + gmm_system, + rasr_init_args.feature_extraction_args["fb"], + FilterbankJob, + alias_prefix="experiments/tedlium2/hybrid/wei_baseline", + ) + # image only, so just python3 + returnn_exe = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") + blas_lib = tk.Path( + "/work/tools/asr/tensorflow/2.3.4-generic+cuda10.1+mkl/bazel_out/external/mkl_linux/lib/libmklml_intel.so", + hash_overwrite="TF23_MKL_BLAS", + ) + blas_lib.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" + rasr_binary = 
tk.Path("/work/tools/asr/rasr/20211217_tf23_cuda101_mkl/arch/linux-x86_64-standard") + steps = RasrSteps() + steps.add_step("extract", rasr_init_args.feature_extraction_args) + gmm_system.run(steps) + nn_args = get_nn_args2(num_epochs=160) + nn_steps = RasrSteps() + nn_steps.add_step("nn", nn_args) + + tedlium_nn_system = HybridSystem( + returnn_root=RETURNN_RC_ROOT, + returnn_python_exe=returnn_exe, + blas_lib=blas_lib, + rasr_arch="linux-x86_64-standard", + rasr_binary_path=rasr_binary, + ) + tedlium_nn_system.init_system( + rasr_init_args=rasr_init_args, + train_data=nn_train_data_inputs, + cv_data=nn_cv_data_inputs, + devtrain_data=nn_devtrain_data_inputs, + dev_data=nn_dev_data_inputs, + test_data=nn_test_data_inputs, + train_cv_pairing=[tuple(["train.train", "dev.cv"])], + ) + tedlium_nn_system.run(nn_steps) + + gs.ALIAS_AND_OUTPUT_SUBDIR = "" diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py new file mode 100644 index 000000000..40faaf99e --- /dev/null +++ b/common/baselines/tedlium2/hybrid/data.py @@ -0,0 +1,270 @@ +from typing import Optional, Dict, Any, Tuple, Callable +from sisyphus import tk + +from i6_core import corpus as corpus_recipe +from i6_core.returnn import ReturnnDumpHDFJob +from i6_core.features import FeatureExtractionJob + +from i6_experiments.common.datasets.tedlium2.constants import DURATIONS, NUM_SEGMENTS +from i6_experiments.common.setups.rasr.gmm_system import GmmSystem +from i6_experiments.common.setups.rasr.util import ( + HdfDataInput, + AllophoneLabeling, + ReturnnRasrDataInput, + ForcedAlignmentArgs, +) +from i6_experiments.common.datasets.tedlium2.lexicon import get_g2p_augmented_bliss_lexicon +from ..default_tools import RETURNN_EXE, RETURNN_RC_ROOT + + +def build_hdf_data_input( + features: tk.Path, + allophone_labeling: AllophoneLabeling, + alignments: tk.Path, + segment_list: Optional[tk.Path] = None, + alias_prefix: Optional[str] = None, + partition_epoch: int = 1, + 
acoustic_mixtures: Optional = None, + seq_ordering: str = "sorted", +): + """ + Dumps features and alignments from RASR into hdfs, to enable full RETURNN training + :param features: Feature bundle generated by the dump_features_for_hybrid_training function + :param allophone_labeling: Allophone labeling including silence_phoneme, allophones and state_tying + :param alignments: Target alignments generated from the pre-trained GMM + :param segment_list: segment list for the alignment dataset which will serve as seq_control dataset + :param alias_prefix: Prefix for the dump jobs + :param partition_epoch: Partition epoch for the alignment dataset, mainly relevant for training dataset + :param acoustic_mixtures: Acoustic mixture file from the GMM for prior calculation, most likely going to be replaced + :param seq_ordering: sequence ordering for the align dataset, usually sorted for dev/eval and laplace for train + :return: + """ + + feat_dataset = { + "class": "SprintCacheDataset", + "data": { + "data": { + "filename": features, + "data_type": "feat", + "allophone_labeling": { + "silence_phone": allophone_labeling.silence_phone, + "allophone_file": allophone_labeling.allophone_file, + "state_tying_file": allophone_labeling.state_tying_file, + }, + } + }, + "seq_list_filter_file": segment_list, + } + + feat_job = ReturnnDumpHDFJob( + data=feat_dataset, + returnn_python_exe=RETURNN_EXE, + returnn_root=RETURNN_RC_ROOT, + ) + if alias_prefix is not None: + feat_job.add_alias(alias_prefix + "/dump_features") + feat_hdf = feat_job.out_hdf + align_dataset = { + "class": "SprintCacheDataset", + "data": { + "data": { + "filename": alignments, + "data_type": "align", + "allophone_labeling": { + "silence_phone": allophone_labeling.silence_phone, + "allophone_file": allophone_labeling.allophone_file, + "state_tying_file": allophone_labeling.state_tying_file, + }, + } + }, + "seq_list_filter_file": segment_list, + } + align_job = ReturnnDumpHDFJob(data=align_dataset, 
returnn_python_exe=RETURNN_EXE, returnn_root=RETURNN_RC_ROOT) + if alias_prefix is not None: + align_job.add_alias(alias_prefix + "/dump_alignments") + align_hdf = align_job.out_hdf + + return HdfDataInput( + features=feat_hdf, + alignments=align_hdf, + partition_epoch=partition_epoch, + acoustic_mixtures=acoustic_mixtures, + seq_ordering=seq_ordering, + segment_file=segment_list, + ) + + +def dump_features_for_hybrid_training( + gmm_system: GmmSystem, + feature_extraction_args: Dict[str, Any], + feature_extraction_class: Callable[[Any, ...], FeatureExtractionJob], +) -> Tuple[tk.Path, tk.Path, tk.Path]: + """ + + :param gmm_system: GMM system to get corpora from + :param feature_extraction_args: Args for the feature extraction + :param feature_extraction_class: Feature extraction class/job to be used for extraction + :return: + """ + features = {} + for name in ["nn-train", "nn-cv", "nn-devtrain"]: + features[name] = list( + feature_extraction_class(gmm_system.crp[name], **feature_extraction_args).out_feature_bundle.values() + )[0] + return features["nn-train"], features["nn-cv"], features["nn-devtrain"] + + +def get_corpus_data_inputs( + gmm_system: GmmSystem, + feature_extraction_args: Dict[str, Any], + feature_extraction_class: Callable[[Any], FeatureExtractionJob], + alias_prefix: Optional[str] = None, + remove_faulty_segments: bool = False, +) -> Tuple[ + Dict[str, HdfDataInput], + Dict[str, HdfDataInput], + Dict[str, HdfDataInput], + Dict[str, ReturnnRasrDataInput], + Dict[str, ReturnnRasrDataInput], +]: + """ + Builds the data inputs for the hybrid system, inlcuding 3 training hdf pairs with align and feature dataset for + full returnn training + :param gmm_system: Pre-trained GMM-system to derive the hybrid setup from + :param feature_extraction_args: Args for the feature extraction of the hybrid features (might be different from GMM) + :param feature_extraction_class: Feature extraction class/job to be used for extraction + :param alias_prefix: Prefix for 
naming of experiments + :return: + """ + + train_corpus_path = gmm_system.corpora["train"].corpus_file + cv_corpus_path = gmm_system.corpora["dev"].corpus_file + + cv_corpus_path = corpus_recipe.FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=cv_corpus_path, bliss_lexicon=get_g2p_augmented_bliss_lexicon(), all_unknown=False + ).out_corpus + + total_train_num_segments = NUM_SEGMENTS["train"] + + all_train_segments = corpus_recipe.SegmentCorpusJob(train_corpus_path, 1).out_single_segment_files[1] + if remove_faulty_segments: + all_train_segments = corpus_recipe.FilterSegmentsByListJob( + segment_files={1: all_train_segments}, + filter_list=["TED-LIUM-realease2/AndrewMcAfee_2013/23", "TED-LIUM-realease2/iOTillettWright_2012X/43"], + ).out_single_segment_files[1] + cv_segments = corpus_recipe.SegmentCorpusJob(cv_corpus_path, 1).out_single_segment_files[1] + + dev_train_size = 500 / total_train_num_segments + splitted_train_segments_job = corpus_recipe.ShuffleAndSplitSegmentsJob( + all_train_segments, + {"devtrain": dev_train_size, "unused": 1 - dev_train_size}, + ) + devtrain_segments = splitted_train_segments_job.out_segments["devtrain"] + + # ******************** NN Init ******************** + + gmm_system.add_overlay("train", "nn-train") + gmm_system.crp["nn-train"].segment_path = all_train_segments + gmm_system.crp["nn-train"].concurrent = 1 + gmm_system.crp["nn-train"].corpus_duration = DURATIONS["train"] + + gmm_system.add_overlay("dev", "nn-cv") + gmm_system.crp["nn-cv"].corpus_config.file = cv_corpus_path + gmm_system.crp["nn-cv"].segment_path = cv_segments + gmm_system.crp["nn-cv"].concurrent = 1 + gmm_system.crp["nn-cv"].corpus_duration = DURATIONS["dev"] + + gmm_system.add_overlay("train", "nn-devtrain") + gmm_system.crp["nn-devtrain"].segment_path = devtrain_segments + gmm_system.crp["nn-devtrain"].concurrent = 1 + gmm_system.crp["nn-devtrain"].corpus_duration = DURATIONS["train"] * dev_train_size + + # ******************** extract features 
******************** + + train_features, cv_features, devtrain_features = dump_features_for_hybrid_training( + gmm_system, + feature_extraction_args, + feature_extraction_class, + ) + + allophone_labeling = AllophoneLabeling( + silence_phone="[SILENCE]", + allophone_file=gmm_system.allophone_files["train"], + state_tying_file=gmm_system.jobs["train"]["state_tying"].out_state_tying, + ) + + forced_align_args = ForcedAlignmentArgs( + name="nn-cv", + target_corpus_keys=["nn-cv"], + flow="uncached_mfcc+context+lda+vtln+cmllr", + feature_scorer="train_vtln+sat", + scorer_index=-1, + bliss_lexicon={ + "filename": get_g2p_augmented_bliss_lexicon(), + "normalize_pronunciation": False, + }, + dump_alignment=True, + ) + gmm_system.run_forced_align_step(forced_align_args) + + nn_train_data = build_hdf_data_input( + features=train_features, + alignments=gmm_system.outputs["train"]["final"].as_returnn_rasr_data_input().alignments.alternatives["bundle"], + allophone_labeling=allophone_labeling, + alias_prefix=alias_prefix + "/nn_train_data", + partition_epoch=5, + acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, # TODO: NN Mixtures + seq_ordering="laplace:.1000", + ) + tk.register_output(f"{alias_prefix}/nn_train_data/features", nn_train_data.features) + tk.register_output(f"{alias_prefix}/nn_train_data/alignments", nn_train_data.alignments) + nn_devtrain_data = build_hdf_data_input( + features=devtrain_features, + alignments=gmm_system.outputs["train"]["final"].as_returnn_rasr_data_input().alignments.alternatives["bundle"], + allophone_labeling=allophone_labeling, + segment_list=devtrain_segments, + alias_prefix=alias_prefix + "/nn_devtrain_data", + partition_epoch=1, + seq_ordering="sorted", + ) + tk.register_output(f"{alias_prefix}/nn_devtrain_data/features", nn_devtrain_data.features) + tk.register_output(f"{alias_prefix}/nn_devtrain_data/alignments", nn_devtrain_data.alignments) + nn_cv_data = build_hdf_data_input( + features=cv_features, + 
alignments=gmm_system.alignments["nn-cv_forced-align"]["nn-cv"].alternatives["bundle"], + allophone_labeling=allophone_labeling, + alias_prefix=alias_prefix + "/nn_cv_data", + partition_epoch=1, + seq_ordering="sorted", + ) + tk.register_output(f"{alias_prefix}/nn_cv_data/features", nn_cv_data.features) + tk.register_output(f"{alias_prefix}/nn_cv_data/alignments", nn_cv_data.alignments) + + nn_train_data_inputs = { + "train.train": nn_train_data, + } + nn_devtrain_data_inputs = { + "train.devtrain": nn_devtrain_data, + } + + nn_cv_data_inputs = { + "dev.cv": nn_cv_data, + } + + nn_dev_data_inputs = { + "dev": gmm_system.outputs["dev"]["final"].as_returnn_rasr_data_input(), + "dev_kaldi_small": gmm_system.outputs["dev_kaldi_small_4_gram"]["final"].as_returnn_rasr_data_input(), + } + nn_test_data_inputs = { + # "test": gmm_system.outputs["test"][ + # "final" + # ].as_returnn_rasr_data_input(), + } + + return ( + nn_train_data_inputs, + nn_cv_data_inputs, + nn_devtrain_data_inputs, + nn_dev_data_inputs, + nn_test_data_inputs, + ) diff --git a/common/baselines/tedlium2/hybrid/nn_config/experiment.py b/common/baselines/tedlium2/hybrid/nn_config/experiment.py new file mode 100644 index 000000000..5ee3529e0 --- /dev/null +++ b/common/baselines/tedlium2/hybrid/nn_config/experiment.py @@ -0,0 +1,17 @@ +from .helper import get_network +from .helper import make_nn_config + + +def get_wei_config(specaug=False): + network = get_network(spec_augment=specaug) + nn_config = make_nn_config(network) + nn_config["extern_data"] = { + "data": { + "dim": 80, + "shape": (None, 80), + "available_for_inference": True, + }, # input: 80-dimensional logmel features + "classes": {"dim": 9001, "shape": (None,), "available_for_inference": True, "sparse": True, "dtype": "int16"}, + } + + return nn_config diff --git a/common/baselines/tedlium2/hybrid/nn_config/helper.py b/common/baselines/tedlium2/hybrid/nn_config/helper.py new file mode 100644 index 000000000..3f8dec4e5 --- /dev/null +++ 
b/common/baselines/tedlium2/hybrid/nn_config/helper.py @@ -0,0 +1,86 @@ +import copy + + +from .nn_setup import build_encoder_network, add_output_layer, add_specaug_source_layer + + +default_nn_config_args = { # batching # + "batch_size": 10000, + "max_seqs": 128, + "chunking": "64:32", # better than 50:25 + "batching": "random", + "min_seq_length": {"classes": 1}, + # optimization # + #'nadam': True, + "learning_rate": 0.0009, + "gradient_clip": 0, + "gradient_noise": 0.1, # together with l2 and dropout for overfit + # Note: (default 1e-8) likely not too much impact + #'optimizer_epsilon': 1e-8, + "optimizer": {"class": "nadam", "epsilon": 1e-08}, + # let it stop and adjust in time + # Note: for inf or nan, sth. is too big (e.g. lr warm up) + # 'stop_on_nonfinite_train_score' : False, + "learning_rate_control": "newbob_multi_epoch", + "newbob_multi_num_epochs": 5, + "newbob_multi_update_interval": 1, + "newbob_learning_rate_decay": 0.9, + # 'newbob_relative_error_threshold' : -0.02, # default -0.01 + # 'min_learning_rate' : 1e-5 # + "learning_rate_control_relative_error_relative_lr": True, + "learning_rate_control_min_num_epochs_per_new_lr": 3, + "start_epoch": "auto", + "start_batch": "auto", + "use_tensorflow": True, + "update_on_device": True, + "multiprocessing": True, + "cache_size": "0", + "truncation": -1, + "window": 1, +} + + +def get_network(num_layers=6, layer_size=512, spec_augment=False, **kwargs): + + lstm_args = { + "num_layers": num_layers, + "size": layer_size, + "l2": 0.01, + "dropout": 0.1, + "bidirectional": True, + "unit": "nativelstm2", + } + network, from_list = build_encoder_network(**lstm_args) + + output_args = { + "loss": "ce", + "loss_opts": { # less weight on loss of easy samples (larger p) + "focal_loss_factor": 2.0, + }, + } + + network = add_output_layer(network, from_list, **output_args) + + if spec_augment: + network, from_list2 = add_specaug_source_layer(network) + + return copy.deepcopy(network) + + +def make_nn_config(network, 
nn_config_args=default_nn_config_args, **kwargs): + + nn_config = copy.deepcopy(nn_config_args) + nn_config["network"] = network + + # common training settings + optimizer = kwargs.pop("optimizer", None) + if optimizer is not None and not optimizer == "nadam": + del nn_config["nadam"] + nn_config[optimizer] = True + if kwargs.pop("no_pretrain", False): + del nn_config["pretrain"] + if kwargs.pop("no_chunking", False): + del nn_config["chunking"] + # Note: whatever left ! + nn_config.update(kwargs) + return nn_config diff --git a/common/baselines/tedlium2/hybrid/nn_config/nn_args.py b/common/baselines/tedlium2/hybrid/nn_config/nn_args.py new file mode 100644 index 000000000..a286bb744 --- /dev/null +++ b/common/baselines/tedlium2/hybrid/nn_config/nn_args.py @@ -0,0 +1,91 @@ +import copy + +from .experiment import get_wei_config +from .nn_setup import get_spec_augment_mask_python +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.common.setups.rasr.util import HybridArgs, ReturnnTrainingJobArgs +from copy import deepcopy + + +def get_nn_args(num_epochs=125, no_min_seq_len=False): + + # gets the hardcoded config from existing setup for baseline and comparison + base_config = get_wei_config() + returnn_config = ReturnnConfig(config=base_config) + # two variants of spec augment + spec_augment_args = { + "max_time_num": 3, + "max_time": 10, + "max_feature_num": 5, + "max_feature": 18, + "conservatvie_step": 2000, + } + specaug = get_spec_augment_mask_python(**spec_augment_args) + specaug_config = get_wei_config(specaug=True) + spec_cfg = ReturnnConfig(config=copy.deepcopy(specaug_config), python_epilog=specaug) + + configs = { + "wei_base_config": returnn_config, + "wei_specaug_config": spec_cfg, + } + + # change softmax to log softmax for hybrid + recog_configs = deepcopy(configs) + for config_name in recog_configs: + recog_configs[config_name].config["network"]["output"]["class"] = "log_softmax" + 
recog_configs[config_name].config["network"]["output"]["class"] = "linear" + recog_configs[config_name].config["network"]["output"]["activation"] = "log_softmax" + + # arguments for ReturnnTraining for now fixed + training_args = ReturnnTrainingJobArgs( + num_epochs=num_epochs, + log_verbosity=5, + save_interval=1, + keep_epochs=None, + time_rqmt=168, + mem_rqmt=8, + cpu_rqmt=3, + ) + + recognition_args = { + "dev": { + "epochs": [num_epochs], + "feature_flow_key": "fb", + "prior_scales": [0.7, 0.8, 0.9], + "pronunciation_scales": [0.0], + "lm_scales": [10.0, 7.5, 5.0], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam-pruning": 14.0, + "beam-pruning-limit": 100000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": True, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 50, + "mem": 8, + "lmgc_mem": 16, + "cpu": 4, + "parallelize_conversion": True, + "use_epoch_for_compile": True, + "native_ops": ["NativeLstm2"], + }, + } + + nn_args = HybridArgs( + returnn_training_configs=configs, + returnn_recognition_configs=recog_configs, + training_args=training_args, + recognition_args=recognition_args, + test_recognition_args=None, + ) + + return nn_args diff --git a/common/baselines/tedlium2/hybrid/nn_config/nn_setup.py b/common/baselines/tedlium2/hybrid/nn_config/nn_setup.py new file mode 100644 index 000000000..3340c5d6a --- /dev/null +++ b/common/baselines/tedlium2/hybrid/nn_config/nn_setup.py @@ -0,0 +1,854 @@ +## lm_config +import os +import copy + +# ------------------------------ Recipes ------------------------------ +from sisyphus import tk + +Path = tk.Path + +# only used for seq-training/full_sum training +# import recipe.crnn as crnn + +### construct RETURNN network layers on demand of training ### + + +def make_network(): + network = 
def make_network():
    """Return an empty RETURNN network dict plus the initial input list (["data"])."""
    network = dict()
    fromList = ["data"]
    return network, fromList


def add_loss_to_layer(network, name, loss, loss_opts=None, target=None, **kwargs):
    """Attach a training loss (and optional loss_opts / target) to the existing layer `name`."""
    assert loss is not None
    network[name]["loss"] = loss
    if loss_opts:  # empty dict deliberately skipped
        network[name]["loss_opts"] = loss_opts
    if target is not None:
        network[name]["target"] = target
    return network


def add_specaug_source_layer(network, name="source", nextLayers=None):
    """Insert a SpecAugment eval layer in front of `nextLayers`.

    Operates on a deep copy, so the input network is left untouched.
    Layers listed in `nextLayers` but absent from the network are ignored.
    """
    if nextLayers is None:
        nextLayers = ["fwd_lstm_1", "bwd_lstm_1"]
    network2 = copy.deepcopy(network)
    network2[name] = {
        "class": "eval",
        "eval": "self.network.get_config().typed_value('transform')(source(0, as_data=True), network=self.network)",
    }
    for layer in nextLayers:
        if layer not in network2:
            continue
        network2[layer]["from"] = [name]
    return network2, name


def add_linear_layer(
    network: dict,
    name: str,
    fromList,
    size: int,
    l2: float = 0.01,
    dropout: float = None,
    bias: bool = None,
    activation: str = None,
    **kwargs,
):
    """Add a linear layer; optional kwargs: random_norm_init, initial, loss (+loss args),
    reuse_params, trainable, out_type, safe_embedding, validate_indices."""
    network[name] = {"class": "linear", "n_out": size, "from": fromList, "activation": activation}
    if l2 is not None:
        network[name]["L2"] = l2
    if dropout is not None:
        network[name]["dropout"] = dropout
    # bias is default true in RETURNN
    if bias is not None:
        network[name]["with_bias"] = bias
    if kwargs.get("random_norm_init", False):
        network[name]["forward_weights_init"] = "random_normal_initializer(mean=0.0, stddev=0.1)"
    if kwargs.get("initial", None) is not None:
        network[name]["initial_output"] = kwargs.get("initial", None)
    if kwargs.get("loss", None) is not None:
        network = add_loss_to_layer(network, name, **kwargs)
    if kwargs.get("reuse_params", None) is not None:
        network[name]["reuse_params"] = kwargs.get("reuse_params", None)
    if not kwargs.get("trainable", True):
        network[name]["trainable"] = False
    if kwargs.get("out_type", None) is not None:
        network[name]["out_type"] = kwargs.get("out_type", None)
    # Note: this is not in the master RETURNN branch
    if kwargs.get("safe_embedding", False):
        network[name]["safe_embedding"] = True  # 0-vectors for out-of-range ids (only for embedding)
    if kwargs.get("validate_indices", False):
        network[name]["validate_indices"] = True  # round out-of-range ids to 0 (only for embedding)
    return network, name


def add_activation_layer(network, name, fromList, activation, **kwargs):
    """Add a pure activation layer; optional loss via kwargs."""
    network[name] = {"class": "activation", "from": fromList, "activation": activation}
    if kwargs.get("loss", None) is not None:
        network = add_loss_to_layer(network, name, **kwargs)
    return network, name


def add_lstm_layer(
    network, name, fromList, size, l2=0.01, dropout=0.1, bidirectional=True, unit="nativelstm2", **kwargs
):
    """Add a (bi)directional rec layer; returns the pair of names for the bidirectional case,
    otherwise the single name."""
    if bidirectional:
        layers = [("fwd_" + name, 1), ("bwd_" + name, -1)]
    else:
        layers = [(name, 1)]

    names = []
    for n, d in layers:
        network[n] = {
            "class": "rec",
            "unit": unit,
            "n_out": size,
            "from": fromList,
            "direction": d,
            "dropout": dropout,
            "L2": l2,
        }
        if kwargs.get("drop_connect", None) is not None:
            network[n]["unit_opts"] = {"rec_weight_dropout": kwargs.get("drop_connect", None)}
        if kwargs.get("random_norm_init", False):
            network[n]["forward_weights_init"] = "random_normal_initializer(mean=0.0, stddev=0.1)"
            network[n]["recurrent_weights_init"] = "random_normal_initializer(mean=0.0, stddev=0.1)"
            network[n]["bias_init"] = "random_normal_initializer(mean=0.0, stddev=0.1)"
        if not kwargs.get("trainable", True):
            network[n]["trainable"] = False
        names.append(n)

    if len(names) == 1:
        names = names[0]
    return network, names


def add_constant_layer(network, name, value, dtype="int32", with_batch_dim=True, **kwargs):
    """Add a constant layer; optional out_type / initial via kwargs."""
    network[name] = {"class": "constant", "value": value, "dtype": dtype, "with_batch_dim": with_batch_dim}
    if kwargs.get("out_type", {}):  # empty dict deliberately skipped
        network[name]["out_type"] = kwargs.get("out_type", {})
    if kwargs.get("initial", None) is not None:
        network[name]["initial_output"] = kwargs.get("initial", None)
    return network, name


def add_cast_layer(network, name, fromList, dtype="float32"):
    """Add a dtype cast layer."""
    network[name] = {"class": "cast", "from": fromList, "dtype": dtype}
    return network, name


def add_expand_dim_layer(network, name, fromList, axis, out_type=None):
    """Add an expand_dims layer.

    Fix: the `axis` argument is now actually forwarded (it used to be hardcoded to 2).
    If int, the axis counts in batch-major order (RETURNN semantics).
    """
    network[name] = {"class": "expand_dims", "from": fromList, "axis": axis}
    if out_type is not None:
        network[name]["out_type"] = out_type
    return network, name


def add_copy_layer(network, name, fromList, initial=None, loss=None, **kwargs):
    """Add a copy layer; optional initial output, loss, is_output flag and dropout."""
    network[name] = {"class": "copy", "from": fromList}
    if initial is not None:
        network[name]["initial_output"] = initial
    if loss is not None:
        network = add_loss_to_layer(network, name, loss, **kwargs)
    if kwargs.get("is_output", False):
        network[name]["is_output_layer"] = True
    if kwargs.get("dropout", None) is not None:
        network[name]["dropout"] = kwargs.get("dropout", None)
    return network, name


def add_compare_layer(network, name, fromList, value=None, kind="not_equal", initial=None):
    """Add a compare layer of the given kind (default: not_equal)."""
    network[name] = {"class": "compare", "from": fromList, "kind": kind}
    if value is not None:
        network[name]["value"] = value
    if initial is not None:
        network[name]["initial_output"] = initial
    return network, name


def make_subnet(fromList, net):
    """Wrap `net` as a subnetwork layer dict (not inserted into any network)."""
    subnet = {"class": "subnetwork", "from": fromList, "subnetwork": net}
    return subnet


# masked computation
def add_mask_layer(network, name, fromList, mask, unit=None, **kwargs):
    """Add a masked_computation layer.

    Fix: `unit` used to default to a shared mutable dict which was mutated when
    `initial` was given; a fresh dict is now created per call.
    """
    if unit is None:
        unit = {"class": "copy"}
    network[name] = {
        "class": "masked_computation",
        "from": fromList,
        "mask": mask,
        "unit": unit,
    }
    # more likely to be used in training where input is already masked elsewhere: directly use
    if kwargs.get("masked_from", None) is not None:
        network[name]["masked_from"] = kwargs.get("masked_from", None)
    # heuristics likely not needed anymore, use pad layer to achieve the same
    if kwargs.get("initial", None) is not None:
        network[name]["unit"]["initial_output"] = kwargs.get("initial", None)
    if kwargs.get("keep_last_for_prev", False):
        network[name]["keep_last_for_prev"] = True
    if kwargs.get("is_output", False):
        network[name]["is_output_layer"] = True
    return network, name


def add_unmask_layer(network, name, fromList, mask, **kwargs):
    """Add an unmask layer; by default skips initial frames instead of using initial_output."""
    network[name] = {"class": "unmask", "from": fromList, "mask": mask}
    # do not use initial_output but directly the 1st frame of input for the first Fs
    if kwargs.get("skip_initial", True):
        network[name]["skip_initial"] = True
    return network, name


def add_padding_layer(
    network, name, fromList, axes="T", padding=(0, 1), value=0, mode="constant", n_out=None, **kwargs
):
    """Add a pad layer; optional n_out, is_output, initial and out_type via kwargs."""
    network[name] = {"class": "pad", "from": fromList, "axes": axes, "padding": padding, "value": value, "mode": mode}
    if n_out is not None:
        network[name]["n_out"] = n_out
    if kwargs.get("is_output", False):
        network[name]["is_output_layer"] = True
    if kwargs.get("initial", None) is not None:
        network[name]["initial_output"] = kwargs.get("initial", None)
    if kwargs.get("out_type", None) is not None:
        network[name]["out_type"] = kwargs.get("out_type", None)
    return network, name


def add_time_postfix_layer(network, name, fromList, postfix, repeat=1):
    """Add a postfix_in_time layer."""
    network[name] = {"class": "postfix_in_time", "from": fromList, "postfix": postfix, "repeat": repeat}
    return network, name


def add_axis_range_layer(network, name, fromList, axis="T", unbroadcast=True):
    """Add a range_in_axis layer."""
    network[name] = {"class": "range_in_axis", "from": fromList, "axis": axis, "unbroadcast": unbroadcast}
    return network, name


def add_shift_layer(network, name, fromList, axis="T", amount=1, pad=True, **kwargs):
    """Add a shift_axis layer; optional adjust_size (-> adjust_size_info) and initial."""
    network[name] = {"class": "shift_axis", "from": fromList, "axis": axis, "amount": amount, "pad": pad}
    if kwargs.get("adjust_size", None) is not None:
        network[name]["adjust_size_info"] = kwargs.get("adjust_size", None)
    if kwargs.get("initial", None) is not None:
        network[name]["initial_output"] = kwargs.get("initial", None)
    return network, name


def add_seq_len_mask_layer(network, name, fromList, axis="T", mask_value=0):
    """Add a seq_len_mask layer."""
    network[name] = {"class": "seq_len_mask", "from": fromList, "axis": axis, "mask_value": mask_value}
    return network, name


def add_pool_layer(network, name, fromList, mode="max", pool_size=(2,), padding="same", **kwargs):
    """Add a (non-trainable) pooling layer."""
    network[name] = {
        "class": "pool",
        "mode": mode,
        "padding": padding,
        "pool_size": pool_size,
        "from": fromList,
        "trainable": False,
    }
    return network, name


def add_reinterpret_data_layer(network, name, fromList, size_base=None, **kwargs):
    """Add a reinterpret_data layer; optional loss, enforce_time_major, set_sparse(_dim), is_output."""
    network[name] = {"class": "reinterpret_data", "from": fromList}
    if kwargs.get("loss", None) is not None:
        network = add_loss_to_layer(network, name, **kwargs)
    if size_base is not None:
        network[name]["size_base"] = size_base
    if kwargs.get("enforce_time_major", False):
        network[name]["enforce_time_major"] = True
    if kwargs.get("set_sparse", None) is not None:
        network[name]["set_sparse"] = kwargs.get("set_sparse", None)
    if kwargs.get("set_sparse_dim", None) is not None:
        network[name]["set_sparse_dim"] = kwargs.get("set_sparse_dim", None)
    if kwargs.get("is_output", False):
        network[name]["is_output_layer"] = True
    return network, name


def add_window_layer(network, name, fromList, winSize, winLeft, **kwargs):
    """Add a window layer (default along time axis, 0-padded; also works inside a rec loop)."""
    network[name] = {
        "class": "window",
        "from": fromList,
        "window_size": winSize,
        "window_left": winLeft,
    }
    return network, name


def add_merge_dim_layer(network, name, fromList, axes="except_time", **kwargs):
    """Add a merge_dims layer."""
    network[name] = {"class": "merge_dims", "from": fromList, "axes": axes}
    return network, name


def add_split_dim_layer(network, name, fromList, axis, dims, **kwargs):
    """Add a split_dims layer."""
    network[name] = {"class": "split_dims", "from": fromList, "axis": axis, "dims": dims}
    return network, name


def add_slice_layer(network, name, fromList, axis="F", start=None, end=None, step=None):
    """Add a slice layer over the given axis."""
    network[name] = {
        "class": "slice",
        "from": fromList,
        "axis": axis,
        "slice_start": start,
        "slice_end": end,
        "slice_step": step,
    }
    return network, name


def add_squeeze_layer(network, name, fromList, axis, enforce_batch_dim_axis=None):
    """Add a squeeze layer."""
    network[name] = {"class": "squeeze", "from": fromList, "axis": axis}
    if enforce_batch_dim_axis is not None:
        network[name]["enforce_batch_dim_axis"] = enforce_batch_dim_axis
    return network, name


def add_layer_norm_layer(network, name, fromList):
    """Add a layer_norm layer."""
    network[name] = {"class": "layer_norm", "from": fromList}
    return network, name


def add_batch_norm_layer(network, name, fromList, **kwargs):
    """Add a batch_norm layer; `fix_settings` overrides RETURNN's problematic defaults,
    `freeze_average` freezes the running statistics (consistent with testing)."""
    network[name] = {"class": "batch_norm", "from": fromList}
    # RETURNN defaults wrong
    if kwargs.get("fix_settings", False):
        network[name].update(
            {
                "momentum": 0.1,
                "epsilon": 1e-5,
                # otherwise eval may be batch-size and utterance-order dependent !
                "update_sample_only_in_training": True,
                "delay_sample_update": True,
            }
        )
    # freeze batch norm running average in training: consistent with testing
    if kwargs.get("freeze_average", False):
        network[name]["momentum"] = 0.0
        network[name]["use_sample"] = 1.0
    return network, name


# eval layer is a also special case of combine layer, but we distinguish them explicitly here
# and only restricted to the 'kind' usage
def add_combine_layer(network, name, fromList, kind="add", **kwargs):
    """Add a combine layer of the given kind; optional activation, with_bias, n_out, is_output."""
    network[name] = {"class": "combine", "from": fromList, "kind": kind}
    if kwargs.get("activation", None) is not None:
        network[name]["activation"] = kwargs.get("activation", None)
    if kwargs.get("with_bias", None) is not None:
        network[name]["with_bias"] = kwargs.get("with_bias", None)
    if kwargs.get("n_out", None) is not None:
        network[name]["n_out"] = kwargs.get("n_out", None)
    if kwargs.get("is_output", False):
        network[name]["is_output_layer"] = True
    return network, name


# Note: RETURNN source(i, auto_convert=True, enforce_batch_major=False, as_data=False)
def add_eval_layer(network, name, fromList, eval_str, **kwargs):
    """Add an eval layer with the given eval expression; optional loss, initial, n_out, out_type."""
    network[name] = {"class": "eval", "from": fromList, "eval": eval_str}
    if kwargs.get("loss", None) is not None:
        network = add_loss_to_layer(network, name, **kwargs)
    if kwargs.get("initial", None) is not None:
        network[name]["initial_output"] = kwargs.get("initial", None)
    if kwargs.get("n_out", None) is not None:
        network[name]["n_out"] = kwargs.get("n_out", None)
    if kwargs.get("out_type", None) is not None:
        network[name]["out_type"] = kwargs.get("out_type", None)
    return network, name


def add_variable_layer(network, name, shape, **kwargs):
    """Add a trainable variable layer."""
    network[name] = {"class": "variable", "shape": shape}
    return network, name


# generic attention
def add_attention_layer(network, name, base, weights, **kwargs):
    """Add a generic_attention layer reading values from `base` weighted by `weights`."""
    network[name] = {"class": "generic_attention", "base": base, "weights": weights}
    return network, name
def add_spatial_softmax_layer(network, name, fromList, **kwargs):
    """Add a softmax_over_spatial layer."""
    network[name] = {"class": "softmax_over_spatial", "from": fromList}
    return network, name


def add_rel_pos_encoding_layer(network, name, fromList, n_out, clipping=64, **kwargs):
    """Add a relative_positional_encoding layer (Transformer-XL style)."""
    network[name] = {"class": "relative_positional_encoding", "from": fromList, "n_out": n_out, "clipping": clipping}
    return network, name


def add_self_attention_layer(
    network, name, fromList, n_out, num_heads, total_key_dim, key_shift=None, attention_dropout=None, **kwargs
):
    """Add a self_attention layer; optional relative-position key_shift and attention dropout."""
    network[name] = {
        "class": "self_attention",
        "from": fromList,
        "n_out": n_out,
        "num_heads": num_heads,
        "total_key_dim": total_key_dim,
    }
    if key_shift is not None:
        network[name]["key_shift"] = key_shift
    if attention_dropout is not None:
        network[name]["attention_dropout"] = attention_dropout
    return network, name


def add_conv_layer(
    network, name, fromList, n_out, filter_size, padding="VALID", l2=0.01, bias=True, activation=None, **kwargs
):
    """Add a conv layer; optional strides, groups and trainable via kwargs."""
    network[name] = {
        "class": "conv",
        "from": fromList,
        "n_out": n_out,
        "filter_size": filter_size,
        "padding": padding,
        "with_bias": bias,
        "activation": activation,
    }
    if l2 is not None:
        network[name]["L2"] = l2
    if kwargs.get("strides", None) is not None:
        network[name]["strides"] = kwargs.get("strides", None)
    if kwargs.get("groups", None) is not None:
        network[name]["groups"] = kwargs.get("groups", None)
    if not kwargs.get("trainable", True):
        network[name]["trainable"] = False
    return network, name


def add_gating_layer(network, name, fromList, activation=None, gate_activation="sigmoid", **kwargs):
    """Add a gating layer (GLU when gate_activation is sigmoid)."""
    network[name] = {"class": "gating", "from": fromList, "activation": activation, "gate_activation": gate_activation}
    return network, name


def add_reduce_layer(network, name, fromList, mode="mean", axes="T", keep_dims=False, **kwargs):
    """Add a reduce layer over the given axes."""
    network[name] = {"class": "reduce", "from": fromList, "mode": mode, "axes": axes, "keep_dims": keep_dims}
    return network, name


def add_reduce_out_layer(network, name, fromList, mode="max", num_pieces=2, **kwargs):
    """Add a reduce_out layer (e.g. maxout over feature pieces)."""
    network[name] = {"class": "reduce_out", "from": fromList, "mode": mode, "num_pieces": num_pieces}
    return network, name


# Convolution block
def add_conv_block(
    network, fromList, conv_layers, conv_filter, conv_size, pool_size=None, name_prefix="conv", **kwargs
):
    """Add a stack of conv layers (fixed entry "conv_source" / exit "conv_merged" names),
    with optional per-layer pooling; extra conv args (e.g. strides) pass through kwargs."""
    network, fromList = add_split_dim_layer(network, "conv_source", fromList, axis="F", dims=(-1, 1))
    for idx in range(conv_layers):
        name = name_prefix + "_" + str(idx + 1)
        network, fromList = add_conv_layer(network, name, fromList, conv_size, conv_filter, padding="same", **kwargs)
        if pool_size is not None:
            name += "_pool"
            if isinstance(pool_size, list):
                assert idx < len(pool_size)
                pool = pool_size[idx]
            else:
                pool = pool_size
            assert isinstance(pool, tuple)
            if any([p > 1 for p in pool]):
                network, fromList = add_pool_layer(network, name, fromList, pool_size=pool)
    network, fromList = add_merge_dim_layer(network, "conv_merged", fromList, axes="static")
    return network, fromList


# BLSTM encoder with optional max-pool subsampling
def build_encoder_network(num_layers=6, size=512, max_pool=None, **kwargs):
    """Build a BLSTM encoder; `max_pool[idx]` > 1 inserts pooling after layer idx+1."""
    if max_pool is None:
        max_pool = []
    network, fromList = make_network()
    # Convolution layers (no subsampling)
    if kwargs.pop("initial_convolution", False):
        # TODO no pooling on feature dim ? (correlation is already low)
        conv_layers, conv_filter, conv_size, pool = kwargs.pop("convolution_layers", (2, (3, 3), 32, (1, 2)))
        network, fromList = add_conv_block(
            network, fromList, conv_layers, conv_filter, conv_size, pool_size=pool, **kwargs
        )
    # BLSTM layers
    for idx in range(num_layers):
        name = "lstm_" + str(idx + 1)
        network, fromList = add_lstm_layer(network, name, fromList, size, **kwargs)
        if max_pool and idx < len(max_pool) and max_pool[idx] > 1:
            name = "max_pool_" + str(idx + 1)
            network, fromList = add_pool_layer(network, name, fromList, pool_size=(max_pool[idx],))
    return network, fromList


# Conformer encoder TODO freeze encoder: pass trainable False
def add_conformer_block(network, name, fromList, size, dropout, l2, **kwargs):
    """Add one Conformer block: FF - MHSA - Conv - FF (order of MHSA/Conv switchable)."""

    # feed-forward module
    def add_ff_module(net, n, fin):
        net, fout = add_layer_norm_layer(net, n + "_ln", fin)
        net, fout = add_linear_layer(net, n + "_linear_swish", fout, size * 4, l2=l2, activation="swish")
        net, fout = add_linear_layer(net, n + "_dropout_linear", fout, size, l2=l2, dropout=dropout)
        net, fout = add_copy_layer(net, n + "_dropout", fout, dropout=dropout)
        net, fout = add_eval_layer(net, n + "_half_res_add", [fout, fin], "0.5 * source(0) + source(1)")
        return net, fout

    # multi-head self-attention module
    def add_mhsa_module(net, n, fin, heads, posEncSize, posEncClip, posEnc=True):
        net, fout = add_layer_norm_layer(net, n + "_ln", fin)
        if posEnc:
            net, fpos = add_rel_pos_encoding_layer(net, n + "_relpos_encoding", fout, posEncSize, clipping=posEncClip)
        else:
            fpos = None
        net, fout = add_self_attention_layer(
            net, n + "_self_attention", fout, size, heads, size, key_shift=fpos, attention_dropout=dropout
        )
        net, fout = add_linear_layer(net, n + "_att_linear", fout, size, l2=l2, bias=False)
        net, fout = add_copy_layer(net, n + "_dropout", fout, dropout=dropout)
        net, fout = add_combine_layer(net, n + "_res_add", [fout, fin])
        return net, fout

    # convolution module
    def add_conv_module(net, n, fin, filterSize, bnFix, bnFreeze, bn2ln):
        net, fout = add_layer_norm_layer(net, n + "_ln", fin)
        # glu weights merged into pointwise conv, i.e. linear layer
        net, fout = add_linear_layer(net, n + "_pointwise_conv_1", fout, size * 2, l2=l2)
        net, fout = add_gating_layer(net, n + "_glu", fout)
        net, fout = add_conv_layer(
            net, n + "_depthwise_conv", fout, size, filterSize, padding="same", l2=l2, groups=size
        )
        if bn2ln:
            net, fout = add_layer_norm_layer(net, n + "_bn2ln", fout)
        else:
            net, fout = add_batch_norm_layer(net, n + "_bn", fout, fix_settings=bnFix, freeze_average=bnFreeze)
        net, fout = add_activation_layer(net, n + "_swish", fout, "swish")
        net, fout = add_linear_layer(net, n + "_pointwise_conv_2", fout, size, l2=l2)
        net, fout = add_copy_layer(net, n + "_dropout", fout, dropout=dropout)
        net, fout = add_combine_layer(net, n + "_res_add", [fout, fin])
        return net, fout

    network, fList = add_ff_module(network, name + "_ffmod_1", fromList)

    mhsa_args = {
        "heads": kwargs.get("num_att_heads", 8),
        "posEncSize": kwargs.get("pos_enc_size", 64),
        "posEncClip": kwargs.get("pos_enc_clip", 64),  # default clipping 16 in RETURNN
        "posEnc": kwargs.get("pos_encoding", True),
    }
    conv_args = {
        "filterSize": kwargs.get("conv_filter_size", (32,)),
        "bnFix": kwargs.get("batch_norm_fix", False),
        "bnFreeze": kwargs.get("batch_norm_freeze", False),
        "bn2ln": kwargs.get("batch_norm_to_layer_norm", False),
    }
    if kwargs.get("switch_conv_mhsa_module", False):
        network, fList = add_conv_module(network, name + "_conv_mod", fList, **conv_args)
        network, fList = add_mhsa_module(network, name + "_mhsa_mod", fList, **mhsa_args)
    else:
        network, fList = add_mhsa_module(network, name + "_mhsa_mod", fList, **mhsa_args)
        network, fList = add_conv_module(network, name + "_conv_mod", fList, **conv_args)

    network, fList = add_ff_module(network, name + "_ffmod_2", fList)
    network, fList = add_layer_norm_layer(network, name + "_output", fList)
    return network, fList


def build_conformer_encoder(num_blocks=12, size=512, dropout=0.1, l2=0.0001, max_pool=None, **kwargs):
    """Build a full Conformer encoder with a configurable input (sub)sampling block."""
    if max_pool is None:
        max_pool = []
    network, fromList = make_network()
    # Input block
    if kwargs.get("initial_convolution", True):
        # vgg conv with subsampling 4
        if kwargs.get("vgg_conv", True):
            network, fromList = add_conv_block(
                network, fromList, 1, (3, 3), 32, pool_size=(1, 2), activation="swish", **kwargs
            )
            stride1, stride2 = kwargs.get("vgg_conv_strides", (2, 2))
            # splice conv_2/conv_3 between the conv block and its merge layer
            network, fList = add_conv_layer(
                network,
                "conv_2",
                network[fromList]["from"],
                64,
                (3, 3),
                padding="same",
                strides=(stride1, 1),
                activation="swish",
                **kwargs,
            )
            network, fList = add_conv_layer(
                network, "conv_3", fList, 64, (3, 3), padding="same", strides=(stride2, 1), activation="swish", **kwargs
            )
            network[fromList]["from"] = fList
        elif kwargs.get("stride_subsampling", False):
            conv_layers, conv_filter, conv_size, strides = kwargs.pop("convolution_layers", (2, (3, 3), 32, [2, 2]))
            network, fromList = add_conv_block(
                network, fromList, conv_layers, conv_filter, conv_size, strides=strides, **kwargs
            )
        else:  # max_pool subsampling
            conv_layers, conv_filter, conv_size, pool = kwargs.pop("convolution_layers", (2, (3, 3), 32, (1, 2)))
            network, fromList = add_conv_block(
                network, fromList, conv_layers, conv_filter, conv_size, pool_size=pool, **kwargs
            )
            assert not max_pool
    elif kwargs.get("initial_blstm", False):  # BLSTM with subsampling 4
        layers, uniSize, pool = kwargs.pop("blstm_layers", (2, 512, [2, 2]))
        network, fromList = build_encoder_network(
            num_layers=layers, size=uniSize, max_pool=pool, dropout=dropout, l2=l2, **kwargs
        )
        assert not max_pool
    network, fromList = add_linear_layer(network, "input_linear", fromList, size, l2=l2, bias=False)
    network, fromList = add_copy_layer(network, "input_dropout", fromList, dropout=dropout)

    # Conformer blocks
    for idx in range(num_blocks):
        name = "conformer_" + str(idx + 1)
        network, fromList = add_conformer_block(network, name, fromList, size, dropout, l2, **kwargs)
        # also allow subsampling between conformer blocks
        if max_pool and idx < len(max_pool) and max_pool[idx] > 1:
            name += "_max_pool"
            network, fromList = add_pool_layer(network, name, fromList, pool_size=(max_pool[idx],))
    return network, fromList


# -- output and loss --
def add_loss_layer(network, name, fromList, loss="ce", **kwargs):
    """Add a LossLayer (note RETURNN's trailing-underscore argument names)."""
    network[name] = {"class": "loss", "from": fromList, "loss_": loss}
    if kwargs.get("target", None) is not None:
        network[name]["target_"] = kwargs.get("target", None)
    if kwargs.get("loss_opts", None) is not None:
        network[name]["loss_opts_"] = kwargs.get("loss_opts", None)
    return network, name


def add_output_layer(network, fromList, name="output", loss="ce", loss_opts=None, cls="softmax", **kwargs):
    """Add the final output layer; with loss=None an explicit n_out is required.

    Returns the network only (not a (network, name) pair).
    """
    network[name] = {"class": cls, "from": fromList}
    if loss is not None:
        network = add_loss_to_layer(network, name, loss, loss_opts=loss_opts, **kwargs)
    else:
        n_out = kwargs.get("n_out", None)
        assert n_out is not None, "either loss or n_out need to be given"
        network[name]["n_out"] = n_out
        network[name]["is_output_layer"] = True

    if kwargs.get("random_norm_init", False):
        network[name]["forward_weights_init"] = "random_normal_initializer(mean=0.0, stddev=0.1)"
    if kwargs.get("dropout", None) is not None:
        network[name]["dropout"] = kwargs.get("dropout", None)
    if kwargs.get("loss_scale", None) is not None:
        network[name]["loss_scale"] = kwargs.get("loss_scale", None)
    if kwargs.get("activation", None) is not None:
        network[name]["class"] = "linear"
        network[name]["activation"] = kwargs.get("activation", None)
    if kwargs.get("reuse_params", None) is not None:
        network[name]["reuse_params"] = kwargs.get("reuse_params", None)
    network[name].update(kwargs.get("extra_args", {}))
    return network


def add_sMBR_output(inNetwork, name="output_ac", output="output", ce_smooth=0.1, **kwargs):
    """Add a sequence-discriminative (sMBR) sprint loss next to the CE output,
    scaling CE down to `ce_smooth`. Operates on a deep copy."""
    network = copy.deepcopy(inNetwork)
    network[output]["loss_scale"] = ce_smooth
    network[name] = {
        "class": "copy",
        "from": output,
        "loss": "sprint",
        "loss_scale": 1 - ce_smooth,
        "loss_opts": {
            "sprint_opts": crnn.CustomCRNNSprintTrainingJob.create_sprint_loss_opts(
                loss_mode="sMBR", num_sprint_instance=1
            )
        },
    }
    return network


# full-sum training using sprint FSA (so far only fast_bw loss)
def add_full_sum_output_layer(network, fromList, num_classes, loss="fast_bw", name="output", **kwargs):
    """Add a full-sum (fast_bw) output layer; target=None so the output size is explicit."""
    output_args = {
        "name": name,
        "loss": loss,
        "loss_opts": {
            "sprint_opts": crnn.CustomCRNNSprintTrainingJob.create_sprint_loss_opts(**kwargs),
            "tdp_scale": kwargs.get("tdp_scale", 0.0),
        },
        "extra_args": {"target": None, "n_out": num_classes},  # no target to infer output size
    }
    return add_output_layer(network, fromList, **output_args)


# decoder output layer using rec-layer unit (including prediction and joint network)
def add_decoder_output_rec_layer(network, fromList, recUnit, optimize_move_layers_out=None, **kwargs):
    """Wrap `recUnit` into the "output" rec layer; operates on a deep copy."""
    network = copy.deepcopy(network)
    network["output"] = {
        "class": "rec",
        "from": fromList,
        # only relevant for beam_search: e.g. determine length by targets
        "cheating": False,
        "target": kwargs.get("target", "classes"),
        "unit": recUnit,
    }
    if optimize_move_layers_out is not None:
        network["output"]["optimize_move_layers_out"] = optimize_move_layers_out
    if kwargs.get("max_seq_len", None) is not None:
        network["output"]["max_seq_len"] = kwargs.get("max_seq_len", None)
    return network


def add_choice_layer(network, name="output_choice", fromList=None, initial=0, beam=1, **kwargs):
    """Add a choice layer for (beam) search.

    Fix: `cheating` is now the boolean False — it used to be the *string* "False",
    which is truthy and would have enabled cheating (cf. the boolean in
    add_decoder_output_rec_layer).
    """
    if fromList is None:
        fromList = ["output"]
    network[name] = {
        "class": "choice",
        "target": kwargs.get("target", "classes"),
        "from": fromList,
        "initial_output": initial,
        # only relevant for beam_search: e.g. task='search'
        "cheating": False,  # True would include ground-truth targets in the beam
        "beam_size": beam,
    }
    if kwargs.get("scheduled_sampling", False):
        network[name]["scheduled_sampling"] = kwargs.get("scheduled_sampling", False)
    if kwargs.get("input_type", None) is not None:
        network[name]["input_type"] = kwargs.get("input_type", None)
    # Note: either/none of the following is needed for recognition
    # old compile_tf_graph
    if kwargs.get("is_stochastic_var", None) is not None:
        network[name]["is_stochastic_var"] = kwargs.get("is_stochastic_var", None)
    # new compile_tf_graph
    if kwargs.get("score_dependent", None) is not None:
        network[name]["score_dependent"] = kwargs.get("score_dependent", None)
    return network


def make_recog_rec_network(trainRecNetwork, removeList=None, update=None, recRemoveList=None, recUpdate=None):
    """Derive a recognition network from a training rec network by removing layers
    (substring match on the name) and/or updating layer dicts, optionally also
    inside the "output" rec unit. Note: cannot add new layers."""
    removeList = removeList or []
    update = update or {}
    recRemoveList = recRemoveList or []
    recUpdate = recUpdate or {}

    def modify(net, toRemove, toUpdate):
        network = copy.deepcopy(net)
        for lname in net.keys():
            # name pattern match: removal
            removed = False
            for rk in toRemove:
                if rk in lname:
                    del network[lname]
                    removed = True
                    break
            if removed:
                continue
            # name match: dict update
            if lname in toUpdate:
                network[lname].update(toUpdate[lname])
        return network

    # apply change
    recogRecNetwork = modify(trainRecNetwork, removeList, update)
    if recRemoveList or recUpdate:
        assert recogRecNetwork["output"]["class"] == "rec"
        recUnit = modify(recogRecNetwork["output"]["unit"], recRemoveList, recUpdate)
        recogRecNetwork["output"]["unit"] = recUnit
    return recogRecNetwork


# simple zero-encoder estimated internal LM TODO add more
def make_internal_LM_rec_network(recUnit, name, scale, lm_output, num_classes, blankIndex=0, posterior="output"):
    """Add an internal-LM correction: log(posterior) - scale * log(prior)."""
    assert blankIndex == 0, "assume blank index 0"
    assert posterior in recUnit
    recUnit[posterior].update({"class": "linear", "activation": "log_softmax"})
    # TODO exclude bias ?
    recUnit, fList = add_linear_layer(recUnit, "intLM_logits", lm_output, num_classes, reuse_params=posterior)
    recUnit, fList = add_slice_layer(recUnit, "intLM_logits_noBlank", fList, start=1)
    recUnit, fList = add_activation_layer(recUnit, "intLM_softmax", fList, "log_softmax")
    recUnit, fList = add_padding_layer(
        recUnit, "intLM_prior", fList, axes="F", value=0, padding=(1, 0), n_out=num_classes
    )
    # log(posterior) - alpha * log(prior)
    recUnit, fList = add_eval_layer(recUnit, name, [posterior, fList], "source(0) - %s * source(1)" % (str(scale)))
    return recUnit


## ----------------------- extra python code ----------------------- ##
# SpecAugment #
def get_spec_augment_mask_python(
    codeFile=None,
    max_time_num=6,
    max_time=5,
    max_feature_num=4,
    max_feature=5,
    conservatvie_step=2000,
    feature_limit=None,
    customRep=None,
):
    """Load the SpecAugment python snippet and patch its default constants.

    NOTE: the misspelled kwarg "conservatvie_step" is part of the public interface
    and matches the literal text replaced in spec_augment_mask.py — do not rename.
    """
    if customRep is None:
        customRep = {}
    path = os.path.dirname(os.path.abspath(__file__))
    if codeFile is None:
        if feature_limit is not None:
            codeFile = os.path.join(path, "spec_augment_mask_flimit.py")
        else:
            codeFile = os.path.join(path, "spec_augment_mask.py")
    elif codeFile in os.listdir(path):
        codeFile = os.path.join(path, codeFile)
    with open(codeFile, "r") as f:
        python_code = f.read()

    python_code = python_code.replace("max_time_num = 6", "max_time_num = %d" % max_time_num)
    python_code = python_code.replace("max_time = 5", "max_time = %d" % max_time)
    python_code = python_code.replace("max_feature_num = 4", "max_feature_num = %d" % max_feature_num)
    python_code = python_code.replace("max_feature = 5", "max_feature = %d" % max_feature)
    python_code = python_code.replace("conservatvie_step = 2000", "conservatvie_step = %d" % conservatvie_step)

    if feature_limit is not None:
        assert isinstance(feature_limit, int)
        python_code = python_code.replace("feature_limit = 80", "feature_limit = %d" % feature_limit)

    for old, new in customRep.items():
        python_code = python_code.replace("%s" % old, "%s" % new)
    return python_code
def get_extern_data_python(codeFile=None, nInput=50, nOutput=40):
    """Load the extern_data snippet and substitute the input/output dimensions."""
    if codeFile is None:
        here = os.path.dirname(os.path.abspath(__file__))
        codeFile = os.path.join(here, "extern_data.py")
    with open(codeFile, "r") as fh:
        code = fh.read()

    # plain textual substitution of the dimension placeholders
    code = code.replace("nInput", str(nInput))
    return code.replace("nOutput", str(nOutput))


# custom pretrain construction with down-sampling #
def get_pretrain_python(codeFile=None, repetitions="1", customRep={}):
    """Load the pretrain snippet, patching the repetition count and custom tokens."""
    here = os.path.dirname(os.path.abspath(__file__))
    if codeFile is None:
        codeFile = os.path.join(here, "pretrain.py")
    elif codeFile in os.listdir(here):
        codeFile = os.path.join(here, codeFile)
    with open(codeFile, "r") as fh:
        code = fh.read()

    reps = repetitions if isinstance(repetitions, str) else str(repetitions)
    if reps != "1":
        code = code.replace("'repetitions': 1", "'repetitions': %s" % reps)
    for old, new in customRep.items():
        code = code.replace("%s" % old, "%s" % new)
    return code


def get_segmental_loss_python(codeFile=None, time_axis=None):
    """Load the segmental-loss snippet, optionally redirecting the time axis."""
    here = os.path.dirname(os.path.abspath(__file__))
    if codeFile is None:
        codeFile = os.path.join(here, "segmental_loss.py")
    elif codeFile in os.listdir(here):
        codeFile = os.path.join(here, codeFile)
    with open(codeFile, "r") as fh:
        code = fh.read()
    if time_axis is not None:
        code = code.replace("axis=0", "axis=%d" % time_axis)
    return code


def get_extra_python(codeFile, customRep={}):
    """Load an arbitrary extra-code snippet and apply custom token replacements."""
    assert codeFile is not None
    here = os.path.dirname(os.path.abspath(__file__))
    codeFile = os.path.join(here, codeFile)
    with open(codeFile, "r") as fh:
        code = fh.read()
    for old, new in customRep.items():
        code = code.replace("%s" % old, "%s" % new)
    return code
# for debug only
def summary(name, x):
    """
    :param str name:
    :param tf.Tensor x: (batch,time,feature)
    """
    from returnn.tf.compat import v1 as tf

    # tf.summary.image wants [batch_size, height, width, channels],
    # we have (batch, time, feature).
    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
    tf.summary.image(name, img, max_outputs=10)
    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
    mean = tf.reduce_mean(x)
    tf.summary.scalar("%s_mean" % name, mean)
    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
    tf.summary.scalar("%s_stddev" % name, stddev)
    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))


def _mask(x, batch_axis, axis, pos, max_amount):
    """
    Zero out a random-length span starting at `pos` along `axis`, per batch entry.

    :param tf.Tensor x: (batch,time,feature)
    :param int batch_axis:
    :param int axis:
    :param tf.Tensor pos: (batch,)
    :param int|tf.Tensor max_amount: inclusive
    """
    from returnn.tf.compat import v1 as tf

    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[batch_axis]
    dim = tf.shape(x)[axis]
    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
    pos2 = tf.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
    if batch_axis > axis:
        cond = tf.transpose(cond)  # (dim,batch)
    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (batch_axis, axis) else 1 for i in range(ndim)])
    # Fix: use the package-style RETURNN import, consistent with the
    # `returnn.tf.compat` imports above (old flat `from TFUtil import where_bc`
    # only resolves when legacy RETURNN is directly on sys.path).
    from returnn.tf.util.basic import where_bc

    x = where_bc(cond, 0.0, x)
    return x


def random_mask(x, batch_axis, axis, min_num, max_num, max_dims):
    """
    Apply between min_num and max_num random span masks along `axis`.

    :param tf.Tensor x: (batch,time,feature)
    :param int batch_axis:
    :param int axis:
    :param int|tf.Tensor min_num:
    :param int|tf.Tensor max_num: inclusive
    :param int|tf.Tensor max_dims: inclusive
    """
    from returnn.tf.compat import v1 as tf

    n_batch = tf.shape(x)[batch_axis]
    if isinstance(min_num, int) and isinstance(max_num, int) and min_num == max_num:
        num = min_num
    else:
        num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
    # sample `num` distinct positions via the Gumbel-max trick:
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
    _, indices = tf.nn.top_k(z, num if isinstance(num, int) else tf.reduce_max(num))
    # indices should be sorted, and of shape (batch,num), entries (int32) in [0,dim)
    # indices = tf.Print(indices, ["indices", indices, tf.shape(indices)])
    if isinstance(num, int):
        for i in range(num):
            x = _mask(x, batch_axis=batch_axis, axis=axis, pos=indices[:, i], max_amount=max_dims)
    else:
        # per-batch-entry num: mask only while i < num for that entry
        _, x = tf.while_loop(
            cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
            body=lambda i, x: (
                i + 1,
                tf.where(
                    tf.less(i, num),
                    _mask(x, batch_axis=batch_axis, axis=axis, pos=indices[:, i], max_amount=max_dims),
                    x,
                ),
            ),
            loop_vars=(0, x),
        )
    return x


def transform(data, network):
    """SpecAugment entry point called from the RETURNN "source" eval layer.

    NOTE: the constant assignment lines below (including the "conservatvie_step"
    typo) are string-replaced verbatim by nn_setup.get_spec_augment_mask_python —
    keep their exact spelling and spacing.
    """
    # to be adjusted (20-50%)
    max_time_num = 6
    max_time = 5

    max_feature_num = 4
    max_feature = 5

    # halved before this step
    conservatvie_step = 2000

    x = data.placeholder
    from returnn.tf.compat import v1 as tf

    # summary("features", x)
    step = network.global_train_step
    increase_flag = tf.where(tf.greater_equal(step, conservatvie_step), 0, 1)

    def get_masked():
        x_masked = x
        x_masked = random_mask(
            x_masked,
            batch_axis=data.batch_dim_axis,
            axis=data.time_dim_axis,
            min_num=0,
            max_num=tf.maximum(tf.shape(x)[data.time_dim_axis] // (2 * max_time), max_time_num) // (1 + increase_flag),
            max_dims=max_time,
        )
        x_masked = random_mask(
            x_masked,
            batch_axis=data.batch_dim_axis,
            axis=data.feature_dim_axis,
            min_num=0,
            max_num=max_feature_num // (1 + increase_flag),
            max_dims=max_feature,
        )
        # summary("features_mask", x_masked)
        return x_masked

    # only mask in training; identity in eval
    x = network.cond_on_train(get_masked, lambda: x)
    return x
forced_align_trg_key = trg_key + "_forced-align" + self.add_overlay(trg_key, forced_align_trg_key) + if bliss_lexicon: + self._init_lexicon(forced_align_trg_key, **bliss_lexicon) + + self.forced_align( + target_corpus_key=forced_align_trg_key, + feature_scorer_corpus_key=corpus, + **step_args, + ) + def run_recognition_step( self, step_args, @@ -1390,12 +1408,7 @@ def run(self, steps: Union[List[str], RasrSteps]): # ---------- Forced Alignment ---------- if step_name.startswith("forced_align"): - corpus_keys = step_args.pop("corpus_keys", self.train_corpora) - for corpus in corpus_keys: - self.forced_align( - feature_scorer_corpus_key=corpus, - **step_args, - ) + self.run_forced_align_step(step_args) # ---------- Only Recognition ---------- if step_name.startswith("recog"): diff --git a/common/setups/rasr/hybrid_decoder.py b/common/setups/rasr/hybrid_decoder.py index 4923f0640..fb6ff9294 100644 --- a/common/setups/rasr/hybrid_decoder.py +++ b/common/setups/rasr/hybrid_decoder.py @@ -23,6 +23,7 @@ CombineLmRasrConfig, ) from .util.decode import ( + DevRecognitionParameters, RecognitionParameters, SearchJobArgs, Lattice2CtmArgs, @@ -47,7 +48,7 @@ class HybridDecoder(BaseDecoder): def __init__( self, rasr_binary_path: tk.Path, - rasr_arch: "str" = "linux-x86_64-standard", + rasr_arch: str = "linux-x86_64-standard", compress: bool = False, append: bool = False, unbuffered: bool = False, @@ -155,8 +156,9 @@ def recognition( tf_fwd_input_name: str = "tf-fwd-input", ): """ - run the recognitino, consisting of search, lattice to ctm, and scoring + run the recognition, consisting of search, lattice to ctm, and scoring + :param name: decoding name :param returnn_config: RETURNN config for recognition :param checkpoints: epoch to model checkpoint mapping :param recognition_parameters: keys are the corpus keys so that recog params can be set for specific eval sets. 
@@ -221,3 +223,247 @@ def recognition( scorer_hyp_param_name=scorer_hyp_param_name, optimize_pron_lm_scales=optimize_pron_lm_scales, ) + + +def tune_scales( + decoder: HybridDecoder, + name: str, + returnn_config: Union[returnn.ReturnnConfig, tk.Path], + checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], + lm_configs: Dict[str, LmConfig], + prior_paths: Dict[str, PriorPath], + search_job_args: Union[SearchJobArgs, Dict], + lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], + scorer_args: Union[ScliteScorerArgs, Dict], + optimize_parameters: Union[OptimizeJobArgs, Dict], + epochs: Optional[List[int]] = None, + scorer_hyp_param_name: str = "hyp", + optimize_pron_lm_scales: bool = False, + forward_output_layer: str = "output", + tf_fwd_input_name: str = "tf-fwd-input", +): + """ + this function tunes the prior scale, TDP scale and silence/non-word exit penalties + + :return: + """ + recog_params = { + "tune1": [ + DevRecognitionParameters( + am_scales=[1.0], + lm_scales=[12.0], + prior_scales=[0.3, 0.5, 0.7], + pronunciation_scales=[1.0], + tdp_scales=[0.1, 0.5, 1.0], + speech_tdps=[], + silence_tdps=[], + nonspeech_tdps=[], + altas=[12.0], + ), + ], + } + + decoder.recognition( + name=name, + returnn_config=returnn_config, + checkpoints=checkpoints, + recognition_parameters=recog_params, + lm_configs=lm_configs, + prior_paths=prior_paths, + search_job_args=search_job_args, + lat_2_ctm_args=lat_2_ctm_args, + scorer_args=scorer_args, + optimize_parameters=optimize_parameters, + epochs=epochs, + scorer_hyp_param_name=scorer_hyp_param_name, + optimize_pron_lm_scales=optimize_pron_lm_scales, + forward_output_layer=forward_output_layer, + tf_fwd_input_name=tf_fwd_input_name, + ) + + +def tune_lm_scale( + decoder: HybridDecoder, + name: str, + returnn_config: Union[returnn.ReturnnConfig, tk.Path], + checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], + lm_configs: Dict[str, LmConfig], + prior_paths: Dict[str, PriorPath], + search_job_args: Union[SearchJobArgs, 
Dict], + lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], + scorer_args: Union[ScliteScorerArgs, Dict], + optimize_parameters: Union[OptimizeJobArgs, Dict], + epochs: Optional[List[int]] = None, + scorer_hyp_param_name: str = "hyp", + optimize_pron_lm_scales: bool = False, + forward_output_layer: str = "output", + tf_fwd_input_name: str = "tf-fwd-input", +): + """ + tunes the LM scale + + :return: + """ + recog_params = { + "tune2": [ + DevRecognitionParameters( + am_scales=[1.0], + lm_scales=[12.0], + prior_scales=[0.3, 0.5, 0.7], + pronunciation_scales=[1.0], + tdp_scales=[0.1, 0.5, 1.0], + speech_tdps=[], + silence_tdps=[], + nonspeech_tdps=[], + altas=[0.0], + ), + ], + } + + decoder.recognition( + name=name, + returnn_config=returnn_config, + checkpoints=checkpoints, + recognition_parameters=recog_params, + lm_configs=lm_configs, + prior_paths=prior_paths, + search_job_args=search_job_args, + lat_2_ctm_args=lat_2_ctm_args, + scorer_args=scorer_args, + optimize_parameters=optimize_parameters, + epochs=epochs, + scorer_hyp_param_name=scorer_hyp_param_name, + optimize_pron_lm_scales=optimize_pron_lm_scales, + forward_output_layer=forward_output_layer, + tf_fwd_input_name=tf_fwd_input_name, + ) + + +def tune_search_space( + decoder: HybridDecoder, + name: str, + returnn_config: Union[returnn.ReturnnConfig, tk.Path], + checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], + lm_configs: Dict[str, LmConfig], + prior_paths: Dict[str, PriorPath], + search_job_args: Union[SearchJobArgs, Dict], + lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], + scorer_args: Union[ScliteScorerArgs, Dict], + optimize_parameters: Union[OptimizeJobArgs, Dict], + epochs: Optional[List[int]] = None, + scorer_hyp_param_name: str = "hyp", + optimize_pron_lm_scales: bool = False, + forward_output_layer: str = "output", + tf_fwd_input_name: str = "tf-fwd-input", +): + """ + tunes beam search size and altas + + :return: + """ + recog_params = DevRecognitionParameters() + + decoder.recognition() + 
+ +def tune_beam_pruning_limit( + decoder: HybridDecoder, + name: str, + returnn_config: Union[returnn.ReturnnConfig, tk.Path], + checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], + lm_configs: Dict[str, LmConfig], + prior_paths: Dict[str, PriorPath], + search_job_args: Union[SearchJobArgs, Dict], + lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], + scorer_args: Union[ScliteScorerArgs, Dict], + optimize_parameters: Union[OptimizeJobArgs, Dict], + epochs: Optional[List[int]] = None, + scorer_hyp_param_name: str = "hyp", + optimize_pron_lm_scales: bool = False, + forward_output_layer: str = "output", + tf_fwd_input_name: str = "tf-fwd-input", +): + """ + tunes the beam pruning limit + + :return: + """ + recog_params = DevRecognitionParameters() + + decoder.recognition() + + +def tune_decoding( + name: str, + *, + rasr_binary_path: tk.Path, + acoustic_model_config: AmRasrConfig, + lexicon_config: LexiconRasrConfig, + returnn_config: Union[returnn.ReturnnConfig, tk.Path], + checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], + lm_configs: Dict[str, LmConfig], + prior_paths: Dict[str, PriorPath], + search_job_args: Union[SearchJobArgs, Dict], + lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], + scorer_args: Union[ScliteScorerArgs, Dict], + optimize_parameters: Union[OptimizeJobArgs, Dict], + rasr_arch: str = "linux-x86_64-standard", + compress: bool = False, + append: bool = False, + unbuffered: bool = False, + compress_after_run: bool = True, + search_job_class: Type[tk.Job] = recog.AdvancedTreeSearchJob, + scorer_job_class: Type[tk.Job] = recog.ScliteJob, + alias_output_prefix: str = "", + returnn_root: Optional[tk.Path] = None, + returnn_python_home: Optional[tk.Path] = None, + returnn_python_exe: Optional[tk.Path] = None, + blas_lib: Optional[tk.Path] = None, + search_numpy_blas: bool = True, + required_native_ops: Optional[List[str]] = None, + extra_configs: Optional[Dict[str, rasr.RasrConfig]] = None, + crp_name: str = "base", + epochs: 
Optional[List[int]] = None, + scorer_hyp_param_name: str = "hyp", + optimize_pron_lm_scales: bool = False, + forward_output_layer: str = "output", + tf_fwd_input_name: str = "tf-fwd-input", +): + """ + 1. TDPs, scales: prior, and TDP [beam-pruning = 14.0, altas = 12.0] + a. TDP: {0.1, 0.5, 1.0} + b. Prior: {0.3, 0.5, 0.7} + c. Silence and non-word phon: {0.0, 4.0, 10.0} + 2. LM scale optimization + a. no altas + b. beam-pruning: 14.0, 15.0 + 3. + a. beam-pruning: 14.0, 15.0 + b. altas: 2.0, 4.0, 6.0, 8.0 + 4. beam pruning-limit: 15k, 10k, 7.5k, 6k, 5k, 4k + + :return: + """ + decoder = HybridDecoder( + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + compress=compress, + append=append, + unbuffered=unbuffered, + compress_after_run=compress_after_run, + search_job_class=search_job_class, + scorer_job_class=scorer_job_class, + alias_output_prefix=alias_output_prefix, + returnn_root=returnn_root, + returnn_python_home=returnn_python_home, + returnn_python_exe=returnn_python_exe, + blas_lib=blas_lib, + search_numpy_blas=search_numpy_blas, + required_native_ops=required_native_ops, + ) + decoder.init_decoder( + acoustic_model_config=acoustic_model_config, + lexicon_config=lexicon_config, + extra_configs=extra_configs, + crp_name=crp_name, + ) diff --git a/common/setups/rasr/hybrid_system.py b/common/setups/rasr/hybrid_system.py index 8ad96549f..b6ba82f55 100644 --- a/common/setups/rasr/hybrid_system.py +++ b/common/setups/rasr/hybrid_system.py @@ -1,4 +1,4 @@ -__all__ = ["HybridArgs", "HybridSystem"] +__all__ = ["HybridSystem"] import copy import itertools @@ -21,17 +21,21 @@ add_tf_flow_to_base_flow, ) from i6_core.util import MultiPath, MultiOutputPath +from i6_core.mm import CreateDummyMixturesJob +from i6_core.returnn import ReturnnComputePriorJobV2 -from .nn_system import NnSystem +from .hybrid_decoder import HybridDecoder +from .nn_system import NnSystem, returnn_training from .util import ( RasrInitArgs, ReturnnRasrDataInput, - OggZipHdfDataInput, 
HybridArgs, NnRecogArgs, RasrSteps, NnForcedAlignArgs, + ReturnnTrainingJobArgs, + AllowedReturnnTrainingDataInput, ) # -------------------- Init -------------------- @@ -90,9 +94,15 @@ def __init__( self.cv_corpora = [] self.devtrain_corpora = [] - self.train_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] - self.cv_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] - self.devtrain_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] + self.train_input_data = ( + None + ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] + self.cv_input_data = ( + None + ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] + self.devtrain_input_data = ( + None + ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] self.dev_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] self.test_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] @@ -128,9 +138,9 @@ def _add_output_alias_for_train_job( def init_system( self, rasr_init_args: RasrInitArgs, - train_data: Dict[str, Union[ReturnnRasrDataInput, OggZipHdfDataInput]], - cv_data: Dict[str, Union[ReturnnRasrDataInput, OggZipHdfDataInput]], - devtrain_data: Optional[Dict[str, Union[ReturnnRasrDataInput, OggZipHdfDataInput]]] = None, + train_data: Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]], + cv_data: Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]], + devtrain_data: Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] = None, dev_data: Optional[Dict[str, ReturnnRasrDataInput]] = None, test_data: Optional[Dict[str, ReturnnRasrDataInput]] = None, train_cv_pairing: Optional[List[Tuple[str, ...]]] = None, # List[Tuple[trn_c, cv_c, name, dvtr_c]] @@ -211,27 +221,28 @@ def generate_lattices(self): def returnn_training( self, - name, - returnn_config, - nn_train_args, + 
name: str, + returnn_config: returnn.ReturnnConfig, + nn_train_args: Union[Dict, ReturnnTrainingJobArgs], train_corpus_key, cv_corpus_key, devtrain_corpus_key=None, - ): - assert isinstance(returnn_config, returnn.ReturnnConfig) - - returnn_config.config["train"] = self.train_input_data[train_corpus_key].get_data_dict() - returnn_config.config["dev"] = self.cv_input_data[cv_corpus_key].get_data_dict() - if devtrain_corpus_key is not None: - returnn_config.config["eval_datasets"] = { - "devtrain": self.devtrain_input_data[devtrain_corpus_key].get_data_dict() - } + ) -> returnn.ReturnnTrainingJob: + if nn_train_args.returnn_root is None: + nn_train_args.returnn_root = self.returnn_root + if nn_train_args.returnn_python_exe is None: + nn_train_args.returnn_python_exe = self.returnn_python_exe - train_job = returnn.ReturnnTrainingJob( + train_job = returnn_training( + name=name, returnn_config=returnn_config, - returnn_root=self.returnn_root, - returnn_python_exe=self.returnn_python_exe, - **nn_train_args, + training_args=nn_train_args, + train_data=self.train_input_data[train_corpus_key], + cv_data=self.cv_input_data[cv_corpus_key], + additional_data={"devtrain": self.devtrain_input_data[devtrain_corpus_key]} + if devtrain_corpus_key is not None + else None, + register_output=False, ) self._add_output_alias_for_train_job( train_job=train_job, @@ -346,7 +357,7 @@ def nn_recognition( name: str, returnn_config: returnn.ReturnnConfig, checkpoints: Dict[int, returnn.Checkpoint], - acoustic_mixture_path: tk.Path, # TODO maybe Optional if prior file provided -> automatically construct dummy file + train_job: Union[returnn.ReturnnTrainingJob, returnn.ReturnnRasrTrainingJob], prior_scales: List[float], pronunciation_scales: List[float], lm_scales: List[float], @@ -362,6 +373,7 @@ def nn_recognition( use_epoch_for_compile=False, forward_output_layer="output", native_ops: Optional[List[str]] = None, + acoustic_mixture_path: Optional[tk.Path] = None, **kwargs, ): with 
tk.block(f"{name}_recognition"): @@ -384,15 +396,31 @@ def nn_recognition( for pron, lm, prior, epoch in itertools.product(pronunciation_scales, lm_scales, prior_scales, epochs): assert epoch in checkpoints.keys() - assert acoustic_mixture_path is not None - - if use_epoch_for_compile: - tf_graph = self.nn_compile_graph(name, returnn_config, epoch=epoch) + acoustic_mixture_path = CreateDummyMixturesJob( + num_mixtures=returnn_config.config["extern_data"]["classes"]["dim"], + num_features=returnn_config.config["extern_data"]["data"]["dim"], + ).out_mixtures + lmgc_scorer = rasr.GMMFeatureScorer(acoustic_mixture_path) + prior_job = ReturnnComputePriorJobV2( + model_checkpoint=checkpoints[epoch], + returnn_config=train_job.returnn_config, + returnn_python_exe=train_job.returnn_python_exe, + returnn_root=train_job.returnn_root, + log_verbosity=train_job.returnn_config.post_config["log_verbosity"], + ) + prior_job.add_alias("extract_nn_prior/" + name) + prior_file = prior_job.out_prior_xml_file + assert prior_file is not None scorer = rasr.PrecomputedHybridFeatureScorer( prior_mixtures=acoustic_mixture_path, priori_scale=prior, + prior_file=prior_file, ) + assert acoustic_mixture_path is not None + + if use_epoch_for_compile: + tf_graph = self.nn_compile_graph(name, returnn_config, epoch=epoch) tf_flow = make_precomputed_hybrid_tf_feature_flow( tf_checkpoint=checkpoints[epoch], @@ -419,6 +447,8 @@ def nn_recognition( parallelize_conversion=parallelize_conversion, rtf=rtf, mem=mem, + lmgc_alias=f"lmgc/{name}/{recognition_corpus_key}-{recog_name}", + lmgc_scorer=lmgc_scorer, **kwargs, ) @@ -429,15 +459,22 @@ def nn_recog( returnn_config: Path, checkpoints: Dict[int, returnn.Checkpoint], step_args: HybridArgs, + train_job: Union[returnn.ReturnnTrainingJob, returnn.ReturnnRasrTrainingJob], ): for recog_name, recog_args in step_args.recognition_args.items(): + recog_args = copy.deepcopy(recog_args) + whitelist = recog_args.pop("training_whitelist", None) + if whitelist: + if 
train_name not in whitelist:
+                    continue
             for dev_c in self.dev_corpora:
                 self.nn_recognition(
                     name=f"{train_corpus_key}-{train_name}-{recog_name}",
                     returnn_config=returnn_config,
                     checkpoints=checkpoints,
-                    acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures,
+                    train_job=train_job,
                     recognition_corpus_key=dev_c,
+                    acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures,
                     **recog_args,
                 )
 
@@ -451,8 +488,9 @@ def nn_recog(
                     name=f"{train_name}-{recog_name}",
                     returnn_config=returnn_config,
                     checkpoints=checkpoints,
-                    acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures,
+                    train_job=train_job,
                     recognition_corpus_key=tst_c,
+                    acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures,
                     **r_args,
                 )
 
@@ -471,8 +509,12 @@ def nn_compile_graph(
         e.g. `def get_network(epoch=...)` in the config
         :return: the TF graph
         """
+        # TODO remove, temporary hack; deepcopy so the caller's config is not mutated
+        cfg = copy.deepcopy(returnn_config)
+        if "pretrain" in cfg.config.keys():
+            del cfg.config["pretrain"]
         graph_compile_job = returnn.CompileTFGraphJob(
-            returnn_config,
+            cfg,
             epoch=epoch,
             returnn_root=self.returnn_root,
             returnn_python_exe=self.returnn_python_exe,
@@ -509,7 +551,7 @@ def run_nn_step(self, step_name: str, step_args: HybridArgs):
                         train_corpus_key=trn_c,
                         cv_corpus_key=cv_c,
                     )
-                else:
+                elif isinstance(self.train_input_data[trn_c], AllowedReturnnTrainingDataInput):
                     returnn_train_job = self.returnn_training(
                         name=name,
                         returnn_config=step_args.returnn_training_configs[name],
@@ -518,6 +560,8 @@ def run_nn_step(self, step_name: str, step_args: HybridArgs):
                         cv_corpus_key=cv_c,
                         devtrain_corpus_key=dvtr_c,
                     )
+                else:
+                    raise NotImplementedError
 
                 returnn_recog_config = step_args.returnn_recognition_configs.get(
                     name, step_args.returnn_training_configs[name]
@@ -529,6 +573,7 @@ def run_nn_step(self, step_name: str, step_args: HybridArgs):
                     returnn_config=returnn_recog_config,
                     checkpoints=returnn_train_job.out_checkpoints,
                     step_args=step_args,
+                    
train_job=returnn_train_job,
                 )
 
     def run_nn_recog_step(self, step_args: NnRecogArgs):
diff --git a/common/setups/rasr/nn_system.py b/common/setups/rasr/nn_system.py
index 13c3d239d..327a1b33d 100644
--- a/common/setups/rasr/nn_system.py
+++ b/common/setups/rasr/nn_system.py
@@ -1,36 +1,22 @@
-__all__ = ["NnSystem"]
+__all__ = ["NnSystem", "returnn_training"]
 
 import copy
-import itertools
-import sys
 from dataclasses import asdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Union
 
 # -------------------- Sisyphus --------------------
 
 import sisyphus.toolkit as tk
 import sisyphus.global_settings as gs
 
-from sisyphus.delayed_ops import DelayedFormat
-
 # -------------------- Recipes --------------------
 
-import i6_core.features as features
-import i6_core.rasr as rasr
 import i6_core.returnn as returnn
-
-from i6_core.util import MultiPath, MultiOutputPath
+from i6_core.tools import CloneGitRepositoryJob
 
 from .rasr_system import RasrSystem
-from .util import (
-    RasrInitArgs,
-    ReturnnRasrDataInput,
-    OggZipHdfDataInput,
-    HybridArgs,
-    NnRecogArgs,
-    RasrSteps,
-)
+from .util import ReturnnTrainingJobArgs, AllowedReturnnTrainingDataInput
 
 # -------------------- Init --------------------
 
@@ -79,7 +65,7 @@ def compile_native_op(self, op_name: str):
             returnn_python_exe=self.returnn_python_exe,
             blas_lib=self.blas_lib,
         )
         native_op_job.add_alias("native_ops/compile_native_%s" % op_name)
         self.native_ops[op_name] = native_op_job.out_op
 
     def get_native_ops(self, op_names: Optional[List[str]]) -> Optional[List[tk.Path]]:
@@ -95,3 +81,35 @@ def get_native_ops(self, op_names: Optional[List[str]]) -> Optional[List[tk.Path
         if op_name not in self.native_ops.keys():
             self.compile_native_op(op_name)
         return [self.native_ops[op_name] for op_name in op_names]
+
+
+def returnn_training(
+    name: str,
+    returnn_config: returnn.ReturnnConfig,
+    training_args: Union[Dict, 
ReturnnTrainingJobArgs],
+    train_data: AllowedReturnnTrainingDataInput,
+    *,
+    cv_data: Optional[AllowedReturnnTrainingDataInput] = None,
+    additional_data: Optional[Dict[str, AllowedReturnnTrainingDataInput]] = None,
+    register_output: bool = True,
+) -> returnn.ReturnnTrainingJob:
+    assert isinstance(returnn_config, returnn.ReturnnConfig)
+
+    config = copy.deepcopy(returnn_config)
+
+    config.config["train"] = train_data if isinstance(train_data, Dict) else train_data.get_data_dict()
+    if cv_data is not None:
+        config.config["dev"] = cv_data if isinstance(cv_data, Dict) else cv_data.get_data_dict()
+    if additional_data is not None:
+        config.config["eval_datasets"] = {}
+        for eval_name, data in additional_data.items():
+            config.config["eval_datasets"][eval_name] = data if isinstance(data, Dict) else data.get_data_dict()
+    returnn_training_job = returnn.ReturnnTrainingJob(
+        returnn_config=config,
+        **asdict(training_args) if isinstance(training_args, ReturnnTrainingJobArgs) else training_args,
+    )
+    if register_output:
+        returnn_training_job.add_alias(f"nn_train/{name}")
+        tk.register_output(f"nn_train/{name}_learning_rates.png", returnn_training_job.out_plot_lr)
+
+    return returnn_training_job
diff --git a/common/setups/rasr/rasr_system.py b/common/setups/rasr/rasr_system.py
index 1dc8b41a9..b7bc057c2 100644
--- a/common/setups/rasr/rasr_system.py
+++ b/common/setups/rasr/rasr_system.py
@@ -354,8 +354,9 @@ def forced_align(
         :param name:
         :param target_corpus_key:
         :param flow:
-        :param feature_scorer:
         :param feature_scorer_corpus_key:
+        :param feature_scorer:
+        :param scorer_index:
         :param dump_alignment:
         :param kwargs:
         :return:
@@ -374,10 +375,10 @@ def forced_align(
             **kwargs,
         )
 
-        align_job = self.jobs[target_corpus_key]["alignment_%s" % name]
-        align_job.add_alias("forced_alignment/alignment_%s" % name)
+        align_job: mm.AlignmentJob = self.jobs[target_corpus_key]["alignment_%s" % name]
+        align_job.add_alias("forced_alignment/alignment_%s/%s" % (name, target_corpus_key))
tk.register_output( - "forced_alignment/alignment_%s.bundle" % name, + "forced_alignment/alignment_%s_%s.bundle" % (name, target_corpus_key), align_job.out_alignment_bundle, ) @@ -388,8 +389,8 @@ def forced_align( original_alignment=meta.select_element(self.alignments, target_corpus_key, name), ) self.jobs[target_corpus_key]["alignment_dump_%s" % name] = dump_job - dump_job.add_alias("forced_alignment/alignment_dump_%s" % name) + dump_job.add_alias("forced_alignment/alignment_dump_%s/%s" % (name, target_corpus_key)) tk.register_output( - "forced_alignment/alignment_dump_%s.bundle" % name, + "forced_alignment/alignment_dump_%s_%s.bundle" % (name, target_corpus_key), dump_job.out_alignment_bundle, ) diff --git a/common/setups/rasr/util/nn/__init__.py b/common/setups/rasr/util/nn/__init__.py new file mode 100644 index 000000000..241cef3d3 --- /dev/null +++ b/common/setups/rasr/util/nn/__init__.py @@ -0,0 +1,4 @@ +from .common import * +from .data import * +from .decode import * +from .training import * diff --git a/common/setups/rasr/util/nn/common.py b/common/setups/rasr/util/nn/common.py new file mode 100644 index 000000000..73a573c2c --- /dev/null +++ b/common/setups/rasr/util/nn/common.py @@ -0,0 +1,53 @@ +__all__ = ["HybridArgs", "NnForcedAlignArgs"] + +from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union + +import i6_core.rasr as rasr +import i6_core.returnn as returnn + +from .decode import KeyedRecogArgsType +from .training import ReturnnRasrTrainingArgs, ReturnnTrainingJobArgs + + +class HybridArgs: + def __init__( + self, + returnn_training_configs: Dict[str, returnn.ReturnnConfig], + returnn_recognition_configs: Dict[str, returnn.ReturnnConfig], + training_args: Union[Dict[str, Any], ReturnnRasrTrainingArgs, ReturnnTrainingJobArgs], + recognition_args: KeyedRecogArgsType, + test_recognition_args: Optional[KeyedRecogArgsType] = None, + ): + """ + ################################################## + :param returnn_training_configs + RETURNN 
config keyed by training corpus. + ################################################## + :param returnn_recognition_configs + If a config is not found here, the corresponding training config is used + ################################################## + :param training_args: + ################################################## + :param recognition_args: + Configuration for recognition on dev corpora. + ################################################## + :param test_recognition_args: + Additional configuration for recognition on test corpora. Merged with recognition_args. + ################################################## + """ + self.returnn_training_configs = returnn_training_configs + self.returnn_recognition_configs = returnn_recognition_configs + self.training_args = training_args + self.recognition_args = recognition_args + self.test_recognition_args = test_recognition_args + + +class NnForcedAlignArgs(TypedDict): + name: str + target_corpus_keys: List[str] + feature_scorer_corpus_key: str + scorer_model_key: Union[str, List[str], Tuple[str], rasr.FeatureScorer] + epoch: int + base_flow_key: str + tf_flow_key: str + dump_alignment: bool diff --git a/common/setups/rasr/util/nn/data.py b/common/setups/rasr/util/nn/data.py new file mode 100644 index 000000000..0ba98c43e --- /dev/null +++ b/common/setups/rasr/util/nn/data.py @@ -0,0 +1,615 @@ +__all__ = [ + "RasrDataInput", + "ReturnnRasrTrainingArgs", + "ReturnnRasrDataInput", + "AllophoneLabeling", + "OggZipRasrCacheDataInput", + "OggZipExternRasrDataInput", + "OggZipHdfDataInput", + "HdfDataInput", + "NextGenHdfDataInput", + "ReturnnRawAlignmentHdfTrainingDataInput", + "AllowedReturnnTrainingDataInput", +] + +import copy +from dataclasses import dataclass, asdict +from typing import Any, Dict, List, Optional, Tuple, Type, TypedDict, Union + +from sisyphus import tk +from sisyphus.delayed_ops import DelayedFormat, DelayedBase + +import i6_core.am as am +import i6_core.rasr as rasr +import i6_core.returnn as 
returnn + +from i6_core.returnn.hdf import BlissToPcmHDFJob, RasrAlignmentDumpHDFJob +from i6_core.util import MultiPath + +RasrCacheTypes = Union[tk.Path, str, MultiPath, rasr.FlagDependentFlowAttribute, rasr.FlowNetwork] + + +@dataclass(frozen=True) +class RasrDataInput: + features: RasrCacheTypes + + +@dataclass(frozen=True) +class ReturnnRasrTrainingArgs: + """ + Options for writing a RASR training config. See `ReturnnRasrTrainingJob`. + Most of them may be disregarded, i.e. the defaults can be left untouched. + + :param partition_epochs: if >1, split the full dataset into multiple sub-epochs + :param num_classes: number of classes + :param disregarded_classes: path to file with list of disregarded classes + :param class_label_file: path to file with class labels + :param buffer_size: buffer size for data loading + :param extra_rasr_config: extra RASR config + :param extra_rasr_post_config: extra RASR post config + :param use_python_control: whether to use python control, usually True + """ + + partition_epochs: Optional[int] = None + num_classes: Optional[int] = None + disregarded_classes: Optional[tk.Path] = None + class_label_file: Optional[tk.Path] = None + buffer_size: int = 200 * 1024 + extra_rasr_config: Optional[rasr.RasrConfig] = None + extra_rasr_post_config: Optional[rasr.RasrConfig] = None + use_python_control: bool = True + + +class ReturnnRasrDataInput: + """ + Holds the data for ReturnnRasrTrainingJob. 
+ """ + + def __init__( + self, + name: str, + crp: Optional[rasr.CommonRasrParameters] = None, + alignments: Optional[RasrCacheTypes] = None, + feature_flow: Optional[Union[rasr.FlowNetwork, Dict[str, rasr.FlowNetwork]]] = None, + features: Optional[Union[RasrCacheTypes, Dict[str, RasrCacheTypes]]] = None, + acoustic_mixtures: Optional[Union[tk.Path, str]] = None, + feature_scorers: Optional[Dict[str, Type[rasr.FeatureScorer]]] = None, + shuffle_data: bool = True, + shuffling_parameters: Optional[Dict[str, Any]] = None, + stm: Optional[tk.Path] = None, + glm: Optional[tk.Path] = None, + returnn_rasr_training_args: Optional[ReturnnRasrTrainingArgs] = None, + **kwargs, + ): + """ + + :param name: name of the data + :param crp: common RASR parameters + :param alignments: RASR cache of an alignment + :param feature_flow: acoustic feature flow network or dict of feature flow networks + :param features: RASR cache of acoustic features + :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) + :param feature_scorers: RASR feature scorers + :param shuffle_data: shuffle training segments into bins of similar length. The bins are sorted by length. 
+ :param shuffling_parameters: Dict of additional parameters to set for shuffling, + currently only 'segment_order_sort_by_time_length_chunk_size' is supported + :param stm: stm file for scoring + :param glm: glm file for scoring + :param returnn_rasr_training_args: arguments for RETURNN training with RASR + """ + self.name = name + self.crp = crp + self.alignments = alignments + self.feature_flow = feature_flow + self.features = features + self.acoustic_mixtures = acoustic_mixtures + self.feature_scorers = feature_scorers + self.shuffle_data = shuffle_data + self.shuffling_parameters = shuffling_parameters + if shuffle_data and self.shuffling_parameters is None: + # apply the legacy defaults if shuffling_parameters is not set + self.shuffling_parameters = {"segment_order_sort_by_time_length_chunk_size": 384} + self.stm = stm + self.glm = glm + self.returnn_rasr_training_args = returnn_rasr_training_args or ReturnnRasrTrainingArgs() + + def get_training_feature_flow_file(self) -> tk.Path: + """Returns the feature flow file for the RETURNN training with RASR.""" + feature_flow = returnn.ReturnnRasrTrainingJob.create_flow(self.feature_flow, self.alignments) + write_feature_flow = rasr.WriteFlowNetworkJob(feature_flow) + return write_feature_flow.out_flow_file + + def get_training_rasr_config_file(self) -> tk.Path: + """Returns the RASR config file for the RETURNN training with RASR.""" + config, post_config = returnn.ReturnnRasrTrainingJob.create_config( + self.crp, self.alignments, **asdict(self.returnn_rasr_training_args) + ) + config.neural_network_trainer.feature_extraction.file = self.get_training_feature_flow_file() + write_rasr_config = rasr.WriteRasrConfigJob(config, post_config) + return write_rasr_config.out_config + + def get_data_dict(self) -> Dict[str, Union[str, DelayedFormat, tk.Path]]: + """Returns the data dict for the ExternSprintDataset to be used in a training ReturnnConfig.""" + config_file = self.get_training_rasr_config_file() + config_str = 
DelayedFormat("--config={} --*.LOGFILE=nn-trainer.{}.log --*.TASK=1", config_file, self.name) + dataset = { + "class": "ExternSprintDataset", + "sprintTrainerExecPath": rasr.RasrCommand.select_exe(self.crp.nn_trainer_exe, "nn-trainer"), + "sprintConfigStr": config_str, + } + partition_epochs = self.returnn_rasr_training_args.partition_epochs + if partition_epochs is not None: + dataset["partitionEpoch"] = partition_epochs + return dataset + + def build_crp( + self, + am_args, + corpus_object, + concurrent, + segment_path, + lexicon_args, + cart_tree_path=None, + allophone_file=None, + lm_args=None, + ): + """ + constructs and returns a CommonRasrParameters from the given settings and files + """ + crp = rasr.CommonRasrParameters() + rasr.crp_add_default_output(crp) + crp.acoustic_model_config = am.acoustic_model_config(**am_args) + rasr.crp_set_corpus(crp, corpus_object) + crp.concurrent = concurrent + crp.segment_path = segment_path + + crp.lexicon_config = rasr.RasrConfig() + crp.lexicon_config.file = lexicon_args["filename"] + crp.lexicon_config.normalize_pronunciation = lexicon_args["normalize_pronunciation"] + + if "add_from_lexicon" in lexicon_args: + crp.acoustic_model_config.allophones.add_from_lexicon = lexicon_args["add_from_lexicon"] + if "add_all" in lexicon_args: + crp.acoustic_model_config.allophones.add_all = lexicon_args["add_all"] + + if cart_tree_path is not None: + crp.acoustic_model_config.state_tying.type = "cart" + crp.acoustic_model_config.state_tying.file = cart_tree_path + + if lm_args is not None: + crp.language_model_config = rasr.RasrConfig() + crp.language_model_config.type = lm_args["type"] + crp.language_model_config.file = lm_args["filename"] + crp.language_model_config.scale = lm_args["scale"] + + if allophone_file is not None: + crp.acoustic_model_config.allophones.add_from_file = allophone_file + + self.crp = crp + + def update_crp_with_shuffle_parameters(self): + if self.shuffle_data: + 
self.crp.corpus_config.segment_order_shuffle = True + if self.shuffling_parameters is not None: + if "segment_order_sort_by_time_length_chunk_size" in self.shuffling_parameters: + self.crp.corpus_config.segment_order_sort_by_time_length = True + self.crp.corpus_config.segment_order_sort_by_time_length_chunk_size = self.shuffling_parameters[ + "segment_order_sort_by_time_length_chunk_size" + ] + + def update_crp_with( + self, + *, + corpus_file: Optional[tk.Path] = None, + audio_dir: Optional[Union[str, tk.Path]] = None, + corpus_duration: Optional[int] = None, + segment_path: Optional[Union[str, tk.Path]] = None, + concurrent: Optional[int] = None, + shuffle_data: Optional[bool] = None, + shuffling_parameters: Optional[Dict[str, Any]] = None, + ): + if corpus_file is not None: + self.crp.corpus_config.file = corpus_file + if audio_dir is not None: + self.crp.corpus_config.audio_dir = audio_dir + if corpus_duration is not None: + self.crp.corpus_duration = corpus_duration + if segment_path is not None: + self.crp.segment_path = segment_path + if concurrent is not None: + self.crp.concurrent = concurrent + if shuffle_data is not None: + self.shuffle_data = shuffle_data + if shuffling_parameters is not None: + assert self.shuffle_data, "You need to set shuffle_data to true when using shuffling_parameters" + self.shuffling_parameters = shuffling_parameters + self.update_crp_with_shuffle_parameters() + + def get_crp(self, **kwargs) -> rasr.CommonRasrParameters: + """ + constructs and returns a CommonRasrParameters from the given settings and files + :rtype CommonRasrParameters: + """ + if self.crp is None: + self.build_crp(**kwargs) + + if self.shuffle_data: + self.update_crp_with_shuffle_parameters() + + return self.crp + + +@dataclass() +class AllophoneLabeling: + silence_phone: str + allophone_file: Union[tk.Path, DelayedBase] + phoneme_file: Optional[Union[tk.Path, DelayedBase]] = None + state_tying_file: Optional[Union[tk.Path, DelayedBase]] = None + + +class 
OggZipRasrCacheDataInput: + def __init__( + self, + oggzip_files: List[tk.Path], + audio: Dict, + alignment_file: tk.Path, + allophone_labeling: AllophoneLabeling, + partition_epoch: int = 1, + seq_ordering: str = "laplace:.1000", + *, + meta_args: Optional[Dict[str, Any]] = None, + ogg_args: Optional[Dict[str, Any]] = None, + rasr_args: Optional[Dict[str, Any]] = None, + acoustic_mixtures: Optional[tk.Path] = None, + ): + """ + :param oggzip_files: zipped ogg files which contain the audio + :param audio: e.g. {"features": "raw", "sample_rate": 16000} for raw waveform input with a sample rate of 16 kHz + :param alignment_file: hdf files which contain dumped RASR alignments + :param allophone_labeling: labels for the RASR alignments + :param partition_epoch: if >1, split the full dataset into multiple sub-epochs + :param seq_ordering: sort the sequences in the dataset, e.g. "random" or "laplace:.100" + :param meta_args: parameters for the `MetaDataset` + :param ogg_args: parameters for the `OggZipDataset` + :param rasr_args: parameters for the `SprintCacheDataset` + :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) + """ + self.oggzip_files = oggzip_files + self.audio = audio + self.alignment_file = alignment_file + self.allophone_labeling = allophone_labeling + self.partition_epoch = partition_epoch + self.seq_ordering = seq_ordering + self.meta_args = meta_args + self.ogg_args = ogg_args + self.rasr_args = rasr_args + self.acoustic_mixtures = acoustic_mixtures + + def get_data_dict(self): + return { + "class": "MetaDataset", + "data_map": {"classes": ("rasr", "classes"), "data": ("ogg", "data")}, + "datasets": { + "rasr": { + "class": "SprintCacheDataset", + "data": { + "classes": { + "filename": self.alignment_file, + "data_type": "align", + "allophone_labeling": asdict(self.allophone_labeling), + }, + }, + "use_cache_manager": True, + **(self.rasr_args or {}), + }, + "ogg": { + "class": "OggZipDataset", 
+ "audio": self.audio, + "path": self.oggzip_files, + "use_cache_manager": True, + **(self.ogg_args or {}), + }, + }, + "partition_epoch": self.partition_epoch, + "seq_ordering": self.seq_ordering, + **(self.meta_args or {}), + } + + +class OggZipExternRasrDataInput: + def __init__( + self, + oggzip_files: List[tk.Path], + audio: Dict, + alignment_file: tk.Path, + rasr_exe: tk.Path, + rasr_config_str: str, + partition_epoch: int = 1, + seq_ordering: str = "laplace:.1000", + reduce_target_factor: int = 1, + *, + meta_args: Optional[Dict[str, Any]] = None, + ogg_args: Optional[Dict[str, Any]] = None, + rasr_args: Optional[Dict[str, Any]] = None, + acoustic_mixtures: Optional[tk.Path] = None, + ): + """ + :param oggzip_files: zipped ogg files which contain the audio + :param audio: e.g. {"features": "raw", "sample_rate": 16000} for raw waveform input with a sample rate of 16 kHz + :param alignment_file: hdf files which contain dumped RASR alignments + :param rasr_exe: path to RASR NN trainer executable + :param rasr_config_str: str of rasr parameters + :param partition_epoch: if >1, split the full dataset into multiple sub-epochs + :param seq_ordering: sort the sequences in the dataset, e.g. 
"random" or "laplace:.100"
+        :param reduce_target_factor: reduce the alignment by a factor
+        :param meta_args: parameters for the `MetaDataset`
+        :param ogg_args: parameters for the `OggZipDataset`
+        :param rasr_args: parameters for the `ExternSprintDataset`
+        :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training)
+        """
+        self.oggzip_files = oggzip_files
+        self.audio = audio
+        self.alignment_file = alignment_file
+        self.rasr_exe = rasr_exe
+        self.rasr_config_str = rasr_config_str
+        self.partition_epoch = partition_epoch
+        self.seq_ordering = seq_ordering
+        self.reduce_target_factor = reduce_target_factor
+        self.meta_args = meta_args
+        self.ogg_args = ogg_args
+        self.rasr_args = rasr_args
+        self.acoustic_mixtures = acoustic_mixtures
+
+    def get_data_dict(self):
+        return {
+            "class": "MetaDataset",
+            "data_map": {"classes": ("rasr", "classes"), "data": ("ogg", "data")},
+            "datasets": {
+                "rasr": {
+                    "class": "ExternSprintDataset",
+                    "sprintConfigStr": self.rasr_config_str,
+                    "sprintTrainerExecPath": self.rasr_exe,
+                    "partition_epoch": self.partition_epoch,
+                    "suppress_load_seqs_print": True,
+                    "reduce_target_factor": self.reduce_target_factor,
+                    **(self.rasr_args or {}),
+                },
+                "ogg": {
+                    "class": "OggZipDataset",
+                    "audio": self.audio,
+                    "path": self.oggzip_files,
+                    "use_cache_manager": True,
+                    **(self.ogg_args or {}),
+                },
+            },
+            "seq_order_control_dataset": "rasr",
+            **(self.meta_args or {}),
+        }
+
+
+class OggZipHdfDataInput:
+    def __init__(
+        self,
+        oggzip_files: List[tk.Path],
+        alignments: List[tk.Path],
+        audio: Dict,
+        partition_epoch: int = 1,
+        seq_ordering: str = "laplace:.1000",
+        meta_args: Optional[Dict[str, Any]] = None,
+        ogg_args: Optional[Dict[str, Any]] = None,
+        hdf_args: Optional[Dict[str, Any]] = None,
+        acoustic_mixtures: Optional[tk.Path] = None,
+    ):
+        """
+        :param oggzip_files: zipped ogg files which contain the audio
+        :param alignments: hdf files which contain dumped RASR alignments
+
:param audio: e.g. {"features": "raw", "sample_rate": 16000} for raw waveform input with a sample rate of 16 kHz
+        :param partition_epoch: if >1, split the full dataset into multiple sub-epochs
+        :param seq_ordering: sort the sequences in the dataset, e.g. "random" or "laplace:.100"
+        :param meta_args: parameters for the `MetaDataset`
+        :param ogg_args: parameters for the `OggZipDataset`
+        :param hdf_args: parameters for the `HdfDataset`
+        :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training)
+        """
+        self.oggzip_files = oggzip_files
+        self.alignments = alignments
+        self.audio = audio
+        self.partition_epoch = partition_epoch
+        self.seq_ordering = seq_ordering
+        self.meta_args = meta_args
+        self.ogg_args = ogg_args
+        self.hdf_args = hdf_args
+        self.acoustic_mixtures = acoustic_mixtures
+
+    def get_data_dict(self):
+        return {
+            "class": "MetaDataset",
+            "data_map": {"classes": ("hdf", "classes"), "data": ("ogg", "data")},
+            "datasets": {
+                "hdf": {
+                    "class": "HDFDataset",
+                    "files": self.alignments,
+                    "use_cache_manager": True,
+                    **(self.hdf_args or {}),
+                },
+                "ogg": {
+                    "class": "OggZipDataset",
+                    "audio": self.audio,
+                    "partition_epoch": self.partition_epoch,
+                    "path": self.oggzip_files,
+                    "seq_ordering": self.seq_ordering,
+                    "use_cache_manager": True,
+                    **(self.ogg_args or {}),
+                },
+            },
+            "seq_order_control_dataset": "ogg",
+            **(self.meta_args or {}),
+        }
+
+
+class HdfDataInput:
+    def __init__(
+        self,
+        features: Union[tk.Path, List[tk.Path]],
+        alignments: Union[tk.Path, List[tk.Path]],
+        partition_epoch: int = 1,
+        seq_ordering: str = "laplace:.1000",
+        *,
+        meta_args: Optional[Dict[str, Any]] = None,
+        align_args: Optional[Dict[str, Any]] = None,
+        feat_args: Optional[Dict[str, Any]] = None,
+        acoustic_mixtures: Optional[tk.Path] = None,
+        segment_file: Optional[tk.Path] = None,
+    ):
+        """
+        :param features: hdf files which contain raw waveform or features, like GT or MFCC
+        :param alignments: hdf
files which contain dumped RASR alignments + :param partition_epoch: if >1, split the full dataset into multiple sub-epochs + :param seq_ordering: sort the sequences in the dataset, e.g. "random" or "laplace:.100" + :param meta_args: parameters for the `MetaDataset` + :param align_args: parameters for the `HDFDataset` for the alignments + :param feat_args: parameters for the `HDFDataset` for the features + :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) + :param segment_file: path to the segment file which defines which segments from corpus to use + """ + self.features = features + self.alignments = alignments + self.partition_epoch = partition_epoch + self.seq_ordering = seq_ordering + self.meta_args = meta_args + self.align_args = align_args + self.feat_args = feat_args + self.acoustic_mixtures = acoustic_mixtures + self.segment_file = segment_file + + from returnn_common.datasets import MetaDataset, HDFDataset + + self.align_dataset = HDFDataset( + files=self.alignments, + seq_ordering=self.seq_ordering, + partition_epoch=self.partition_epoch, + segment_file=self.segment_file, + **(self.align_args or {}), + ) + self.feature_dataset = HDFDataset(files=self.features, **(self.feat_args or {})) + self.meta_dataset = MetaDataset( + data_map={"classes": ("align", "data"), "data": ("feat", "data")}, + datasets={"align": self.align_dataset, "feat": self.feature_dataset}, + seq_order_control_dataset="align", + additional_options={**(self.meta_args or {})}, + ) + + def get_data_dict(self): + return self.meta_dataset.as_returnn_opts() + + def get_dataset_object(self): + return self.meta_dataset + + +class NextGenHdfDataInput: + def __init__( + self, + streams: Dict[str, List[tk.Path]], + data_map: Dict[str, Tuple[str, str]], + partition_epoch: int = 1, + seq_ordering: str = "laplace:.1000", + *, + meta_args: Optional[Dict[str, Any]] = None, + stream_args: Optional[Dict[str, Dict[str, Any]]] = None, + 
acoustic_mixtures: Optional[tk.Path] = None, + ): + """ + :param streams: `NextGenHDFDataset` for different data streams + :param data_map: a data map specifying the connection between the data stored in the HDF and RETURNN. + Key is the RETURNN name, first value is the name in the `datasets` from `MetaDataset`, + second value the name in the HDF. + :param partition_epoch: if >1, split the full dataset into multiple sub-epochs + :param seq_ordering: sort the sequences in the dataset, e.g. "random" or "laplace:.100" + :param meta_args: parameters for the `MetaDataset` + :param stream_args: parameters for the different `NextGenHDFDataset` + :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) + """ + self.streams = streams + self.data_map = data_map + self.partition_epoch = partition_epoch + self.seq_ordering = seq_ordering + self.meta_args = meta_args + self.stream_args = stream_args + self.acoustic_mixtures = acoustic_mixtures + + assert sorted(list(streams.keys())) == sorted([x[0] for x in data_map.values()]) + + def get_data_dict(self): + d = { + "class": "MetaDataset", + "data_map": {}, + "datasets": {}, + "partition_epoch": self.partition_epoch, + "seq_ordering": self.seq_ordering, + **(self.meta_args or {}), + } + for k, v in self.data_map.items(): + d["data_map"][k] = v + + for k, v in self.streams.items(): + d["datasets"][k] = { + "class": "NextGenHDFDataset", + "files": v, + "use_cache_manager": True, + } + if self.stream_args is not None: + d["datasets"][k].update(**self.stream_args[k] or {}) + + return d + + +@dataclass() +class ReturnnRawAlignmentHdfTrainingDataInput: + bliss_corpus: tk.Path + alignment_caches: List[tk.Path] + state_tying_file: tk.Path + allophone_file: tk.Path + returnn_root: tk.Path + seq_ordering: str + + def get_data_dict(self): + raw_hdf_path = BlissToPcmHDFJob( + bliss_corpus=self.bliss_corpus, + returnn_root=self.returnn_root, + ).out_hdf + alignment_hdf_path = 
RasrAlignmentDumpHDFJob( + alignment_caches=self.alignment_caches, + allophone_file=self.allophone_file, + state_tying_file=self.state_tying_file, + returnn_root=self.returnn_root, + ).out_hdf_files + + data = { + "class": "MetaDataset", + "data_map": {"classes": ("alignments", "data"), "data": ("features", "data")}, + "datasets": { + "alignments": { + "class": "HDFDataset", + "files": alignment_hdf_path, + "seq_ordering": self.seq_ordering, + }, + "features": { + "class": "HDFDataset", + "files": [raw_hdf_path], + }, + }, + "seq_order_control_dataset": "alignments", + } + + return data + + +AllowedReturnnTrainingDataInput = Union[ + Dict, + OggZipRasrCacheDataInput, + OggZipExternRasrDataInput, + OggZipHdfDataInput, + NextGenHdfDataInput, + ReturnnRawAlignmentHdfTrainingDataInput, + HdfDataInput, +] diff --git a/common/setups/rasr/util/nn/decode.py b/common/setups/rasr/util/nn/decode.py new file mode 100644 index 000000000..e70c5256c --- /dev/null +++ b/common/setups/rasr/util/nn/decode.py @@ -0,0 +1,88 @@ +__all__ = ["SearchParameters", "LookaheadOptions", "LatticeToCtmArgs", "NnRecogArgs", "KeyedRecogArgsType"] + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, TypedDict, Union + +from sisyphus import tk + +import i6_core.returnn as returnn + +# Attribute names are invalid identifiers, therefore use old syntax +SearchParameters = TypedDict( + "SearchParameters", + { + "beam-pruning": float, + "beam-pruning-limit": float, + "lm-state-pruning": Optional[float], + "word-end-pruning": float, + "word-end-pruning-limit": float, + }, +) + + +class LookaheadOptions(TypedDict): + cache_high: Optional[int] + cache_low: Optional[int] + history_limit: Optional[int] + laziness: Optional[int] + minimum_representation: Optional[int] + tree_cutoff: Optional[int] + + +class LatticeToCtmArgs(TypedDict): + best_path_algo: Optional[str] + encoding: Optional[str] + extra_config: Optional[Any] + extra_post_config: Optional[Any] + fill_empty_segments: 
Optional[bool] + + +class NnRecogArgs(TypedDict): + acoustic_mixture_path: Optional[tk.Path] + checkpoints: Optional[Dict[int, returnn.Checkpoint]] + create_lattice: Optional[bool] + epochs: Optional[List[int]] + eval_best_in_lattice: Optional[bool] + eval_single_best: Optional[bool] + feature_flow_key: str + lattice_to_ctm_kwargs: Optional[LatticeToCtmArgs] + lm_lookahead: bool + lm_scales: List[float] + lookahead_options: Optional[LookaheadOptions] + mem: int + name: str + optimize_am_lm_scale: bool + parallelize_conversion: Optional[bool] + prior_scales: List[float] + pronunciation_scales: List[float] + returnn_config: Optional[returnn.ReturnnConfig] + rtf: int + search_parameters: Optional[SearchParameters] + use_gpu: Optional[bool] + + +@dataclass() +class NnRecogArgs: + name: str + returnn_config: returnn.ReturnnConfig + checkpoints: Dict[int, returnn.Checkpoint] + acoustic_mixture_path: tk.Path + prior_scales: List[float] + pronunciation_scales: List[float] + lm_scales: List[float] + optimize_am_lm_scale: bool + feature_flow_key: str + search_parameters: Dict + lm_lookahead: bool + lattice_to_ctm_kwargs: Dict + parallelize_conversion: bool + rtf: int + mem: int + lookahead_options: Optional[Dict] = None + epochs: Optional[List[int]] = None + native_ops: Optional[List[str]] = None + + +# TODO merge the two NnRecogArgs + +KeyedRecogArgsType = Dict[str, Union[Dict[str, Any], NnRecogArgs]] diff --git a/common/setups/rasr/util/nn/training.py b/common/setups/rasr/util/nn/training.py new file mode 100644 index 000000000..3c3d1e606 --- /dev/null +++ b/common/setups/rasr/util/nn/training.py @@ -0,0 +1,49 @@ +__all__ = ["ReturnnTrainingJobArgs", "EpochPartitioning", "ReturnnRasrTrainingArgs"] + +from dataclasses import dataclass, field +from typing import Any, List, Optional, Set, TypedDict, Union + +from sisyphus import tk + +import i6_core.rasr as rasr + + +@dataclass() +class ReturnnTrainingJobArgs: + num_epochs: int + log_verbosity: int = field(default=4) + 
device: str = field(default="gpu") + save_interval: int = field(default=1) + keep_epochs: Optional[Union[List[int], Set[int]]] = None + time_rqmt: float = field(default=168) + mem_rqmt: float = field(default=14) + cpu_rqmt: int = field(default=4) + horovod_num_processes: Optional[int] = None + multi_node_slots: Optional[int] = None + returnn_python_exe: Optional[tk.Path] = None + returnn_root: Optional[tk.Path] = None + + +class EpochPartitioning(TypedDict): + dev: int + train: int + + +class ReturnnRasrTrainingArgs(TypedDict): + buffer_size: Optional[int] + class_label_file: Optional[tk.Path] + cpu_rqmt: Optional[int] + device: Optional[str] + disregarded_classes: Optional[Any] + extra_rasr_config: Optional[rasr.RasrConfig] + extra_rasr_post_config: Optional[rasr.RasrConfig] + horovod_num_processes: Optional[int] + keep_epochs: Optional[bool] + log_verbosity: Optional[int] + mem_rqmt: Optional[int] + num_classes: int + num_epochs: int + partition_epochs: Optional[EpochPartitioning] + save_interval: Optional[int] + time_rqmt: Optional[int] + use_python_control: Optional[bool] diff --git a/common/tools/sctk.py b/common/tools/sctk.py index 7fb1b2b37..388d44cec 100644 --- a/common/tools/sctk.py +++ b/common/tools/sctk.py @@ -10,6 +10,7 @@ def compile_sctk( branch: Optional[str] = None, commit: Optional[str] = None, sctk_git_repository: str = "https://github.com/usnistgov/SCTK.git", + alias: Optional[str] = None, ) -> tk.Path: """ :param branch: specify a specific branch @@ -17,12 +18,22 @@ def compile_sctk( :param sctk_git_repository: where to clone SCTK from, usually does not need to be altered :return: SCTK binary folder """ - sctk_repo = CloneGitRepositoryJob(url=sctk_git_repository, branch=branch, commit=commit).out_repository + sctk_repo = CloneGitRepositoryJob( + url=sctk_git_repository, + branch=branch, + commit=commit, + checkout_folder_name=alias if alias is not None else "repository", + ).out_repository sctk_make = MakeJob( folder=sctk_repo, 
make_sequence=["config", "all", "check", "install", "doc"], link_outputs={"bin": "bin/"}, ) + + # This is probably the dirtiest hack i ever did: + if alias is not None: + sctk_make.add_alias(alias) + # This is needed for the compilation to work in the i6 environment, otherwise still untested sctk_make._sis_environment.set("CPPFLAGS", "-std=c++11") return sctk_make.out_links["bin"] From 5c8f69a85593c047d8ff95eb8fde9d8b5898fde7 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:16:52 +0200 Subject: [PATCH 02/26] update tedlium2/data.py --- common/baselines/tedlium2/data.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/common/baselines/tedlium2/data.py b/common/baselines/tedlium2/data.py index e75475746..92daa180c 100644 --- a/common/baselines/tedlium2/data.py +++ b/common/baselines/tedlium2/data.py @@ -1,7 +1,6 @@ from collections import defaultdict from typing import Dict -from sisyphus import tk from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT from i6_experiments.common.datasets.tedlium2.corpus import get_corpus_object_dict from i6_experiments.common.datasets.tedlium2.lexicon import ( @@ -29,9 +28,6 @@ def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dic lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping) lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] comb_lm = ArpaLmRasrConfig(lm_path=lm.ngram_lm) - kaldi_small_lm = ArpaLmRasrConfig( - lm_path=tk.Path("/work/asr3/zhou/kaldi/egs/tedlium/s5_r2/data/local/local_lm/data/arpa/4gram_small.arpa.gz") - ) rasr_data_input_dict = defaultdict(dict) @@ -42,11 +38,5 @@ def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = False) -> Dic concurrent=CONCURRENT[name], lm=comb_lm.get_dict() if name == "dev" or name == "test" else None, ) - rasr_data_input_dict["dev"]["dev_kaldi_small_4_gram"] = RasrDataInput( - corpus_object=crp_obj, - lexicon=train_lexicon.get_dict(), - 
concurrent=CONCURRENT[name], - lm=kaldi_small_lm.get_dict(), - ) return rasr_data_input_dict From fc233426237c2689cbf9e8ff4685e3fc1002f260 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:19:13 +0200 Subject: [PATCH 03/26] update default tools --- common/baselines/tedlium2/default_tools.py | 17 +---------------- .../tedlium2/hybrid/baseline_config.py | 5 ++--- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/common/baselines/tedlium2/default_tools.py b/common/baselines/tedlium2/default_tools.py index 0e99b9b55..265642c29 100644 --- a/common/baselines/tedlium2/default_tools.py +++ b/common/baselines/tedlium2/default_tools.py @@ -8,32 +8,17 @@ version listed here. Nevertheless, the most recent "head" should be safe to be used as well. """ from sisyphus import tk -from i6_experiments.common.tools.audio import compile_ffmpeg_binary -from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode from i6_experiments.common.tools.sctk import compile_sctk from i6_core.tools.git import CloneGitRepositoryJob PACKAGE = __package__ -# RASR_BINARY_PATH = compile_rasr_binaries_i6mode( -# branch="apptainer_tf_2_8", configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] -# ) # use most recent RASR -# RASR_BINARY_PATH = compile_rasr_binaries_i6mode( -# branch="apptainer_tf_2_8", -# configure_options=["--apptainer-setup=2023-05-08_tensorflow-2.8_v1"], -# commit="5e7adf5034dbafac90caf0e50b5bfd6410c98d5e", -# ) # use most recent RASR -# assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline" -RASR_BINARY_PATH = tk.Path( - "/work/asr4/rossenbach/rescale/pytorch_mixed_precision/onnx_extended_rasr/arch/linux-x86_64-standard" -) +RASR_BINARY_PATH = tk.Path("/work/tools/asr/rasr/20211217_tf23_cuda101_mkl/arch/linux-x86_64-standard") RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" SCTK_BINARY_PATH = compile_sctk() # use last published version 
SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" -SCTK_BINARY_PATH2 = compile_sctk(alias="wei_u16_sctk") # use last published version, HACK to have u16 compiled - SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/") SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH" diff --git a/common/baselines/tedlium2/hybrid/baseline_config.py b/common/baselines/tedlium2/hybrid/baseline_config.py index 4146a8036..9d0407d91 100644 --- a/common/baselines/tedlium2/hybrid/baseline_config.py +++ b/common/baselines/tedlium2/hybrid/baseline_config.py @@ -5,7 +5,7 @@ from i6_experiments.common.setups.rasr.util import RasrSteps from i6_experiments.common.setups.rasr.hybrid_system import HybridSystem -from i6_experiments.common.baselines.tedlium2.default_tools import RETURNN_RC_ROOT, SCTK_BINARY_PATH2 +from i6_experiments.common.baselines.tedlium2.default_tools import RETURNN_RC_ROOT, RASR_BINARY_PATH, SCTK_BINARY_PATH from .data import get_corpus_data_inputs from .baseline_args import get_log_mel_feature_extraction_args @@ -26,7 +26,6 @@ def run_tedlium2_hybrid_baseline(): gmm_system = run_gmm_system() rasr_init_args = copy.deepcopy(gmm_system.rasr_init_args) - rasr_init_args.scorer_args["sctk_binary_path"] = SCTK_BINARY_PATH2 # Hack to have a U16 compiled SCTK rasr_init_args.feature_extraction_args = get_log_mel_feature_extraction_args() ( nn_train_data_inputs, @@ -60,7 +59,7 @@ def run_tedlium2_hybrid_baseline(): returnn_python_exe=returnn_exe, blas_lib=blas_lib, rasr_arch="linux-x86_64-standard", - rasr_binary_path=rasr_binary, + rasr_binary_path=RASR_BINARY_PATH, ) tedlium_nn_system.init_system( rasr_init_args=rasr_init_args, From 2d2f39abc1c12dd1812ec0edaa588e2c54235f0d Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:33:08 +0200 Subject: [PATCH 04/26] updates to gmm args --- common/baselines/tedlium2/gmm/baseline_args.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/common/baselines/tedlium2/gmm/baseline_args.py b/common/baselines/tedlium2/gmm/baseline_args.py index 927d3d466..f305c9146 100644 --- a/common/baselines/tedlium2/gmm/baseline_args.py +++ b/common/baselines/tedlium2/gmm/baseline_args.py @@ -322,8 +322,8 @@ def get_sat_args(): "cmllr_mixtures": "estimate_mixtures_sdm.tri", "iters": [8, 9, 10], "feature_flow": "uncached_mfcc+context+lda", - "pronunciation_scales": [1.0, 0.0], - "lm_scales": [25, 8.0, 20], + "pronunciation_scales": [0.0], + "lm_scales": [25, 20, 8.0], "lm_lookahead": True, "lookahead_options": None, "create_lattice": True, @@ -387,8 +387,8 @@ def get_vtln_sat_args(): "cmllr_mixtures": "estimate_mixtures_sdm.vtln", "iters": [8, 9, 10], "feature_flow": "uncached_mfcc+context+lda+vtln", - "pronunciation_scales": [1.0, 0.0], - "lm_scales": [25, 8.0, 20], + "pronunciation_scales": [0.0], + "lm_scales": [25, 20, 8.0], "lm_lookahead": True, "lookahead_options": None, "create_lattice": True, From e38010d8a68a3a5202af8798bdfff93497d2e4e2 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:33:40 +0200 Subject: [PATCH 05/26] update baseline config --- common/baselines/tedlium2/gmm/baseline_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/common/baselines/tedlium2/gmm/baseline_config.py b/common/baselines/tedlium2/gmm/baseline_config.py index 257363d82..ebe5e4256 100644 --- a/common/baselines/tedlium2/gmm/baseline_config.py +++ b/common/baselines/tedlium2/gmm/baseline_config.py @@ -27,7 +27,6 @@ def run_tedlium2_common_baseline( final_output_args.define_corpus_type("train", "train") final_output_args.define_corpus_type("dev", "dev") final_output_args.define_corpus_type("test", "test") - final_output_args.define_corpus_type("dev_kaldi_small_4_gram", "test") # final_output_args.add_feature_to_extract("gt") steps = RasrSteps() From ddf6155ebf35d94e00e6785d46c1843ba157cfb4 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:37:38 +0200 Subject: [PATCH 06/26] 
updates to baseline config --- common/baselines/tedlium2/hybrid/baseline_config.py | 7 +++---- common/baselines/tedlium2/hybrid/data.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/baselines/tedlium2/hybrid/baseline_config.py b/common/baselines/tedlium2/hybrid/baseline_config.py index 9d0407d91..8232943b4 100644 --- a/common/baselines/tedlium2/hybrid/baseline_config.py +++ b/common/baselines/tedlium2/hybrid/baseline_config.py @@ -5,11 +5,11 @@ from i6_experiments.common.setups.rasr.util import RasrSteps from i6_experiments.common.setups.rasr.hybrid_system import HybridSystem -from i6_experiments.common.baselines.tedlium2.default_tools import RETURNN_RC_ROOT, RASR_BINARY_PATH, SCTK_BINARY_PATH +from i6_experiments.common.baselines.tedlium2.default_tools import RETURNN_RC_ROOT, RASR_BINARY_PATH from .data import get_corpus_data_inputs from .baseline_args import get_log_mel_feature_extraction_args -from i6_experiments.common.baselines.tedlium2.hybrid.nn_config.nn_args import get_nn_args as get_nn_args2 +from .nn_config.nn_args import get_nn_args def run_gmm_system(): @@ -46,11 +46,10 @@ def run_tedlium2_hybrid_baseline(): hash_overwrite="TF23_MKL_BLAS", ) blas_lib.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" - rasr_binary = tk.Path("/work/tools/asr/rasr/20211217_tf23_cuda101_mkl/arch/linux-x86_64-standard") steps = RasrSteps() steps.add_step("extract", rasr_init_args.feature_extraction_args) gmm_system.run(steps) - nn_args = get_nn_args2(num_epochs=160) + nn_args = get_nn_args(num_epochs=160) nn_steps = RasrSteps() nn_steps.add_step("nn", nn_args) diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py index 40faaf99e..db59cd5f8 100644 --- a/common/baselines/tedlium2/hybrid/data.py +++ b/common/baselines/tedlium2/hybrid/data.py @@ -253,7 +253,6 @@ def get_corpus_data_inputs( nn_dev_data_inputs = { "dev": gmm_system.outputs["dev"]["final"].as_returnn_rasr_data_input(), - "dev_kaldi_small": 
gmm_system.outputs["dev_kaldi_small_4_gram"]["final"].as_returnn_rasr_data_input(), } nn_test_data_inputs = { # "test": gmm_system.outputs["test"][ From 780625c07c2e2447bcf138f5fd907f0fa7d261ba Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:42:28 +0200 Subject: [PATCH 07/26] update hybrid data --- common/baselines/tedlium2/hybrid/data.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py index db59cd5f8..a6c8df598 100644 --- a/common/baselines/tedlium2/hybrid/data.py +++ b/common/baselines/tedlium2/hybrid/data.py @@ -26,7 +26,7 @@ def build_hdf_data_input( partition_epoch: int = 1, acoustic_mixtures: Optional = None, seq_ordering: str = "sorted", -): +) -> HdfDataInput: """ Dumps features and alignments from RASR into hdfs, to enable full RETURNN training :param features: Feature bundle generated by the dump_features_for_hybrid_training function @@ -37,7 +37,7 @@ def build_hdf_data_input( :param partition_epoch: Partition epoch for the alignment dataset, mainly relevant for training dataset :param acoustic_mixtures: Acoustic mixture file from the GMM for prior calculation, most likely going to be replaced :param seq_ordering: sequence ordering for the align dataset, usually sorted for dev/eval and laplace for train - :return: + :return: HdfDataInput with corresponding hdf datasets """ feat_dataset = { @@ -56,14 +56,11 @@ def build_hdf_data_input( "seq_list_filter_file": segment_list, } - feat_job = ReturnnDumpHDFJob( - data=feat_dataset, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_RC_ROOT, - ) + feat_job = ReturnnDumpHDFJob(data=feat_dataset, returnn_python_exe=RETURNN_EXE, returnn_root=RETURNN_RC_ROOT) if alias_prefix is not None: feat_job.add_alias(alias_prefix + "/dump_features") feat_hdf = feat_job.out_hdf + align_dataset = { "class": "SprintCacheDataset", "data": { @@ -104,7 +101,7 @@ def 
dump_features_for_hybrid_training( :param gmm_system: GMM system to get corpora from :param feature_extraction_args: Args for the feature extraction :param feature_extraction_class: Feature extraction class/job to be used for extraction - :return: + :return: path to the train cv and devtrain features """ features = {} for name in ["nn-train", "nn-cv", "nn-devtrain"]: @@ -119,7 +116,6 @@ def get_corpus_data_inputs( feature_extraction_args: Dict[str, Any], feature_extraction_class: Callable[[Any], FeatureExtractionJob], alias_prefix: Optional[str] = None, - remove_faulty_segments: bool = False, ) -> Tuple[ Dict[str, HdfDataInput], Dict[str, HdfDataInput], @@ -134,7 +130,7 @@ def get_corpus_data_inputs( :param feature_extraction_args: Args for the feature extraction of the hybrid features (might be different from GMM) :param feature_extraction_class: Feature extraction class/job to be used for extraction :param alias_prefix: Prefix for naming of experiments - :return: + :return: HdfDataInputs for the train sets and ReturnnRasrDataInputs for the dev and train sets """ train_corpus_path = gmm_system.corpora["train"].corpus_file @@ -147,11 +143,6 @@ def get_corpus_data_inputs( total_train_num_segments = NUM_SEGMENTS["train"] all_train_segments = corpus_recipe.SegmentCorpusJob(train_corpus_path, 1).out_single_segment_files[1] - if remove_faulty_segments: - all_train_segments = corpus_recipe.FilterSegmentsByListJob( - segment_files={1: all_train_segments}, - filter_list=["TED-LIUM-realease2/AndrewMcAfee_2013/23", "TED-LIUM-realease2/iOTillettWright_2012X/43"], - ).out_single_segment_files[1] cv_segments = corpus_recipe.SegmentCorpusJob(cv_corpus_path, 1).out_single_segment_files[1] dev_train_size = 500 / total_train_num_segments @@ -213,7 +204,7 @@ def get_corpus_data_inputs( allophone_labeling=allophone_labeling, alias_prefix=alias_prefix + "/nn_train_data", partition_epoch=5, - acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, # TODO: NN Mixtures 
+ acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, seq_ordering="laplace:.1000", ) tk.register_output(f"{alias_prefix}/nn_train_data/features", nn_train_data.features) From f58e2ff60ad96da050b2ac5dfa72241cd2bbfa78 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:44:30 +0200 Subject: [PATCH 08/26] update nn_config --- .../tedlium2/hybrid/nn_config/experiment.py | 2 +- .../tedlium2/hybrid/nn_config/nn_args.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/common/baselines/tedlium2/hybrid/nn_config/experiment.py b/common/baselines/tedlium2/hybrid/nn_config/experiment.py index 5ee3529e0..b79581ec5 100644 --- a/common/baselines/tedlium2/hybrid/nn_config/experiment.py +++ b/common/baselines/tedlium2/hybrid/nn_config/experiment.py @@ -2,7 +2,7 @@ from .helper import make_nn_config -def get_wei_config(specaug=False): +def get_baseline_config(specaug=False): network = get_network(spec_augment=specaug) nn_config = make_nn_config(network) nn_config["extern_data"] = { diff --git a/common/baselines/tedlium2/hybrid/nn_config/nn_args.py b/common/baselines/tedlium2/hybrid/nn_config/nn_args.py index a286bb744..989789102 100644 --- a/common/baselines/tedlium2/hybrid/nn_config/nn_args.py +++ b/common/baselines/tedlium2/hybrid/nn_config/nn_args.py @@ -1,18 +1,17 @@ import copy -from .experiment import get_wei_config +from .experiment import get_baseline_config from .nn_setup import get_spec_augment_mask_python from i6_core.returnn.config import ReturnnConfig from i6_experiments.common.setups.rasr.util import HybridArgs, ReturnnTrainingJobArgs from copy import deepcopy -def get_nn_args(num_epochs=125, no_min_seq_len=False): +def get_nn_args(num_epochs=125): - # gets the hardcoded config from existing setup for baseline and comparison - base_config = get_wei_config() + base_config = get_baseline_config() returnn_config = ReturnnConfig(config=base_config) - # two variants of spec augment + 
spec_augment_args = { "max_time_num": 3, "max_time": 10, @@ -21,12 +20,12 @@ def get_nn_args(num_epochs=125, no_min_seq_len=False): "conservatvie_step": 2000, } specaug = get_spec_augment_mask_python(**spec_augment_args) - specaug_config = get_wei_config(specaug=True) + specaug_config = get_baseline_config(specaug=True) spec_cfg = ReturnnConfig(config=copy.deepcopy(specaug_config), python_epilog=specaug) configs = { - "wei_base_config": returnn_config, - "wei_specaug_config": spec_cfg, + "base_config": returnn_config, + "specaug_config": spec_cfg, } # change softmax to log softmax for hybrid From c71e88ca979aecd3917fc89656803c7d6b324bb1 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:50:26 +0200 Subject: [PATCH 09/26] update hybrid decoder --- common/setups/rasr/hybrid_decoder.py | 250 +-------------------------- 1 file changed, 2 insertions(+), 248 deletions(-) diff --git a/common/setups/rasr/hybrid_decoder.py b/common/setups/rasr/hybrid_decoder.py index fb6ff9294..4923f0640 100644 --- a/common/setups/rasr/hybrid_decoder.py +++ b/common/setups/rasr/hybrid_decoder.py @@ -23,7 +23,6 @@ CombineLmRasrConfig, ) from .util.decode import ( - DevRecognitionParameters, RecognitionParameters, SearchJobArgs, Lattice2CtmArgs, @@ -48,7 +47,7 @@ class HybridDecoder(BaseDecoder): def __init__( self, rasr_binary_path: tk.Path, - rasr_arch: str = "linux-x86_64-standard", + rasr_arch: "str" = "linux-x86_64-standard", compress: bool = False, append: bool = False, unbuffered: bool = False, @@ -156,9 +155,8 @@ def recognition( tf_fwd_input_name: str = "tf-fwd-input", ): """ - run the recognition, consisting of search, lattice to ctm, and scoring + run the recognitino, consisting of search, lattice to ctm, and scoring - :param name: decoding name :param returnn_config: RETURNN config for recognition :param checkpoints: epoch to model checkpoint mapping :param recognition_parameters: keys are the corpus keys so that recog params can be set for specific eval 
sets. @@ -223,247 +221,3 @@ def recognition( scorer_hyp_param_name=scorer_hyp_param_name, optimize_pron_lm_scales=optimize_pron_lm_scales, ) - - -def tune_scales( - decoder: HybridDecoder, - name: str, - returnn_config: Union[returnn.ReturnnConfig, tk.Path], - checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], - lm_configs: Dict[str, LmConfig], - prior_paths: Dict[str, PriorPath], - search_job_args: Union[SearchJobArgs, Dict], - lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], - scorer_args: Union[ScliteScorerArgs, Dict], - optimize_parameters: Union[OptimizeJobArgs, Dict], - epochs: Optional[List[int]] = None, - scorer_hyp_param_name: str = "hyp", - optimize_pron_lm_scales: bool = False, - forward_output_layer: str = "output", - tf_fwd_input_name: str = "tf-fwd-input", -): - """ - this function tunes the prior scale, TDP scale and silence/non-word exit penalties - - :return: - """ - recog_params = { - "tune1": [ - DevRecognitionParameters( - am_scales=[1.0], - lm_scales=[12.0], - prior_scales=[0.3, 0.5, 0.7], - pronunciation_scales=[1.0], - tdp_scales=[0.1, 0.5, 1.0], - speech_tdps=[], - silence_tdps=[], - nonspeech_tdps=[], - altas=[12.0], - ), - ], - } - - decoder.recognition( - name=name, - returnn_config=returnn_config, - checkpoints=checkpoints, - recognition_parameters=recog_params, - lm_configs=lm_configs, - prior_paths=prior_paths, - search_job_args=search_job_args, - lat_2_ctm_args=lat_2_ctm_args, - scorer_args=scorer_args, - optimize_parameters=optimize_parameters, - epochs=epochs, - scorer_hyp_param_name=scorer_hyp_param_name, - optimize_pron_lm_scales=optimize_pron_lm_scales, - forward_output_layer=forward_output_layer, - tf_fwd_input_name=tf_fwd_input_name, - ) - - -def tune_lm_scale( - decoder: HybridDecoder, - name: str, - returnn_config: Union[returnn.ReturnnConfig, tk.Path], - checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], - lm_configs: Dict[str, LmConfig], - prior_paths: Dict[str, PriorPath], - search_job_args: 
Union[SearchJobArgs, Dict], - lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], - scorer_args: Union[ScliteScorerArgs, Dict], - optimize_parameters: Union[OptimizeJobArgs, Dict], - epochs: Optional[List[int]] = None, - scorer_hyp_param_name: str = "hyp", - optimize_pron_lm_scales: bool = False, - forward_output_layer: str = "output", - tf_fwd_input_name: str = "tf-fwd-input", -): - """ - tunes the LM scale - - :return: - """ - recog_params = { - "tune2": [ - DevRecognitionParameters( - am_scales=[1.0], - lm_scales=[12.0], - prior_scales=[0.3, 0.5, 0.7], - pronunciation_scales=[1.0], - tdp_scales=[0.1, 0.5, 1.0], - speech_tdps=[], - silence_tdps=[], - nonspeech_tdps=[], - altas=[0.0], - ), - ], - } - - decoder.recognition( - name=name, - returnn_config=returnn_config, - checkpoints=checkpoints, - recognition_parameters=recog_params, - lm_configs=lm_configs, - prior_paths=prior_paths, - search_job_args=search_job_args, - lat_2_ctm_args=lat_2_ctm_args, - scorer_args=scorer_args, - optimize_parameters=optimize_parameters, - epochs=epochs, - scorer_hyp_param_name=scorer_hyp_param_name, - optimize_pron_lm_scales=optimize_pron_lm_scales, - forward_output_layer=forward_output_layer, - tf_fwd_input_name=tf_fwd_input_name, - ) - - -def tune_search_space( - decoder: HybridDecoder, - name: str, - returnn_config: Union[returnn.ReturnnConfig, tk.Path], - checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], - lm_configs: Dict[str, LmConfig], - prior_paths: Dict[str, PriorPath], - search_job_args: Union[SearchJobArgs, Dict], - lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], - scorer_args: Union[ScliteScorerArgs, Dict], - optimize_parameters: Union[OptimizeJobArgs, Dict], - epochs: Optional[List[int]] = None, - scorer_hyp_param_name: str = "hyp", - optimize_pron_lm_scales: bool = False, - forward_output_layer: str = "output", - tf_fwd_input_name: str = "tf-fwd-input", -): - """ - tunes beam search size and altas - - :return: - """ - recog_params = DevRecognitionParameters() - - 
decoder.recognition() - - -def tune_beam_pruning_limit( - decoder: HybridDecoder, - name: str, - returnn_config: Union[returnn.ReturnnConfig, tk.Path], - checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], - lm_configs: Dict[str, LmConfig], - prior_paths: Dict[str, PriorPath], - search_job_args: Union[SearchJobArgs, Dict], - lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], - scorer_args: Union[ScliteScorerArgs, Dict], - optimize_parameters: Union[OptimizeJobArgs, Dict], - epochs: Optional[List[int]] = None, - scorer_hyp_param_name: str = "hyp", - optimize_pron_lm_scales: bool = False, - forward_output_layer: str = "output", - tf_fwd_input_name: str = "tf-fwd-input", -): - """ - tunes the beam pruning limit - - :return: - """ - recog_params = DevRecognitionParameters() - - decoder.recognition() - - -def tune_decoding( - name: str, - *, - rasr_binary_path: tk.Path, - acoustic_model_config: AmRasrConfig, - lexicon_config: LexiconRasrConfig, - returnn_config: Union[returnn.ReturnnConfig, tk.Path], - checkpoints: Dict[int, Union[returnn.Checkpoint, tk.Path]], - lm_configs: Dict[str, LmConfig], - prior_paths: Dict[str, PriorPath], - search_job_args: Union[SearchJobArgs, Dict], - lat_2_ctm_args: Union[Lattice2CtmArgs, Dict], - scorer_args: Union[ScliteScorerArgs, Dict], - optimize_parameters: Union[OptimizeJobArgs, Dict], - rasr_arch: str = "linux-x86_64-standard", - compress: bool = False, - append: bool = False, - unbuffered: bool = False, - compress_after_run: bool = True, - search_job_class: Type[tk.Job] = recog.AdvancedTreeSearchJob, - scorer_job_class: Type[tk.Job] = recog.ScliteJob, - alias_output_prefix: str = "", - returnn_root: Optional[tk.Path] = None, - returnn_python_home: Optional[tk.Path] = None, - returnn_python_exe: Optional[tk.Path] = None, - blas_lib: Optional[tk.Path] = None, - search_numpy_blas: bool = True, - required_native_ops: Optional[List[str]] = None, - extra_configs: Optional[Dict[str, rasr.RasrConfig]] = None, - crp_name: str = "base", - 
epochs: Optional[List[int]] = None, - scorer_hyp_param_name: str = "hyp", - optimize_pron_lm_scales: bool = False, - forward_output_layer: str = "output", - tf_fwd_input_name: str = "tf-fwd-input", -): - """ - 1. TDPs, scales: prior, and TDP [beam-pruning = 14.0, altas = 12.0] - a. TDP: {0.1, 0.5, 1.0} - b. Prior: {0.3, 0.5, 0.7} - c. Silence and non-word phon: {0.0, 4.0, 10.0} - 2. LM scale optimization - a. no altas - b. beam-pruning: 14.0, 15.0 - 3. - a. beam-pruning: 14.0, 15.0 - b. altas: 2.0, 4.0, 6.0, 8.0 - 4. beam pruning-limit: 15k, 10k, 7.5k, 6k, 5k, 4k - - :return: - """ - decoder = HybridDecoder( - rasr_binary_path=rasr_binary_path, - rasr_arch=rasr_arch, - compress=compress, - append=append, - unbuffered=unbuffered, - compress_after_run=compress_after_run, - search_job_class=search_job_class, - scorer_job_class=scorer_job_class, - alias_output_prefix=alias_output_prefix, - returnn_root=returnn_root, - returnn_python_home=returnn_python_home, - returnn_python_exe=returnn_python_exe, - blas_lib=blas_lib, - search_numpy_blas=search_numpy_blas, - required_native_ops=required_native_ops, - ) - decoder.init_decoder( - acoustic_model_config=acoustic_model_config, - lexicon_config=lexicon_config, - extra_configs=extra_configs, - crp_name=crp_name, - ) From d6b8febb6911a3cf79245c2949996bc092db1715 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 15:59:28 +0200 Subject: [PATCH 10/26] update hybrid system --- common/setups/rasr/hybrid_system.py | 33 ++++++++++------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/common/setups/rasr/hybrid_system.py b/common/setups/rasr/hybrid_system.py index b6ba82f55..249946ea9 100644 --- a/common/setups/rasr/hybrid_system.py +++ b/common/setups/rasr/hybrid_system.py @@ -1,4 +1,4 @@ -__all__ = ["HybridSystem"] +__all__ = ["HybridArgs", "HybridSystem"] import copy import itertools @@ -94,17 +94,13 @@ def __init__( self.cv_corpora = [] self.devtrain_corpora = [] - 
self.train_input_data = ( - None - ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] - self.cv_input_data = ( - None - ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] - self.devtrain_input_data = ( - None - ) # type:Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] - self.dev_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] - self.test_input_data = None # type:Optional[Dict[str, ReturnnRasrDataInput]] + self.train_input_data: Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] = None + self.cv_input_data: Optional[Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]]] = None + self.devtrain_input_data: Optional[ + Dict[str, Union[ReturnnRasrDataInput, AllowedReturnnTrainingDataInput]] + ] = None + self.dev_input_data: Optional[Dict[str, ReturnnRasrDataInput]] = None + self.test_input_data: Optional[Dict[str, ReturnnRasrDataInput]] = None self.train_cv_pairing = None @@ -373,7 +369,6 @@ def nn_recognition( use_epoch_for_compile=False, forward_output_layer="output", native_ops: Optional[List[str]] = None, - acoustic_mixture_path: Optional[tk.Path] = None, **kwargs, ): with tk.block(f"{name}_recognition"): @@ -395,6 +390,7 @@ def nn_recognition( epochs = epochs if epochs is not None else list(checkpoints.keys()) for pron, lm, prior, epoch in itertools.product(pronunciation_scales, lm_scales, prior_scales, epochs): + assert epoch in checkpoints.keys() acoustic_mixture_path = CreateDummyMixturesJob( num_mixtures=returnn_config.config["extern_data"]["classes"]["dim"], @@ -408,16 +404,15 @@ def nn_recognition( returnn_root=train_job.returnn_root, log_verbosity=train_job.returnn_config.post_config["log_verbosity"], ) - prior_job.add_alias("extract_nn_prior/" + name) prior_file = prior_job.out_prior_xml_file assert prior_file is not None + scorer = rasr.PrecomputedHybridFeatureScorer( 
prior_mixtures=acoustic_mixture_path, priori_scale=prior, prior_file=prior_file, ) - assert acoustic_mixture_path is not None if use_epoch_for_compile: tf_graph = self.nn_compile_graph(name, returnn_config, epoch=epoch) @@ -474,7 +469,6 @@ def nn_recog( checkpoints=checkpoints, train_job=train_job, recognition_corpus_key=dev_c, - acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures, **recog_args, ) @@ -490,7 +484,6 @@ def nn_recog( checkpoints=checkpoints, train_job=train_job, recognition_corpus_key=tst_c, - acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures, **r_args, ) @@ -509,12 +502,8 @@ def nn_compile_graph( e.g. `def get_network(epoch=...)` in the config :return: the TF graph """ - # TODO remove, temporary hack - cfg = returnn_config - if "pretrain" in cfg.config.keys(): - del cfg.config["pretrain"] graph_compile_job = returnn.CompileTFGraphJob( - cfg, + returnn_config=returnn_config, epoch=epoch, returnn_root=self.returnn_root, returnn_python_exe=self.returnn_python_exe, From ca73e1b23e3880bf0ca1e9e9595de5f089f5a72a Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 16:01:11 +0200 Subject: [PATCH 11/26] update nn_system --- common/setups/rasr/nn_system.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/common/setups/rasr/nn_system.py b/common/setups/rasr/nn_system.py index 327a1b33d..93321e823 100644 --- a/common/setups/rasr/nn_system.py +++ b/common/setups/rasr/nn_system.py @@ -6,16 +6,13 @@ # -------------------- Sisyphus -------------------- -import sisyphus.toolkit as tk -import sisyphus.global_settings as gs +from sisyphus import tk, gs # -------------------- Recipes -------------------- import i6_core.returnn as returnn -from i6_core.tools import CloneGitRepositoryJob from .rasr_system import RasrSystem - from .util import ReturnnTrainingJobArgs, AllowedReturnnTrainingDataInput # -------------------- Init -------------------- @@ -65,7 +62,7 @@ def 
compile_native_op(self, op_name: str): returnn_python_exe=self.returnn_python_exe, blas_lib=self.blas_lib, ) - native_op_job.add_alias("wei_native_ops/compile_native_%s" % op_name) + native_op_job.add_alias("native_ops/compile_native_%s" % op_name) self.native_ops[op_name] = native_op_job.out_op def get_native_ops(self, op_names: Optional[List[str]]) -> Optional[List[tk.Path]]: From f5e2cfcf42a936a416bdbecc9424484e446bfd0c Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 16:19:14 +0200 Subject: [PATCH 12/26] updates to nn --- common/setups/rasr/util/nn.py | 2 -- common/setups/rasr/util/nn/data.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/common/setups/rasr/util/nn.py b/common/setups/rasr/util/nn.py index 2e3d1b175..dd8ba774f 100644 --- a/common/setups/rasr/util/nn.py +++ b/common/setups/rasr/util/nn.py @@ -20,8 +20,6 @@ from i6_core.util import MultiPath -from .rasr import RasrDataInput - RasrCacheTypes = Union[tk.Path, str, MultiPath, rasr.FlagDependentFlowAttribute] diff --git a/common/setups/rasr/util/nn/data.py b/common/setups/rasr/util/nn/data.py index 0ba98c43e..f70471195 100644 --- a/common/setups/rasr/util/nn/data.py +++ b/common/setups/rasr/util/nn/data.py @@ -14,7 +14,7 @@ import copy from dataclasses import dataclass, asdict -from typing import Any, Dict, List, Optional, Tuple, Type, TypedDict, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union from sisyphus import tk from sisyphus.delayed_ops import DelayedFormat, DelayedBase From 7d27d3a1d458b7bb5f8d7ed00348e88d90b77bc0 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 16:20:36 +0200 Subject: [PATCH 13/26] updates to sctk --- common/tools/sctk.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/common/tools/sctk.py b/common/tools/sctk.py index 388d44cec..f85b261bf 100644 --- a/common/tools/sctk.py +++ b/common/tools/sctk.py @@ -10,7 +10,6 @@ def compile_sctk( branch: Optional[str] = 
None, commit: Optional[str] = None, sctk_git_repository: str = "https://github.com/usnistgov/SCTK.git", - alias: Optional[str] = None, ) -> tk.Path: """ :param branch: specify a specific branch @@ -18,22 +17,13 @@ def compile_sctk( :param sctk_git_repository: where to clone SCTK from, usually does not need to be altered :return: SCTK binary folder """ - sctk_repo = CloneGitRepositoryJob( - url=sctk_git_repository, - branch=branch, - commit=commit, - checkout_folder_name=alias if alias is not None else "repository", - ).out_repository + sctk_repo = CloneGitRepositoryJob(url=sctk_git_repository, branch=branch, commit=commit).out_repository sctk_make = MakeJob( folder=sctk_repo, make_sequence=["config", "all", "check", "install", "doc"], link_outputs={"bin": "bin/"}, ) - # This is probably the dirtiest hack i ever did: - if alias is not None: - sctk_make.add_alias(alias) - # This is needed for the compilation to work in the i6 environment, otherwise still untested sctk_make._sis_environment.set("CPPFLAGS", "-std=c++11") return sctk_make.out_links["bin"] From a7d57920eefab4131655ed2eb9ef352bae58cadf Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 12 Jul 2023 16:21:35 +0200 Subject: [PATCH 14/26] update sctk --- common/tools/sctk.py | 1 - 1 file changed, 1 deletion(-) diff --git a/common/tools/sctk.py b/common/tools/sctk.py index f85b261bf..7fb1b2b37 100644 --- a/common/tools/sctk.py +++ b/common/tools/sctk.py @@ -23,7 +23,6 @@ def compile_sctk( make_sequence=["config", "all", "check", "install", "doc"], link_outputs={"bin": "bin/"}, ) - # This is needed for the compilation to work in the i6 environment, otherwise still untested sctk_make._sis_environment.set("CPPFLAGS", "-std=c++11") return sctk_make.out_links["bin"] From 2799615b0929ec8f37b9290913f50169fc34b265 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Tue, 29 Aug 2023 13:46:34 +0200 Subject: [PATCH 15/26] updates now finished --- common/baselines/tedlium2/default_tools.py | 10 +++++++--- 
common/baselines/tedlium2/hybrid/baseline_config.py | 5 +---- common/datasets/tedlium2/corpus.py | 8 ++++++-- common/datasets/tedlium2/export.py | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/common/baselines/tedlium2/default_tools.py b/common/baselines/tedlium2/default_tools.py index 265642c29..0cf245717 100644 --- a/common/baselines/tedlium2/default_tools.py +++ b/common/baselines/tedlium2/default_tools.py @@ -8,18 +8,22 @@ version listed here. Nevertheless, the most recent "head" should be safe to be used as well. """ from sisyphus import tk +from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode from i6_experiments.common.tools.sctk import compile_sctk from i6_core.tools.git import CloneGitRepositoryJob PACKAGE = __package__ -RASR_BINARY_PATH = tk.Path("/work/tools/asr/rasr/20211217_tf23_cuda101_mkl/arch/linux-x86_64-standard") +RASR_BINARY_PATH = compile_rasr_binaries_i6mode( + branch="bene_apptainer_tf213", + configure_options=["--apptainer-setup=2023-08-29_tensorflow-2.13_v1"], +) # use most recent RASR RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" SCTK_BINARY_PATH = compile_sctk() # use last published version SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" -SRILM_PATH = tk.Path("/work/tools/users/luescher/srilm-1.7.3/bin/i686-m64/") +SRILM_PATH = tk.Path("/work/tools22/users/luescher/srilm-1.7.3-app-u22/bin/i686-m64") SRILM_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SRILM_PATH" RETURNN_EXE = tk.Path( @@ -29,6 +33,6 @@ RETURNN_RC_ROOT = CloneGitRepositoryJob( "https://github.com/rwth-i6/returnn", - commit="d7689b945b2fe781b3c79fbef9d82f018c7b11e8", + commit="11d33468ad56a6c254168560c29e77e65eb45b7c", ).out_repository RETURNN_RC_ROOT.hash_overwrite = "TEDLIUM2_DEFAULT_RETURNN_RC_ROOT" diff --git a/common/baselines/tedlium2/hybrid/baseline_config.py b/common/baselines/tedlium2/hybrid/baseline_config.py index 8232943b4..77df7617f 100644 --- 
a/common/baselines/tedlium2/hybrid/baseline_config.py +++ b/common/baselines/tedlium2/hybrid/baseline_config.py @@ -41,10 +41,7 @@ def run_tedlium2_hybrid_baseline(): ) # image only, so just python3 returnn_exe = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") - blas_lib = tk.Path( - "/work/tools/asr/tensorflow/2.3.4-generic+cuda10.1+mkl/bazel_out/external/mkl_linux/lib/libmklml_intel.so", - hash_overwrite="TF23_MKL_BLAS", - ) + blas_lib = tk.Path("/lib/x86_64-linux-gnu/liblapack.so.3") blas_lib.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" steps = RasrSteps() steps.add_step("extract", rasr_init_args.feature_extraction_args) diff --git a/common/datasets/tedlium2/corpus.py b/common/datasets/tedlium2/corpus.py index e732db251..c530c40e0 100644 --- a/common/datasets/tedlium2/corpus.py +++ b/common/datasets/tedlium2/corpus.py @@ -32,8 +32,12 @@ def get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datas "output_format": "wav", "codec": "pcm_s16le", }, - "ogg": {"output_format": "ogg", "codec": "libvorbis"}, - "flac": {"output_format": "flac", "codec": "flac"}, + "ogg": {"output_format": "ogg", "codec": "libvorbis", "recover_duration": False}, + "flac": { + "output_format": "flac", + "codec": "flac", + "recover_duration": False, + }, } converted_bliss_corpus_dict = {} diff --git a/common/datasets/tedlium2/export.py b/common/datasets/tedlium2/export.py index 1919fa8c0..b8a324773 100644 --- a/common/datasets/tedlium2/export.py +++ b/common/datasets/tedlium2/export.py @@ -71,7 +71,7 @@ def _export_lexicon(output_prefix: str = "datasets"): """ lexicon_output_prefix = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon") - bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix) + bliss_lexicon = get_bliss_lexicon(add_unknown_phoneme_and_mapping=False, output_prefix=output_prefix) tk.register_output(os.path.join(lexicon_output_prefix, "tedlium2.lexicon.xml.gz"), bliss_lexicon) g2p_bliss_lexicon = 
get_g2p_augmented_bliss_lexicon( From fd1cd0960e81d1f08bdb20cbf5534c4d798336ed Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Sep 2023 13:40:42 +0200 Subject: [PATCH 16/26] revert forced align --- common/setups/rasr/gmm_system.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/common/setups/rasr/gmm_system.py b/common/setups/rasr/gmm_system.py index d2ffb0801..57c51467b 100644 --- a/common/setups/rasr/gmm_system.py +++ b/common/setups/rasr/gmm_system.py @@ -1408,7 +1408,12 @@ def run(self, steps: Union[List[str], RasrSteps]): # ---------- Forced Alignment ---------- if step_name.startswith("forced_align"): - self.run_forced_align_step(step_args) + corpus_keys = step_args.pop("corpus_keys", self.train_corpora) + for corpus in corpus_keys: + self.forced_align( + feature_scorer_corpus_key=corpus, + **step_args, + ) # ---------- Only Recognition ---------- if step_name.startswith("recog"): From 6be293d9fd07dfbc30844a1cd72eeb9ab56fd439 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Sep 2023 13:43:14 +0200 Subject: [PATCH 17/26] fix setting of TrainingDataInputs --- common/setups/rasr/hybrid_system.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/common/setups/rasr/hybrid_system.py b/common/setups/rasr/hybrid_system.py index 249946ea9..65935db84 100644 --- a/common/setups/rasr/hybrid_system.py +++ b/common/setups/rasr/hybrid_system.py @@ -224,10 +224,11 @@ def returnn_training( cv_corpus_key, devtrain_corpus_key=None, ) -> returnn.ReturnnTrainingJob: - if nn_train_args.returnn_root is None: - nn_train_args.returnn_root = self.returnn_root - if nn_train_args.returnn_python_exe is None: - nn_train_args.returnn_python_exe = self.returnn_python_exe + if isinstance(nn_train_args, ReturnnTrainingJobArgs): + if nn_train_args.returnn_root is None: + nn_train_args.returnn_root = self.returnn_root + if nn_train_args.returnn_python_exe is None: + nn_train_args.returnn_python_exe = 
self.returnn_python_exe train_job = returnn_training( name=name, From 4b5e42ec2ec5da72e1a7a017d372ecb06a9701da Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Sep 2023 14:12:44 +0200 Subject: [PATCH 18/26] make use of nn prior optional --- common/baselines/tedlium2/hybrid/data.py | 2 +- common/setups/rasr/hybrid_system.py | 43 ++++++++++++++---------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py index a6c8df598..7e952e194 100644 --- a/common/baselines/tedlium2/hybrid/data.py +++ b/common/baselines/tedlium2/hybrid/data.py @@ -204,7 +204,7 @@ def get_corpus_data_inputs( allophone_labeling=allophone_labeling, alias_prefix=alias_prefix + "/nn_train_data", partition_epoch=5, - acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, + acoustic_mixtures=None, seq_ordering="laplace:.1000", ) tk.register_output(f"{alias_prefix}/nn_train_data/features", nn_train_data.features) diff --git a/common/setups/rasr/hybrid_system.py b/common/setups/rasr/hybrid_system.py index 65935db84..0d48bfe1c 100644 --- a/common/setups/rasr/hybrid_system.py +++ b/common/setups/rasr/hybrid_system.py @@ -354,7 +354,7 @@ def nn_recognition( name: str, returnn_config: returnn.ReturnnConfig, checkpoints: Dict[int, returnn.Checkpoint], - train_job: Union[returnn.ReturnnTrainingJob, returnn.ReturnnRasrTrainingJob], + acoustic_mixture_path: Optional[tk.Path], # TODO maybe Optional if prior file provided -> automatically construct dummy file prior_scales: List[float], pronunciation_scales: List[float], lm_scales: List[float], @@ -370,6 +370,7 @@ def nn_recognition( use_epoch_for_compile=False, forward_output_layer="output", native_ops: Optional[List[str]] = None, + train_job: Optional[Union[returnn.ReturnnTrainingJob, returnn.ReturnnRasrTrainingJob]] = None, **kwargs, ): with tk.block(f"{name}_recognition"): @@ -393,24 +394,28 @@ def nn_recognition( for pron, lm, prior, 
epoch in itertools.product(pronunciation_scales, lm_scales, prior_scales, epochs): assert epoch in checkpoints.keys() - acoustic_mixture_path = CreateDummyMixturesJob( - num_mixtures=returnn_config.config["extern_data"]["classes"]["dim"], - num_features=returnn_config.config["extern_data"]["data"]["dim"], - ).out_mixtures - lmgc_scorer = rasr.GMMFeatureScorer(acoustic_mixture_path) - prior_job = ReturnnComputePriorJobV2( - model_checkpoint=checkpoints[epoch], - returnn_config=train_job.returnn_config, - returnn_python_exe=train_job.returnn_python_exe, - returnn_root=train_job.returnn_root, - log_verbosity=train_job.returnn_config.post_config["log_verbosity"], - ) - prior_job.add_alias("extract_nn_prior/" + name) - prior_file = prior_job.out_prior_xml_file - assert prior_file is not None - + prior_file = None + lmgc_scorer = None + if acoustic_mixture_path is None: + assert train_job is not None, "Need ReturnnTrainingJob for computation of priors" + tmp_acoustic_mixture_path = CreateDummyMixturesJob( + num_mixtures=returnn_config.config["extern_data"]["classes"]["dim"], + num_features=returnn_config.config["extern_data"]["data"]["dim"], + ).out_mixtures + lmgc_scorer = rasr.GMMFeatureScorer(tmp_acoustic_mixture_path) + prior_job = ReturnnComputePriorJobV2( + model_checkpoint=checkpoints[epoch], + returnn_config=train_job.returnn_config, + returnn_python_exe=train_job.returnn_python_exe, + returnn_root=train_job.returnn_root, + log_verbosity=train_job.returnn_config.post_config["log_verbosity"], + ) + prior_job.add_alias("extract_nn_prior/" + name) + prior_file = prior_job.out_prior_xml_file + else: + tmp_acoustic_mixture_path = acoustic_mixture_path scorer = rasr.PrecomputedHybridFeatureScorer( - prior_mixtures=acoustic_mixture_path, + prior_mixtures=tmp_acoustic_mixture_path, # This needs to be a new variable otherwise nesting causes undesired behavior priori_scale=prior, prior_file=prior_file, ) @@ -468,6 +473,7 @@ def nn_recog( 
name=f"{train_corpus_key}-{train_name}-{recog_name}", returnn_config=returnn_config, checkpoints=checkpoints, + acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures, train_job=train_job, recognition_corpus_key=dev_c, **recog_args, @@ -483,6 +489,7 @@ def nn_recog( name=f"{train_name}-{recog_name}", returnn_config=returnn_config, checkpoints=checkpoints, + acoustic_mixture_path=self.train_input_data[train_corpus_key].acoustic_mixtures, train_job=train_job, recognition_corpus_key=tst_c, **r_args, From 008d23062200b1293e83ac8ff4ff6234ae57514a Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Sep 2023 14:14:17 +0200 Subject: [PATCH 19/26] black --- common/setups/rasr/hybrid_system.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/setups/rasr/hybrid_system.py b/common/setups/rasr/hybrid_system.py index 0d48bfe1c..cd082144f 100644 --- a/common/setups/rasr/hybrid_system.py +++ b/common/setups/rasr/hybrid_system.py @@ -354,7 +354,9 @@ def nn_recognition( name: str, returnn_config: returnn.ReturnnConfig, checkpoints: Dict[int, returnn.Checkpoint], - acoustic_mixture_path: Optional[tk.Path], # TODO maybe Optional if prior file provided -> automatically construct dummy file + acoustic_mixture_path: Optional[ + tk.Path + ], # TODO maybe Optional if prior file provided -> automatically construct dummy file prior_scales: List[float], pronunciation_scales: List[float], lm_scales: List[float], From 8c9a3d84eec12440b9ed3a8795ce13d2581ad3ac Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Sep 2023 14:18:41 +0200 Subject: [PATCH 20/26] updates from main --- common/datasets/tedlium2/corpus.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/common/datasets/tedlium2/corpus.py b/common/datasets/tedlium2/corpus.py index c530c40e0..420adcdfc 100644 --- a/common/datasets/tedlium2/corpus.py +++ b/common/datasets/tedlium2/corpus.py @@ -28,16 +28,9 @@ def 
get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datas bliss_corpus_dict = download_data_dict(output_prefix=output_prefix).bliss_nist audio_format_options = { - "wav": { - "output_format": "wav", - "codec": "pcm_s16le", - }, - "ogg": {"output_format": "ogg", "codec": "libvorbis", "recover_duration": False}, - "flac": { - "output_format": "flac", - "codec": "flac", - "recover_duration": False, - }, + "wav": {"output_format": "wav", "codec": "pcm_s16le"}, + "ogg": {"output_format": "ogg", "codec": "libvorbis"}, + "flac": {"output_format": "flac", "codec": "flac"}, } converted_bliss_corpus_dict = {} From b7ed5fd07f27c5a3efc96f1ebb8f32bdc71b01a1 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Dec 2023 14:05:32 +0100 Subject: [PATCH 21/26] delete nnpy --- common/setups/rasr/util/nn.py | 436 ---------------------------------- 1 file changed, 436 deletions(-) delete mode 100644 common/setups/rasr/util/nn.py diff --git a/common/setups/rasr/util/nn.py b/common/setups/rasr/util/nn.py deleted file mode 100644 index dd8ba774f..000000000 --- a/common/setups/rasr/util/nn.py +++ /dev/null @@ -1,436 +0,0 @@ -__all__ = [ - "ReturnnRasrTrainingArgs", - "ReturnnRasrDataInput", - "OggZipHdfDataInput", - "HybridArgs", - "NnRecogArgs", - "NnForcedAlignArgs", -] - -import copy -from dataclasses import dataclass, asdict -from typing import Any, Dict, List, Optional, Tuple, Type, TypedDict, Union - -from sisyphus import tk -from sisyphus.delayed_ops import DelayedFormat - -import i6_core.am as am -import i6_core.rasr as rasr -import i6_core.returnn as returnn - -from i6_core.util import MultiPath - -RasrCacheTypes = Union[tk.Path, str, MultiPath, rasr.FlagDependentFlowAttribute] - - -@dataclass(frozen=True) -class ReturnnRasrTrainingArgs: - """ - Options for writing a RASR training config. See `ReturnnRasrTrainingJob`. - Most of them may be disregarded, i.e. the defaults can be left untouched. 
- - :param partition_epochs: if >1, split the full dataset into multiple sub-epochs - :param num_classes: number of classes - :param disregarded_classes: path to file with list of disregarded classes - :param class_label_file: path to file with class labels - :param buffer_size: buffer size for data loading - :param extra_rasr_config: extra RASR config - :param extra_rasr_post_config: extra RASR post config - :param use_python_control: whether to use python control, usually True - """ - - partition_epochs: Optional[int] = None - num_classes: Optional[int] = None - disregarded_classes: Optional[tk.Path] = None - class_label_file: Optional[tk.Path] = None - buffer_size: int = 200 * 1024 - extra_rasr_config: Optional[rasr.RasrConfig] = None - extra_rasr_post_config: Optional[rasr.RasrConfig] = None - use_python_control: bool = True - - -class ReturnnRasrDataInput: - """ - Holds the data for ReturnnRasrTrainingJob. - """ - - def __init__( - self, - name: str, - crp: Optional[rasr.CommonRasrParameters] = None, - alignments: Optional[RasrCacheTypes] = None, - feature_flow: Optional[Union[rasr.FlowNetwork, Dict[str, rasr.FlowNetwork]]] = None, - features: Optional[Union[RasrCacheTypes, Dict[str, RasrCacheTypes]]] = None, - acoustic_mixtures: Optional[Union[tk.Path, str]] = None, - feature_scorers: Optional[Dict[str, Type[rasr.FeatureScorer]]] = None, - shuffle_data: bool = True, - shuffling_parameters: Optional[Dict[str, Any]] = None, - stm: Optional[tk.Path] = None, - glm: Optional[tk.Path] = None, - returnn_rasr_training_args: Optional[ReturnnRasrTrainingArgs] = None, - **kwargs, - ): - """ - - :param name: name of the data - :param crp: common RASR parameters - :param alignments: RASR cache of an alignment - :param feature_flow: acoustic feature flow network or dict of feature flow networks - :param features: RASR cache of acoustic features - :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) - :param 
feature_scorers: RASR feature scorers - :param shuffle_data: shuffle training segments into bins of similar length. The bins are sorted by length. - :param shuffling_parameters: Dict of additional parameters to set for shuffling, - currently only 'segment_order_sort_by_time_length_chunk_size' is supported - :param stm: stm file for scoring - :param glm: glm file for scoring - :param returnn_rasr_training_args: arguments for RETURNN training with RASR - """ - self.name = name - self.crp = crp - self.alignments = alignments - self.feature_flow = feature_flow - self.features = features - self.acoustic_mixtures = acoustic_mixtures - self.feature_scorers = feature_scorers - self.shuffle_data = shuffle_data - self.shuffling_parameters = shuffling_parameters - if shuffle_data and self.shuffling_parameters is None: - # apply the legacy defaults if shuffling_parameters is not set - self.shuffling_parameters = {"segment_order_sort_by_time_length_chunk_size": 384} - self.stm = stm - self.glm = glm - self.returnn_rasr_training_args = returnn_rasr_training_args or ReturnnRasrTrainingArgs() - - def get_training_feature_flow_file(self) -> tk.Path: - """Returns the feature flow file for the RETURNN training with RASR.""" - feature_flow = returnn.ReturnnRasrTrainingJob.create_flow(self.feature_flow, self.alignments) - write_feature_flow = rasr.WriteFlowNetworkJob(feature_flow) - return write_feature_flow.out_flow_file - - def get_training_rasr_config_file(self) -> tk.Path: - """Returns the RASR config file for the RETURNN training with RASR.""" - config, post_config = returnn.ReturnnRasrTrainingJob.create_config( - self.crp, self.alignments, **asdict(self.returnn_rasr_training_args) - ) - config.neural_network_trainer.feature_extraction.file = self.get_training_feature_flow_file() - write_rasr_config = rasr.WriteRasrConfigJob(config, post_config) - return write_rasr_config.out_config - - def get_data_dict(self) -> Dict[str, Union[str, DelayedFormat, tk.Path]]: - """Returns the data 
dict for the ExternSprintDataset to be used in a training ReturnnConfig.""" - config_file = self.get_training_rasr_config_file() - config_str = DelayedFormat("--config={} --*.LOGFILE=nn-trainer.{}.log --*.TASK=1", config_file, self.name) - dataset = { - "class": "ExternSprintDataset", - "sprintTrainerExecPath": rasr.RasrCommand.select_exe(self.crp.nn_trainer_exe, "nn-trainer"), - "sprintConfigStr": config_str, - } - partition_epochs = self.returnn_rasr_training_args.partition_epochs - if partition_epochs is not None: - dataset["partitionEpoch"] = partition_epochs - return dataset - - def build_crp( - self, - am_args, - corpus_object, - concurrent, - segment_path, - lexicon_args, - cart_tree_path=None, - allophone_file=None, - lm_args=None, - ): - """ - constructs and returns a CommonRasrParameters from the given settings and files - """ - crp = rasr.CommonRasrParameters() - rasr.crp_add_default_output(crp) - crp.acoustic_model_config = am.acoustic_model_config(**am_args) - rasr.crp_set_corpus(crp, corpus_object) - crp.concurrent = concurrent - crp.segment_path = segment_path - - crp.lexicon_config = rasr.RasrConfig() - crp.lexicon_config.file = lexicon_args["filename"] - crp.lexicon_config.normalize_pronunciation = lexicon_args["normalize_pronunciation"] - - if "add_from_lexicon" in lexicon_args: - crp.acoustic_model_config.allophones.add_from_lexicon = lexicon_args["add_from_lexicon"] - if "add_all" in lexicon_args: - crp.acoustic_model_config.allophones.add_all = lexicon_args["add_all"] - - if cart_tree_path is not None: - crp.acoustic_model_config.state_tying.type = "cart" - crp.acoustic_model_config.state_tying.file = cart_tree_path - - if lm_args is not None: - crp.language_model_config = rasr.RasrConfig() - crp.language_model_config.type = lm_args["type"] - crp.language_model_config.file = lm_args["filename"] - crp.language_model_config.scale = lm_args["scale"] - - if allophone_file is not None: - crp.acoustic_model_config.allophones.add_from_file = 
allophone_file - - self.crp = crp - - def update_crp_with_shuffle_parameters(self): - if self.shuffle_data: - self.crp.corpus_config.segment_order_shuffle = True - if self.shuffling_parameters is not None: - if "segment_order_sort_by_time_length_chunk_size" in self.shuffling_parameters: - self.crp.corpus_config.segment_order_sort_by_time_length = True - self.crp.corpus_config.segment_order_sort_by_time_length_chunk_size = self.shuffling_parameters[ - "segment_order_sort_by_time_length_chunk_size" - ] - - def update_crp_with( - self, - *, - corpus_file: Optional[tk.Path] = None, - audio_dir: Optional[Union[str, tk.Path]] = None, - corpus_duration: Optional[int] = None, - segment_path: Optional[Union[str, tk.Path]] = None, - concurrent: Optional[int] = None, - shuffle_data: Optional[bool] = None, - shuffling_parameters: Optional[Dict[str, Any]] = None, - ): - if corpus_file is not None: - self.crp.corpus_config.file = corpus_file - if audio_dir is not None: - self.crp.corpus_config.audio_dir = audio_dir - if corpus_duration is not None: - self.crp.corpus_duration = corpus_duration - if segment_path is not None: - self.crp.segment_path = segment_path - if concurrent is not None: - self.crp.concurrent = concurrent - if shuffle_data is not None: - self.shuffle_data = shuffle_data - if shuffling_parameters is not None: - assert self.shuffle_data, "You need to set shuffle_data to true when using shuffling_parameters" - self.shuffling_parameters = shuffling_parameters - self.update_crp_with_shuffle_parameters() - - def get_crp(self, **kwargs) -> rasr.CommonRasrParameters: - """ - constructs and returns a CommonRasrParameters from the given settings and files - :rtype CommonRasrParameters: - """ - if self.crp is None: - self.build_crp(**kwargs) - - if self.shuffle_data: - self.update_crp_with_shuffle_parameters() - - return self.crp - - -class OggZipHdfDataInput: - def __init__( - self, - oggzip_files: List[tk.Path], - alignments: List[tk.Path], - audio: Dict, - 
partition_epoch: int = 1, - seq_ordering: str = "laplace:.1000", - meta_args: Optional[Dict[str, Any]] = None, - ogg_args: Optional[Dict[str, Any]] = None, - hdf_args: Optional[Dict[str, Any]] = None, - acoustic_mixtures: Optional[tk.Path] = None, - ): - """ - :param oggzip_files: zipped ogg files which contain the audio - :param alignments: hdf files which contain dumped RASR alignments - :param audio: e.g. {"features": "raw", "sample_rate": 16000} for raw waveform input with a sample rate of 16 kHz - :param partition_epoch: if >1, split the full dataset into multiple sub-epochs - :param seq_ordering: sort the sequences in the dataset, e.g. "random" or "laplace:.100" - :param meta_args: parameters for the `MetaDataset` - :param ogg_args: parameters for the `OggZipDataset` - :param hdf_args: parameters for the `HdfDataset` - :param acoustic_mixtures: path to a RASR acoustic mixture file (used in System classes, not RETURNN training) - """ - self.oggzip_files = oggzip_files - self.alignments = alignments - self.audio = audio - self.partition_epoch = partition_epoch - self.seq_ordering = seq_ordering - self.meta_args = meta_args - self.ogg_args = ogg_args - self.hdf_args = hdf_args - self.acoustic_mixtures = acoustic_mixtures - - def get_data_dict(self): - return { - "class": "MetaDataset", - "data_map": {"classes": ("hdf", "classes"), "data": ("ogg", "data")}, - "datasets": { - "hdf": { - "class": "HDFDataset", - "files": self.alignments, - "use_cache_manager": True, - **(self.hdf_args or {}), - }, - "ogg": { - "class": "OggZipDataset", - "audio": self.audio, - "partition_epoch": self.partition_epoch, - "path": self.oggzip_files, - "seq_ordering": self.seq_ordering, - "use_cache_manager": True, - **(self.ogg_args or {}), - }, - }, - "seq_order_control_dataset": "ogg", - **(self.meta_args or {}), - } - - -# Attribute names are invalid identifiers, therefore use old syntax -SearchParameters = TypedDict( - "SearchParameters", - { - "beam-pruning": float, - 
"beam-pruning-limit": float, - "lm-state-pruning": Optional[float], - "word-end-pruning": float, - "word-end-pruning-limit": float, - }, -) - - -class LookaheadOptions(TypedDict): - cache_high: Optional[int] - cache_low: Optional[int] - history_limit: Optional[int] - laziness: Optional[int] - minimum_representation: Optional[int] - tree_cutoff: Optional[int] - - -class LatticeToCtmArgs(TypedDict): - best_path_algo: Optional[str] - encoding: Optional[str] - extra_config: Optional[Any] - extra_post_config: Optional[Any] - fill_empty_segments: Optional[bool] - - -class NnRecogArgs(TypedDict): - acoustic_mixture_path: Optional[tk.Path] - checkpoints: Optional[Dict[int, returnn.Checkpoint]] - create_lattice: Optional[bool] - epochs: Optional[List[int]] - eval_best_in_lattice: Optional[bool] - eval_single_best: Optional[bool] - feature_flow_key: str - lattice_to_ctm_kwargs: Optional[LatticeToCtmArgs] - lm_lookahead: bool - lm_scales: List[float] - lookahead_options: Optional[LookaheadOptions] - mem: int - name: str - optimize_am_lm_scale: bool - parallelize_conversion: Optional[bool] - prior_scales: List[float] - pronunciation_scales: List[float] - returnn_config: Optional[returnn.ReturnnConfig] - rtf: int - search_parameters: Optional[SearchParameters] - use_gpu: Optional[bool] - - -KeyedRecogArgsType = Dict[str, Union[Dict[str, Any], NnRecogArgs]] - - -class EpochPartitioning(TypedDict): - dev: int - train: int - - -class NnTrainingArgs(TypedDict): - buffer_size: Optional[int] - class_label_file: Optional[tk.Path] - cpu_rqmt: Optional[int] - device: Optional[str] - disregarded_classes: Optional[Any] - extra_rasr_config: Optional[rasr.RasrConfig] - extra_rasr_post_config: Optional[rasr.RasrConfig] - horovod_num_processes: Optional[int] - keep_epochs: Optional[bool] - log_verbosity: Optional[int] - mem_rqmt: Optional[int] - num_classes: int - num_epochs: int - partition_epochs: Optional[EpochPartitioning] - save_interval: Optional[int] - time_rqmt: Optional[int] - 
use_python_control: Optional[bool] - - -class HybridArgs: - def __init__( - self, - returnn_training_configs: Dict[str, returnn.ReturnnConfig], - returnn_recognition_configs: Dict[str, returnn.ReturnnConfig], - training_args: Union[Dict[str, Any], NnTrainingArgs], - recognition_args: KeyedRecogArgsType, - test_recognition_args: Optional[KeyedRecogArgsType] = None, - ): - """ - ################################################## - :param returnn_training_configs - RETURNN config keyed by training corpus. - ################################################## - :param returnn_recognition_configs - If a config is not found here, the corresponding training config is used - ################################################## - :param training_args: - ################################################## - :param recognition_args: - Configuration for recognition on dev corpora. - ################################################## - :param test_recognition_args: - Additional configuration for recognition on test corpora. Merged with recognition_args. 
- ################################################## - """ - self.returnn_training_configs = returnn_training_configs - self.returnn_recognition_configs = returnn_recognition_configs - self.training_args = training_args - self.recognition_args = recognition_args - self.test_recognition_args = test_recognition_args - - -@dataclass() -class NnRecogArgs: - name: str - returnn_config: returnn.ReturnnConfig - checkpoints: Dict[int, returnn.Checkpoint] - acoustic_mixture_path: tk.Path - prior_scales: List[float] - pronunciation_scales: List[float] - lm_scales: List[float] - optimize_am_lm_scale: bool - feature_flow_key: str - search_parameters: Dict - lm_lookahead: bool - lattice_to_ctm_kwargs: Dict - parallelize_conversion: bool - rtf: int - mem: int - lookahead_options: Optional[Dict] = None - epochs: Optional[List[int]] = None - native_ops: Optional[List[str]] = None - - -class NnForcedAlignArgs(TypedDict): - name: str - target_corpus_keys: List[str] - feature_scorer_corpus_key: str - scorer_model_key: Union[str, List[str], Tuple[str], rasr.FeatureScorer] - epoch: int - base_flow_key: str - tf_flow_key: str - dump_alignment: bool From d4b5bad172439f460d55bacb7d65d6ed1ef7965d Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Wed, 13 Dec 2023 14:06:07 +0100 Subject: [PATCH 22/26] black --- common/baselines/tedlium2/default_tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/baselines/tedlium2/default_tools.py b/common/baselines/tedlium2/default_tools.py index 4913ef3f7..a595e99ec 100644 --- a/common/baselines/tedlium2/default_tools.py +++ b/common/baselines/tedlium2/default_tools.py @@ -14,7 +14,9 @@ PACKAGE = __package__ -RASR_BINARY_PATH = compile_rasr_binaries_i6mode(configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"]) # use most recent RASR +RASR_BINARY_PATH = compile_rasr_binaries_i6mode( + configure_options=["--apptainer-patch=2023-05-08_tensorflow-2.8_v1"] +) # use most recent RASR assert RASR_BINARY_PATH, 
"Please set a specific RASR_BINARY_PATH before running the pipeline" RASR_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_RASR_BINARY_PATH" From 93c563f63e9ee562d69d64fbfafcf7beb46930fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Tue, 23 Jan 2024 11:06:22 +0100 Subject: [PATCH 23/26] Update common/baselines/tedlium2/gmm/baseline_args.py --- common/baselines/tedlium2/gmm/baseline_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/baselines/tedlium2/gmm/baseline_args.py b/common/baselines/tedlium2/gmm/baseline_args.py index f305c9146..f522c9b96 100644 --- a/common/baselines/tedlium2/gmm/baseline_args.py +++ b/common/baselines/tedlium2/gmm/baseline_args.py @@ -323,7 +323,7 @@ def get_sat_args(): "iters": [8, 9, 10], "feature_flow": "uncached_mfcc+context+lda", "pronunciation_scales": [0.0], - "lm_scales": [25, 20, 8.0], + "lm_scales": [8.0, 20.0, 25.0], "lm_lookahead": True, "lookahead_options": None, "create_lattice": True, From d08d4de28e7a78fdee20c4514a618e0296a4e267 Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Tue, 23 Jan 2024 11:53:34 +0100 Subject: [PATCH 24/26] possible fix --- common/baselines/tedlium2/hybrid/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py index 7e952e194..4a73d5d4e 100644 --- a/common/baselines/tedlium2/hybrid/data.py +++ b/common/baselines/tedlium2/hybrid/data.py @@ -160,7 +160,6 @@ def get_corpus_data_inputs( gmm_system.crp["nn-train"].corpus_duration = DURATIONS["train"] gmm_system.add_overlay("dev", "nn-cv") - gmm_system.crp["nn-cv"].corpus_config.file = cv_corpus_path gmm_system.crp["nn-cv"].segment_path = cv_segments gmm_system.crp["nn-cv"].concurrent = 1 gmm_system.crp["nn-cv"].corpus_duration = DURATIONS["dev"] From a556253aa12affbad0ccf572cfd11136dd81890c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Tue, 23 Jan 2024 12:27:17 +0100 
Subject: [PATCH 25/26] fix dict name and fix fsa --- common/baselines/tedlium2/gmm/baseline_args.py | 14 ++++++++------ common/baselines/tedlium2/hybrid/data.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/common/baselines/tedlium2/gmm/baseline_args.py b/common/baselines/tedlium2/gmm/baseline_args.py index f522c9b96..c27a8ddf6 100644 --- a/common/baselines/tedlium2/gmm/baseline_args.py +++ b/common/baselines/tedlium2/gmm/baseline_args.py @@ -5,6 +5,8 @@ from i6_experiments.common.datasets.tedlium2.cart import CartQuestions from i6_experiments.common.baselines.tedlium2.default_tools import SCTK_BINARY_PATH +USE_CORRECTED_APPLICATOR=True + def get_init_args(): samples_options = { @@ -86,7 +88,7 @@ def get_monophone_args(): "extra_merge_args": None, "extra_config": None, "extra_post_config": None, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, } monophone_training_args = { @@ -97,7 +99,7 @@ def get_monophone_args(): "splits": 10, "accs_per_split": 2, "dump_alignment_score_report": True, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, } monophone_recognition_args = { @@ -184,7 +186,7 @@ def get_triphone_args(): "align_extra_rqmt": {"mem": 8}, "accumulate_extra_rqmt": {"mem": 8}, "split_extra_rqmt": {"mem": 8}, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, } triphone_recognition_args = { @@ -250,7 +252,7 @@ def get_vtln_args(): "align_extra_rqmt": {"mem": 8}, "accumulate_extra_rqmt": {"mem": 8}, "split_extra_rqmt": {"mem": 8}, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, }, } @@ -306,7 +308,7 @@ def get_sat_args(): "align_extra_rqmt": {"mem": 8}, "accumulate_extra_rqmt": {"mem": 8}, "split_extra_rqmt": {"mem": 8}, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, } sat_recognition_args = { @@ -371,7 +373,7 @@ def 
get_vtln_sat_args(): "align_extra_rqmt": {"mem": 8}, "accumulate_extra_rqmt": {"mem": 8}, "split_extra_rqmt": {"mem": 8}, - "use_corrected_applicator": False, + "use_corrected_applicator": USE_CORRECTED_APPLICATOR, } vtln_sat_recognition_args = { diff --git a/common/baselines/tedlium2/hybrid/data.py b/common/baselines/tedlium2/hybrid/data.py index 4a73d5d4e..e012a7e1b 100644 --- a/common/baselines/tedlium2/hybrid/data.py +++ b/common/baselines/tedlium2/hybrid/data.py @@ -180,7 +180,7 @@ def get_corpus_data_inputs( allophone_labeling = AllophoneLabeling( silence_phone="[SILENCE]", allophone_file=gmm_system.allophone_files["train"], - state_tying_file=gmm_system.jobs["train"]["state_tying"].out_state_tying, + state_tying_file=gmm_system.jobs["train"]["state_tying_gmm_out"].out_state_tying, ) forced_align_args = ForcedAlignmentArgs( From fe0e8f2fe290478749b05dcb67ed5a41ceb5cbbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Tue, 23 Jan 2024 12:33:37 +0100 Subject: [PATCH 26/26] black --- common/baselines/tedlium2/gmm/baseline_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/baselines/tedlium2/gmm/baseline_args.py b/common/baselines/tedlium2/gmm/baseline_args.py index c27a8ddf6..3910410c3 100644 --- a/common/baselines/tedlium2/gmm/baseline_args.py +++ b/common/baselines/tedlium2/gmm/baseline_args.py @@ -5,7 +5,7 @@ from i6_experiments.common.datasets.tedlium2.cart import CartQuestions from i6_experiments.common.baselines.tedlium2.default_tools import SCTK_BINARY_PATH -USE_CORRECTED_APPLICATOR=True +USE_CORRECTED_APPLICATOR = True def get_init_args():