@@ -26,7 +26,7 @@ def build_hdf_data_input(
     partition_epoch: int = 1,
     acoustic_mixtures: Optional = None,
     seq_ordering: str = "sorted",
-):
+) -> HdfDataInput:
     """
     Dumps features and alignments from RASR into hdfs, to enable full RETURNN training
     :param features: Feature bundle generated by the dump_features_for_hybrid_training function
@@ -37,7 +37,7 @@ def build_hdf_data_input(
     :param partition_epoch: Partition epoch for the alignment dataset, mainly relevant for training dataset
     :param acoustic_mixtures: Acoustic mixture file from the GMM for prior calculation, most likely going to be replaced
     :param seq_ordering: sequence ordering for the align dataset, usually sorted for dev/eval and laplace for train
-    :return:
+    :return: HdfDataInput with the corresponding hdf datasets
     """

     feat_dataset = {
@@ -56,14 +56,11 @@ def build_hdf_data_input(
5656 "seq_list_filter_file" : segment_list ,
5757 }
5858
59- feat_job = ReturnnDumpHDFJob (
60- data = feat_dataset ,
61- returnn_python_exe = RETURNN_EXE ,
62- returnn_root = RETURNN_RC_ROOT ,
63- )
59+ feat_job = ReturnnDumpHDFJob (data = feat_dataset , returnn_python_exe = RETURNN_EXE , returnn_root = RETURNN_RC_ROOT )
6460 if alias_prefix is not None :
6561 feat_job .add_alias (alias_prefix + "/dump_features" )
6662 feat_hdf = feat_job .out_hdf
63+
6764 align_dataset = {
6865 "class" : "SprintCacheDataset" ,
6966 "data" : {
@@ -104,7 +101,7 @@ def dump_features_for_hybrid_training(
     :param gmm_system: GMM system to get corpora from
     :param feature_extraction_args: Args for the feature extraction
     :param feature_extraction_class: Feature extraction class/job to be used for extraction
-    :return:
+    :return: paths to the train, cv, and devtrain features
     """
     features = {}
     for name in ["nn-train", "nn-cv", "nn-devtrain"]:
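For reference, a sketch of the mapping the loop above builds up, under the assumption (from the docstring) that each value is the extracted feature bundle for that subset:

# Sketch of the returned dict; values are feature bundles per subset.
features = {
    "nn-train": ...,     # features for the training subset
    "nn-cv": ...,        # features for the cross-validation subset
    "nn-devtrain": ...,  # features for the held-out devtrain subset
}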
@@ -119,7 +116,6 @@ def get_corpus_data_inputs(
     feature_extraction_args: Dict[str, Any],
     feature_extraction_class: Callable[[Any], FeatureExtractionJob],
     alias_prefix: Optional[str] = None,
-    remove_faulty_segments: bool = False,
 ) -> Tuple[
     Dict[str, HdfDataInput],
     Dict[str, HdfDataInput],
@@ -134,7 +130,7 @@ def get_corpus_data_inputs(
     :param feature_extraction_args: Args for the feature extraction of the hybrid features (might be different from GMM)
     :param feature_extraction_class: Feature extraction class/job to be used for extraction
     :param alias_prefix: Prefix for naming of experiments
-    :return:
+    :return: HdfDataInputs for the train sets and ReturnnRasrDataInputs for the dev and test sets
     """

     train_corpus_path = gmm_system.corpora["train"].corpus_file
@@ -147,11 +143,6 @@ def get_corpus_data_inputs(
     total_train_num_segments = NUM_SEGMENTS["train"]

     all_train_segments = corpus_recipe.SegmentCorpusJob(train_corpus_path, 1).out_single_segment_files[1]
-    if remove_faulty_segments:
-        all_train_segments = corpus_recipe.FilterSegmentsByListJob(
-            segment_files={1: all_train_segments},
-            filter_list=["TED-LIUM-realease2/AndrewMcAfee_2013/23", "TED-LIUM-realease2/iOTillettWright_2012X/43"],
-        ).out_single_segment_files[1]
     cv_segments = corpus_recipe.SegmentCorpusJob(cv_corpus_path, 1).out_single_segment_files[1]

     dev_train_size = 500 / total_train_num_segments
@@ -213,7 +204,7 @@ def get_corpus_data_inputs(
         allophone_labeling=allophone_labeling,
         alias_prefix=alias_prefix + "/nn_train_data",
         partition_epoch=5,
-        acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures,  # TODO: NN Mixtures
+        acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures,
         seq_ordering="laplace:.1000",
     )
     tk.register_output(f"{alias_prefix}/nn_train_data/features", nn_train_data.features)
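Finally, a hedged sketch of calling get_corpus_data_inputs after this change (the remove_faulty_segments flag no longer exists); gmm_system, feature_args, and the alias string are placeholders, and only the first two entries of the returned tuple are unpacked:

# Sketch: invoking the updated signature with placeholder arguments.
nn_train_data_inputs, nn_cv_data_inputs, *rest = get_corpus_data_inputs(
    gmm_system=gmm_system,                          # trained GMM system (placeholder)
    feature_extraction_args=feature_args,           # hybrid feature args (placeholder)
    feature_extraction_class=FeatureExtractionJob,  # job class named in the signature
    alias_prefix="ted2/hybrid_baseline",            # placeholder experiment prefix
)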