Skip to content

Commit b774e79

Browse files
committed
update hybrid data
1 parent 8ecc34e commit b774e79

File tree

1 file changed

+7
-16
lines changed
  • common/baselines/tedlium2/hybrid

1 file changed

+7
-16
lines changed

common/baselines/tedlium2/hybrid/data.py

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ def build_hdf_data_input(
2626
partition_epoch: int = 1,
2727
acoustic_mixtures: Optional = None,
2828
seq_ordering: str = "sorted",
29-
):
29+
) -> HdfDataInput:
3030
"""
3131
Dumps features and alignments from RASR into hdfs, to enable full RETURNN training
3232
:param features: Feature bundle generated by the dump_features_for_hybrid_training function
@@ -37,7 +37,7 @@ def build_hdf_data_input(
3737
:param partition_epoch: Partition epoch for the alignment dataset, mainly relevant for training dataset
3838
:param acoustic_mixtures: Acoustic mixture file from the GMM for prior calculation, most likely going to be replaced
3939
:param seq_ordering: sequence ordering for the align dataset, usually sorted for dev/eval and laplace for train
40-
:return:
40+
:return: HdfDataInput with corresponding hdf datasets
4141
"""
4242

4343
feat_dataset = {
@@ -56,14 +56,11 @@ def build_hdf_data_input(
5656
"seq_list_filter_file": segment_list,
5757
}
5858

59-
feat_job = ReturnnDumpHDFJob(
60-
data=feat_dataset,
61-
returnn_python_exe=RETURNN_EXE,
62-
returnn_root=RETURNN_RC_ROOT,
63-
)
59+
feat_job = ReturnnDumpHDFJob(data=feat_dataset, returnn_python_exe=RETURNN_EXE, returnn_root=RETURNN_RC_ROOT)
6460
if alias_prefix is not None:
6561
feat_job.add_alias(alias_prefix + "/dump_features")
6662
feat_hdf = feat_job.out_hdf
63+
6764
align_dataset = {
6865
"class": "SprintCacheDataset",
6966
"data": {
@@ -104,7 +101,7 @@ def dump_features_for_hybrid_training(
104101
:param gmm_system: GMM system to get corpora from
105102
:param feature_extraction_args: Args for the feature extraction
106103
:param feature_extraction_class: Feature extraction class/job to be used for extraction
107-
:return:
104+
:return: paths to the train, cv and devtrain features
108105
"""
109106
features = {}
110107
for name in ["nn-train", "nn-cv", "nn-devtrain"]:
@@ -119,7 +116,6 @@ def get_corpus_data_inputs(
119116
feature_extraction_args: Dict[str, Any],
120117
feature_extraction_class: Callable[[Any], FeatureExtractionJob],
121118
alias_prefix: Optional[str] = None,
122-
remove_faulty_segments: bool = False,
123119
) -> Tuple[
124120
Dict[str, HdfDataInput],
125121
Dict[str, HdfDataInput],
@@ -134,7 +130,7 @@ def get_corpus_data_inputs(
134130
:param feature_extraction_args: Args for the feature extraction of the hybrid features (might be different from GMM)
135131
:param feature_extraction_class: Feature extraction class/job to be used for extraction
136132
:param alias_prefix: Prefix for naming of experiments
137-
:return:
133+
:return: HdfDataInputs for the train sets and ReturnnRasrDataInputs for the dev and train sets
138134
"""
139135

140136
train_corpus_path = gmm_system.corpora["train"].corpus_file
@@ -147,11 +143,6 @@ def get_corpus_data_inputs(
147143
total_train_num_segments = NUM_SEGMENTS["train"]
148144

149145
all_train_segments = corpus_recipe.SegmentCorpusJob(train_corpus_path, 1).out_single_segment_files[1]
150-
if remove_faulty_segments:
151-
all_train_segments = corpus_recipe.FilterSegmentsByListJob(
152-
segment_files={1: all_train_segments},
153-
filter_list=["TED-LIUM-realease2/AndrewMcAfee_2013/23", "TED-LIUM-realease2/iOTillettWright_2012X/43"],
154-
).out_single_segment_files[1]
155146
cv_segments = corpus_recipe.SegmentCorpusJob(cv_corpus_path, 1).out_single_segment_files[1]
156147

157148
dev_train_size = 500 / total_train_num_segments
@@ -213,7 +204,7 @@ def get_corpus_data_inputs(
213204
allophone_labeling=allophone_labeling,
214205
alias_prefix=alias_prefix + "/nn_train_data",
215206
partition_epoch=5,
216-
acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, # TODO: NN Mixtures
207+
acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures,
217208
seq_ordering="laplace:.1000",
218209
)
219210
tk.register_output(f"{alias_prefix}/nn_train_data/features", nn_train_data.features)

0 commit comments

Comments
 (0)