From ad4da1ccaeaa6342c75a28a41bbe4bb31cb3f859 Mon Sep 17 00:00:00 2001 From: menne Date: Thu, 25 Jul 2019 06:07:13 +0200 Subject: [PATCH] load chunk from sorted ark files --- data_io.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/data_io.py b/data_io.py index 715c4679..347cbc2d 100644 --- a/data_io.py +++ b/data_io.py @@ -23,6 +23,9 @@ def _input_is_wav_file(fea_scp): def _input_is_feature_file(fea_scp): return not _input_is_wav_file(fea_scp) def _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder, lab_opts, output_folder): + def _sort_before_loading(): + return True #For now this decision is hard coded + fea = dict() lab = dict() if _input_is_feature_file(fea_scp): @@ -31,7 +34,11 @@ def _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder elif _input_is_wav_file(fea_scp): kaldi_bin="wav-copy" read_function = read_vec_flt_ark - fea = { k:m for k,m in read_function('ark:'+kaldi_bin+' scp:'+fea_scp+' ark:- |'+fea_opts,output_folder) } + if _sort_before_loading(): + fea_scp_string='"cat '+fea_scp+' | sort -k 1 |"' + else: + fea_scp_string=fea_scp + fea = { k:m for k,m in read_function('ark:'+kaldi_bin+' scp:'+fea_scp_string+' ark:- |'+fea_opts,output_folder) } if not fea_only: lab = { k:v for k,v in read_vec_int_ark('gunzip -c '+lab_folder+'/ali*.gz | '+lab_opts+' '+lab_folder+'/final.mdl ark:- ark:-|',output_folder) if k in fea} # Note that I'm copying only the aligments of the loaded fea fea = {k: v for k, v in fea.items() if k in lab} # This way I remove all the features without an aligment (see log file in alidir "Did not Succeded")