diff --git a/data_io.py b/data_io.py index 1aba86d9..7c1f8076 100644 --- a/data_io.py +++ b/data_io.py @@ -28,6 +28,9 @@ def _input_is_feature_file(fea_scp): return not _input_is_wav_file(fea_scp) def _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder, lab_opts, output_folder): + def _sort_before_loading(): + return True #For now this decision is hard coded + fea = dict() lab = dict() if _input_is_feature_file(fea_scp): @@ -36,10 +39,13 @@ def _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder elif _input_is_wav_file(fea_scp): kaldi_bin = "wav-copy" read_function = read_vec_flt_ark - fea = { - k: m - for k, m in read_function("ark:" + kaldi_bin + " scp:" + fea_scp + " ark:- |" + fea_opts, output_folder) - } + + if _sort_before_loading(): + fea_scp_string='"cat '+fea_scp+' | sort -k 1 |"' + else: + fea_scp_string=fea_scp + fea = { k:m for k,m in read_function('ark:'+kaldi_bin+' scp:'+fea_scp_string+' ark:- |'+fea_opts,output_folder) } + if not fea_only: lab = { k: v