Skip to content

Commit 90bfb5c

Browse files
Srikanth MADIKERI authored and qindazhu committed
steps/nnet3/chain2/combine_egs.sh: combines two processed egs folders. This is useful for multitask training.
1 parent 6302d50 commit 90bfb5c

File tree

1 file changed

+11
-10
lines changed

1 file changed

+11
-10
lines changed

egs/wsj/s5/steps/nnet3/chain2/combine_egs.sh

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ set -e
1414
cmd=
1515
block_size=256
1616
stage=0
17-
frames_per_job=3000000
17+
frames_per_job=1500000
1818
left_context=13
1919
right_context=9
2020
# TODO: add lang2weight support
@@ -86,9 +86,7 @@ for lang in $(seq 0 $[$num_langs-1]);do
8686
done
8787
num_chunks=$(fgrep num_chunks ${multi_egs_dir[$lang]}/info.txt | awk '{print $2}')
8888
curr_frames_per_chunk_avg=`awk '/^frames_per_chunk_avg/ {print $2;}' ${multi_egs_dir[$lang]}/info.txt`
89-
tot_num_archives=$[tot_num_archives+(num_chunks*curr_frames_per_chunk_avg)]
90-
train_scp_list="$train_scp_list ${args[$lang]}/train.scp"
91-
num_scps=$(fgrep num_scp_files ${multi_egs_dir[$lang]}/info.txt | awk '{print $2}')
89+
tot_num_archives=$[tot_num_archives+((num_chunks*curr_frames_per_chunk_avg)/frames_per_job+1)]
9290
tot_num_scps=$[tot_num_scps+num_scps]
9391
train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_subset.scp"
9492
valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_subset.scp"
@@ -105,14 +103,13 @@ for lang in $(seq 0 $[$num_langs-1]);do
105103
fi
106104
done
107105
done
108-
# num_scp_files=$[(tot_num_archives)/frames_per_job +1]
109-
num_scp_files=$tot_num_scps
106+
num_scp_files=$tot_num_archives
110107
echo "num_scp_files $num_scp_files" >> $megs_dir/info.txt
111108
sed_cmd=
112109
for lang in $(seq 0 $[$num_langs-1]);do
113110
lang_name=${lang_list[$lang]}
114-
weight=${lang2weight[$lang]}
115-
sed_cmd="$sed_cmd s/.*lang=${lang_name}/$weight/;"
111+
weight=`echo $lang2weight | tr ',' ' ' | cut -d ' ' -f$[$lang+1]`
112+
sed_cmd="$sed_cmd s/.*lang=${lang_name}.*/$weight/;"
116113
done
117114

118115
dir=$megs_dir/
@@ -139,7 +136,9 @@ if [ $stage -le 0 ]; then
139136
# the shuffling is probably not required because we will do it once again before
140137
# merging examples
141138
cat $input_list | utils/shuffle_list.pl > $dir/train.$j.scp
142-
sed "$sed_cmd" < $dir/train.$j.scp > $dir/train.weight.$j.scp
139+
sed "$sed_cmd" < <(awk '{print $1}' $dir/train.$j.scp) > $dir/train.weight.$j.ark.col2
140+
paste -d ' ' <(awk '{print $1}' $dir/train.$j.scp) $dir/train.weight.$j.ark.col2 > $dir/train.weight.$j.ark
141+
rm $dir/train.weight.$j.ark.col2
143142
done
144143
fi
145144

@@ -151,7 +150,9 @@ if [ $stage -le 1 ]; then
151150
awk -v lang_name="$lang_name" \
152151
'{if ($1 !~ /?/){$1=$1"?lang=" lang_name; print;} else {$1=$1"&lang=" lang_name; print;}}'
153152
done > $dir/${subset_file}.scp
154-
sed "$sed_cmd" < $dir/${subset_file}.scp > $dir/${subset_file}.weight.scp
153+
sed "$sed_cmd" < <(awk '{print $1}' $dir/${subset_file}.scp) > $dir/${subset_file}.weight.ark.col2
154+
paste -d ' ' <(awk '{print $1}' $dir/${subset_file}.scp) $dir/${subset_file}.weight.ark.col2 > $dir/${subset_file}.weight.ark
155+
rm $dir/${subset_file}.weight.ark.col2
155156
done
156157
fi
157158

0 commit comments

Comments
 (0)