1414cmd=
1515block_size=256
1616stage=0
17- frames_per_job=3000000
17+ frames_per_job=1500000
1818left_context=13
1919right_context=9
2020# TODO: add lang2weight support
@@ -86,9 +86,7 @@ for lang in $(seq 0 $[$num_langs-1]);do
8686 done
8787 num_chunks=$( fgrep num_chunks ${multi_egs_dir[$lang]} /info.txt | awk ' {print $2}' )
8888 curr_frames_per_chunk_avg=` awk ' /^frames_per_chunk_avg/ {print $2;}' ${multi_egs_dir[$lang]} /info.txt`
89- tot_num_archives=$[tot_num_archives+ (num_chunks* curr_frames_per_chunk_avg)]
90- train_scp_list=" $train_scp_list ${args[$lang]} /train.scp"
91- num_scps=$( fgrep num_scp_files ${multi_egs_dir[$lang]} /info.txt | awk ' {print $2}' )
89+ tot_num_archives=$[tot_num_archives+ ((num_chunks* curr_frames_per_chunk_avg)/frames_per_job+1)]
9290 tot_num_scps=$[tot_num_scps+num_scps]
9391 train_diagnostic_scp_list=" $train_diagnostic_scp_list ${args[$lang]} /train_subset.scp"
9492 valid_diagnostic_scp_list=" $valid_diagnostic_scp_list ${args[$lang]} /valid_subset.scp"
@@ -105,14 +103,13 @@ for lang in $(seq 0 $[$num_langs-1]);do
105103 fi
106104 done
107105done
108- # num_scp_files=$[(tot_num_archives)/frames_per_job +1]
109- num_scp_files=$tot_num_scps
106+ num_scp_files=$tot_num_archives
110107echo " num_scp_files $num_scp_files " >> $megs_dir /info.txt
111108sed_cmd=
112109for lang in $( seq 0 $[$num_langs -1]) ; do
113110 lang_name=${lang_list[$lang]}
114- weight=${ lang2weight[$lang]}
115- sed_cmd=" $sed_cmd s/.*lang=${lang_name} /$weight /;"
111+ weight=` echo $ lang2weight | tr ' , ' ' ' | cut -d ' ' -f$ [$lang +1] `
112+ sed_cmd=" $sed_cmd s/.*lang=${lang_name} .* /$weight /;"
116113done
117114
118115dir=$megs_dir /
@@ -139,7 +136,9 @@ if [ $stage -le 0 ]; then
139136 # the shuffling is probably not required because we will do it once again before
140137 # merging examples
141138 cat $input_list | utils/shuffle_list.pl > $dir /train.$j .scp
142- sed " $sed_cmd " < $dir /train.$j .scp > $dir /train.weight.$j .scp
139+ sed " $sed_cmd " < <( awk ' {print $1}' $dir /train.$j .scp) > $dir /train.weight.$j .ark.col2
140+ paste -d ' ' <( awk ' {print $1}' $dir /train.$j .scp) $dir /train.weight.$j .ark.col2 > $dir /train.weight.$j .ark
141+ rm $dir /train.weight.$j .ark.col2
143142 done
144143fi
145144
@@ -151,7 +150,9 @@ if [ $stage -le 1 ]; then
151150 awk -v lang_name=" $lang_name " \
152151 ' {if ($1 !~ /?/){$1=$1"?lang=" lang_name; print;} else {$1=$1"&lang=" lang_name; print;}}'
153152 done > $dir /${subset_file} .scp
154- sed " $sed_cmd " < $dir /${subset_file} .scp > $dir /${subset_file} .weight.scp
153+ sed " $sed_cmd " < <( awk ' {print $1}' $dir /${subset_file} .scp) > $dir /${subset_file} .weight.ark.col2
154+ paste -d ' ' <( awk ' {print $1}' $dir /${subset_file} .scp) $dir /${subset_file} .weight.ark.col2 > $dir /${subset_file} .weight.ark
155+ rm $dir /${subset_file} .weight.ark.col2
155156 done
156157fi
157158
0 commit comments