@@ -31,6 +31,7 @@ dir10=/export/corpora/LDC/LDC2018S05/
3131text10=/export/corpora/LDC/LDC2018T14/
3232
3333mgb2_dir=" "
34+ process_xml=" "
3435mer=80
3536
3637. ./utils/parse_options.sh
@@ -108,13 +109,6 @@ cd $top_pwd
108109# prepare MGB2 data
109110if [ ! -z $mgb2_dir ]; then
110111 echo " preparing MGB2 data"
111- # check xml
112- if [ -z $( which xml) ]; then
113- echo " $0 : Could not find tool xml"
114- echo " $0 : To use MGB2 you must have xml installed"
115- echo " $0 : Download and install it from xmlstar.sourceforge.net"
116- exit 1
117- fi
118112
119113 xmldir=$mgb2_dir /train/xml/bw
120114 output_dir=$gale_data /mgb2
@@ -126,12 +120,35 @@ if [ ! -z $mgb2_dir ]; then
126120 mv $output_dir /mgb2 ${output_dir} /.backup
127121 fi
128122
129- ls $mgb2_dir /train/wav/ | while read name; do
130- basename=` basename -s .wav $name `
131- [ ! -e $xmldir /$basename .xml ] && echo " Missing $xmldir /$basename .xml" && exit 1
132- xml sel -t -m ' //segments[@annotation_id="transcript_align"]' -m " segment" -n -v " concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m " element" -v " concat(text(),' ')" $xmldir /$basename .xml | local/add_to_datadir.py $basename $output_dir $mer
133- echo $basename $db_dir /train/wav/$basename .wav >> $output_dir /wav.scp
134- done
123+ if [ $process_xml == ' python' ]; then
124+ echo " using python to process xml file"
125+ # check if bs4 and lxml are installed in python
126+ local/check_tools.sh
127+ ls $mgb2_dir /train/wav/ | while read name; do
128+ basename=` basename -s .wav $name `
129+ [ ! -e $xmldir /$basename .xml ] && echo " Missing $xmldir /$basename .xml" && exit 1
130+ local/process_xml.py $xmldir /$basename .xml - | local/add_to_datadir.py $basename $train_dir $mer
131+ echo $basename $db_dir /train/wav/$basename .wav >> $output_dir /wav.scp
132+ done
133+ elif [ $process_xml == ' xml' ]; then
134+ # check if xml binary exists
135+ if command -v xml > /dev/null 2> /dev/null; then
136+ echo " using xml"
137+ ls $mgb2_dir /train/wav/ | while read name; do
138+ basename=` basename -s .wav $name `
139+ [ ! -e $xmldir /$basename .xml ] && echo " Missing $xmldir /$basename .xml" && exit 1
140+ xml sel -t -m ' //segments[@annotation_id="transcript_align"]' -m " segment" -n -v " concat(@who,' ',@starttime,' ',@endtime,' ',@WMER,' ')" -m " element" -v " concat(text(),' ')" $xmldir /$basename .xml | local/add_to_datadir.py $basename $output_dir $mer
141+ echo $basename $db_dir /train/wav/$basename .wav >> $output_dir /wav.scp
142+ done
143+ else
144+ echo " xml not found, you may use python by '--process-xml python'"
145+ exit 1;
146+ fi
147+ else
148+ # invalid option
149+ echo " $0 : invalid option for --process-xml, choose from 'xml' or 'python'"
150+ exit 1;
151+ fi
135152
136153 # add mgb2 data to training data (GALE/all and wav.scp)
137154 mv $gale_data /all $gale_data /all.gale
0 commit comments