@@ -9,11 +9,11 @@ stage=0
99datadir=/export/corpora5/LDC/LDC2006S37
1010
1111# The corpus and lexicon are on openslr.org
12- speech =" http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
13- lexicon =" http://www.openslr.org/resources/34/santiago.tar.gz"
12+ # speech_url ="http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
13+ lexicon_url =" http://www.openslr.org/resources/34/santiago.tar.gz"
1414
1515# Location of the Movie subtitles text corpus
16- subs_src =" http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"
16+ subtitles_url =" http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"
1717
1818. utils/parse_options.sh
1919
@@ -26,14 +26,22 @@ set -u
2626tmpdir=data/local/tmp
2727
2828if [ $stage -le 0 ]; then
29- # download the corpus from openslr
30- local/heroico_download.sh $speech $lexicon
29+ if [ ! -d $datadir ]; then
30+ echo " $0 : please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
31+ echo " and set $datadir to the directory where it is located."
32+ exit 1
33+ fi
34+ if [ ! -s santiago.txt ]; then
35+ echo " $0 : downloading the lexicon"
36+ wget -c http://www.openslr.org/resources/34/santiago.tar.gz
37+ tar -xvzf santiago.tar.gz
38+ fi
3139 # Get data for lm training
32- local/subs_download.sh $subs_src
40+ local/subs_download.sh $subtitles_url
3341fi
3442
3543if [ $stage -le 1 ]; then
36- echo " Makin lists for building models."
44+ echo " Making lists for building models."
3745 local/prepare_data.sh $datadir
3846fi
3947
0 commit comments