Skip to content

Commit 41ea8cf

Browse files
authored
[egs] Some fixes to getting data in heroico recipe (kaldi-asr#3021)
1 parent a51bd96 commit 41ea8cf

File tree

4 files changed: 17 additions and 45 deletions.

4 files changed: 17 additions and 45 deletions.

egs/heroico/s5/cmd.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
10 10
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
11 11
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
12 12

13+
export cmd="retry.pl queue.pl"
13 14
export train_cmd="retry.pl queue.pl"
14 15
export decode_cmd="retry.pl queue.pl --mem 2G"
15 16

egs/heroico/s5/local/heroico_download.sh

Lines changed: 0 additions & 37 deletions
This file was deleted.

egs/heroico/s5/local/subs_prepare_data.pl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
19 19

20 20
# input and output files
21 21

22-
my $corpus = "OpenSubtitles2018.en-es.es";
22+
my $corpus = "OpenSubtitles.en-es.es";
23 23
my $symbol_table = "data/lang/words.txt";
24 24
my $filtered = "data/local/tmp/subs/lm/es.txt";
25 25
my $oovs = "data/local/tmp/subs/lm/oovs.txt";

egs/heroico/s5/run.sh

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,11 @@ stage=0
9 9
datadir=/export/corpora5/LDC/LDC2006S37
10 10

11 11
# The corpus and lexicon are on openslr.org
12-
speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
13-
lexicon="http://www.openslr.org/resources/34/santiago.tar.gz"
12+
#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
13+
lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz"
14 14

15 15
# Location of the Movie subtitles text corpus
16-
subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"
16+
subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"
17 17

18 18
. utils/parse_options.sh
19 19

@@ -26,14 +26,22 @@ set -u
26 26
tmpdir=data/local/tmp
27 27

28 28
if [ $stage -le 0 ]; then
29-
# download the corpus from openslr
30-
local/heroico_download.sh $speech $lexicon
29+
if [ ! -d $datadir ]; then
30+
echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
31+
echo " and set $datadir to the directory where it is located."
32+
exit 1
33+
fi
34+
if [ ! -s santiago.txt ]; then
35+
echo "$0: downloading the lexicon"
36+
wget -c http://www.openslr.org/resources/34/santiago.tar.gz
37+
tar -xvzf santiago.tar.gz
38+
fi
31 39
# Get data for lm training
32-
local/subs_download.sh $subs_src
40+
local/subs_download.sh $subtitles_url
33 41
fi
34 42

35 43
if [ $stage -le 1 ]; then
36-
echo "Makin lists for building models."
44+
echo "Making lists for building models."
37 45
local/prepare_data.sh $datadir
38 46
fi
39 47

0 commit comments

Comments
 (0)