Skip to content

Commit 2c216c9

Browse files
authored
[egs] Add script to prepare Arabic Gigawords as external data for LM training, add results (#3917)
1 parent 84d19f6 commit 2c216c9

File tree

3 files changed

+76
-35
lines changed

3 files changed

+76
-35
lines changed

egs/gale_arabic/s5d/RESULTS

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
tri1
2+
%WER 40.91 [ 32272 / 78894, 2147 ins, 7478 del, 22647 sub ] exp/tri1/decode/wer_12_0.5
3+
tri2b
4+
%WER 36.68 [ 28936 / 78894, 2752 ins, 5682 del, 20502 sub ] exp/tri2b/decode/wer_13_0.0
5+
tri3b
6+
%WER 35.35 [ 27892 / 78894, 2587 ins, 7024 del, 18281 sub ] exp/tri3b/decode/wer_14_0.0
7+
8+
chain for dev set
9+
%WER 16.60 [ 13094 / 78894, 1314 ins, 2992 del, 8788 sub ] exp/chain/tdnn_1a_sp/decode_dev/wer_9_0.0
10+
rnnlm-rescoring for dev set
11+
%WER 15.02 [ 11846 / 78894, 1248 ins, 2836 del, 7762 sub ] exp/chain/tdnn_1a_sp/decode_dev_rnnlm_1e_0.45/wer_9_0.0
12+
13+
chain for test_p2 set
14+
%WER 14.95 [ 10416 / 69668, 1129 ins, 2593 del, 6694 sub ] exp/chain/tdnn_1a_sp/decode_test_p2/wer_9_0.0
15+
rnnlm-rescoring for test_p2 set
16+
%WER 13.51 [ 9413 / 69668, 1059 ins, 2517 del, 5837 sub ] exp/chain/tdnn_1a_sp/decode_test_p2_rnnlm_1e_0.45/wer_9_0.0
17+
18+
rnnlm-rescoring for mt_eval set
19+
%WER 12.02 [ 10829 / 90112, 1483 ins, 2401 del, 6945 sub ] exp/chain/tdnn_1a_sp/decode_mt_all_rnnlm_1e_0.45/wer_9_0.0

egs/gale_arabic/s5d/local/gale_train_lms.sh

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,7 @@ lexicon=$2 # data/local/dict/lexicon.txt
4343
dir=$3 # data/local/lm
4444

4545
shift 3
46-
giga_dir=( $@ )
47-
[ -z $giga_dir ] && echo "Training LM without using external Arabic Gigaword."
46+
giga_dirs=( $@ )
4847

4948
for f in "$text" "$lexicon"; do
5049
[ ! -f $x ] && echo "$0: No such file $f" && exit 1;
@@ -95,43 +94,44 @@ if [ $stage -le 1 ]; then
9594
echo "training 4-gram lm"
9695
ngram-count -text $dir/train.gz -order 4 -limit-vocab -vocab $dir/wordlist \
9796
-unk -map-unk "<UNK>" -${smoothing}discount -interpolate -lm $dir/gale.o4g.${smoothing}.gz
98-
echo "PPL for SWBD1 4gram LM:"
97+
echo "PPL for GALE Arabic 4gram LM:"
9998
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout
10099
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout -debug 2 >& $dir/4gram.${smoothing}.ppl2
101100
fi
102101

102+
# Interpolate the GALE LM with an LM trained on external Arabic Gigaword text.
# Skipped entirely when no Gigaword directory was passed on the command line.
if [ ! -z $giga_dirs ]; then
  mkdir -p $dir/giga
  # Prepare the 2M-line Gigaword text if it is not already there.
  if [ ! -f $giga_dirs/text.2000k ]; then
    echo "Arabic Gigaword text not found, prepare it"
    local/prepare_giga.sh $giga_dirs
  fi

  cp $giga_dirs/text.2000k $dir/giga
  cat $dir/giga/text.2000k | gzip -c > $dir/giga/text2000k.gz

  for x in 3 4; do
    smoothing="kn"
    # Train an order-$x Gigaword LM restricted to the GALE word list.
    ngram-count -text $dir/giga/text2000k.gz -order $x -limit-vocab \
      -vocab $dir/wordlist -unk -map-unk "<UNK>" -${smoothing}discount -interpolate \
      -lm $dir/giga/giga.o${x}g.${smoothing}.gz
    echo "PPL for Gigaword ${x}gram LM:"
    ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
    ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout -debug 2 \
      >& $dir/giga/${x}gram.${smoothing}.ppl2
    # Find the optimal interpolation weights on the heldout set.
    compute-best-mix $dir/${x}gram.${smoothing}.ppl2 \
      $dir/giga/${x}gram.${smoothing}.ppl2 >& $dir/gale_giga_mix.${x}gram.${smoothing}.log
    grep 'best lambda' $dir/gale_giga_mix.${x}gram.${smoothing}.log | perl -e '
      $_=<>;
      s/.*\(//; s/\).*//;
      @A = split;
      die "Expecting 2 numbers; found: $_" if(@A!=2);
      print "$A[0]\n$A[1]\n";' > $dir/gale_giga_mix.${x}gram.${smoothing}.weights
    gale_weight=$(head -1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
    giga_weight=$(tail -n 1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
    # BUGFIX: was "-lambda $swb1_weight" — an undefined variable left over from
    # the Switchboard recipe this script was adapted from; the weight computed
    # above for the GALE LM is $gale_weight.
    ngram -order $x -lm $dir/gale.o${x}g.${smoothing}.gz -lambda $gale_weight \
      -mix-lm $dir/giga/giga.o${x}g.${smoothing}.gz \
      -unk -write-lm $dir/gale_giga.o${x}g.${smoothing}.gz
    echo "PPL for GALE + Gigaword ${x}gram LM:"
    ngram -unk -lm $dir/gale_giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
  done
fi
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash

# Prepare Arabic Gigaword (LDC2011T11) text as external data for LM training.
# Usage: local/prepare_giga.sh <giga-dir>
# Reads the corpus from $source_dir, converts each gzipped file with
# local/arabic_convert.py, keeps the first $num lines, and normalizes them
# with local/normalize_transcript_BW.pl into <giga-dir>/text.${suffix}.

giga_dir=$1

source_dir=/export/corpora/LDC/LDC2011T11/arb_gw_5
num=2000000
suffix="2000k"

# BUGFIX: guard against a missing argument — otherwise files would be
# written into "/text.arb" etc. at the filesystem root.
[ -z "$giga_dir" ] && echo "Usage: $0 <giga-dir>" && exit 1;

[ ! -d $source_dir ] && echo "source Arabic Gigaword does not exist." && exit 1;

mkdir -p $giga_dir/
[ -f $giga_dir/text ] && mv $giga_dir/text $giga_dir/text.bkp

# BUGFIX: the loop below appends (>>); remove any previous output so a
# rerun does not duplicate the corpus text.
rm -f $giga_dir/text.arb

find $source_dir/data/ -name "*.gz" | while read -r file; do
  gunzip -c "$file" | local/arabic_convert.py - >> $giga_dir/text.arb
done

head -n $num $giga_dir/text.arb > $giga_dir/text.arb.${suffix}
local/normalize_transcript_BW.pl $giga_dir/text.arb.${suffix} $giga_dir/text.${suffix}

echo "finish preparing Arabic Gigaword"
exit 0

0 commit comments

Comments
 (0)