@@ -43,8 +43,7 @@ lexicon=$2 # data/local/dict/lexicon.txt
4343dir=$3 # data/local/lm
4444
4545shift 3
46- giga_dir=( $@ )
47- [ -z $giga_dir ] && echo " Training LM without using external Arabic Gigaword."
46+ giga_dirs=( $@ )
4847
4948for f in " $text " " $lexicon " ; do
5049 [ ! -f $x ] && echo " $0 : No such file $f " && exit 1;
@@ -95,43 +94,44 @@ if [ $stage -le 1 ]; then
9594 echo " training 4-gram lm"
9695 ngram-count -text $dir /train.gz -order 4 -limit-vocab -vocab $dir /wordlist \
9796 -unk -map-unk " <UNK>" -${smoothing} discount -interpolate -lm $dir /gale.o4g.${smoothing} .gz
98- echo " PPL for SWBD1 4gram LM:"
97+ echo " PPL for GALE Arabic 4gram LM:"
9998 ngram -unk -lm $dir /gale.o4g.${smoothing} .gz -ppl $dir /heldout
10099 ngram -unk -lm $dir /gale.o4g.${smoothing} .gz -ppl $dir /heldout -debug 2 >& $dir /4gram.${smoothing} .ppl2
101100fi
102101
102+ if [ ! -z $giga_dirs ]; then
103+ mkdir -p $dir /giga
104+ if [ ! -f $giga_dirs /text.2000k ]; then
105+ echo " Arabic Gigaword text not found, prepare it"
106+ local/prepare_giga.sh $giga_dirs
107+ fi
103108
104- if [ $stage -le 2 ]; then
105- if [ ! -z $giga_dir ]; then
106- echo " Using external data."
107- mkdir -p $dir /giga
108- cp $giga_dir /text.2000k $dir /giga
109- cat $dir /giga/text.2000k | gzip -c > $dir /giga/text2000k.gz
109+ cp $giga_dirs /text.2000k $dir /giga
110+ cat $dir /giga/text.2000k | gzip -c > $dir /giga/text2000k.gz
110111
111- for x in 3 4; do
112- smoothing=" kn"
113- ngram-count -text $dir /giga/text2000k.gz -order $x -limit-vocab \
114- -vocab $dir /wordlist -unk -map-unk " <UNK>" -${smoothing} discount -interpolate \
115- -lm $dir /giga/giga.o${x} g.${smoothing} .gz
116- echo " PPL for Gigaword ${x} gram LM:"
117- ngram -unk -lm $dir /giga/giga.o${x} g.${smoothing} .gz -ppl $dir /heldout
118- ngram -unk -lm $dir /giga/giga.o${x} g.${smoothing} .gz -ppl $dir /heldout -debug 2 \
119- >& $dir /giga/${x} gram.${smoothing} .ppl2
120- compute-best-mix $dir /${x} gram.${smoothing} .ppl2 \
121- $dir /giga/${x} gram.${smoothing} .ppl2 >& $dir /gale_giga_mix.${x} gram.${smoothing} .log
122- grep ' best lambda' $dir /gale_giga_mix.${x} gram.${smoothing} .log | perl -e '
123- $_=<>;
124- s/.*\(//; s/\).*//;
125- @A = split;
126- die "Expecting 2 numbers; found: $_" if(@A!=2);
127- print "$A[0]\n$A[1]\n";' > $dir /gale_giga_mix.${x} gram.${smoothing} .weights
128- gale_weight=$( head -1 $dir /gale_giga_mix.${x} gram.${smoothing} .weights)
129- giga_weight=$( tail -n 1 $dir /gale_giga_mix.${x} gram.${smoothing} .weights)
130- ngram -order $x -lm $dir /gale.o${x} g.${smoothing} .gz -lambda $swb1_weight \
131- -mix-lm $dir /giga/giga.o${x} g.${smoothing} .gz \
132- -unk -write-lm $dir /gale_giga.o${x} g.${smoothing} .gz
133- echo " PPL for GALE + Gigaword ${x} gram LM:"
134- ngram -unk -lm $dir /gale_giga.o${x} g.${smoothing} .gz -ppl $dir /heldout
135- done
136- fi
112+ for x in 3 4; do
113+ smoothing=" kn"
114+ ngram-count -text $dir /giga/text2000k.gz -order $x -limit-vocab \
115+ -vocab $dir /wordlist -unk -map-unk " <UNK>" -${smoothing} discount -interpolate \
116+ -lm $dir /giga/giga.o${x} g.${smoothing} .gz
117+ echo " PPL for Gigaword ${x} gram LM:"
118+ ngram -unk -lm $dir /giga/giga.o${x} g.${smoothing} .gz -ppl $dir /heldout
119+ ngram -unk -lm $dir /giga/giga.o${x} g.${smoothing} .gz -ppl $dir /heldout -debug 2 \
120+ >& $dir /giga/${x} gram.${smoothing} .ppl2
121+ compute-best-mix $dir /${x} gram.${smoothing} .ppl2 \
122+ $dir /giga/${x} gram.${smoothing} .ppl2 >& $dir /gale_giga_mix.${x} gram.${smoothing} .log
123+ grep ' best lambda' $dir /gale_giga_mix.${x} gram.${smoothing} .log | perl -e '
124+ $_=<>;
125+ s/.*\(//; s/\).*//;
126+ @A = split;
127+ die "Expecting 2 numbers; found: $_" if(@A!=2);
128+ print "$A[0]\n$A[1]\n";' > $dir /gale_giga_mix.${x} gram.${smoothing} .weights
129+ gale_weight=$( head -1 $dir /gale_giga_mix.${x} gram.${smoothing} .weights)
130+ giga_weight=$( tail -n 1 $dir /gale_giga_mix.${x} gram.${smoothing} .weights)
131+ ngram -order $x -lm $dir /gale.o${x} g.${smoothing} .gz -lambda $swb1_weight \
132+ -mix-lm $dir /giga/giga.o${x} g.${smoothing} .gz \
133+ -unk -write-lm $dir /gale_giga.o${x} g.${smoothing} .gz
134+ echo " PPL for GALE + Gigaword ${x} gram LM:"
135+ ngram -unk -lm $dir /gale_giga.o${x} g.${smoothing} .gz -ppl $dir /heldout
136+ done
137137fi
0 commit comments