11import logging
2+ import math
23import os
34import subprocess
45import sys
56import tempfile
67
78from paths import get_binary
8- from generate_wp import language_model_from_word_sequence
9+ from metasentence import MetaSentence
910
# Absolute path to the "mkgraph" executable, resolved at import time via the
# project's paths helper. NOTE(review): presumably used further down this file
# to compile the generated G.txt into a decoding graph — confirm against the
# full file; only the lookup itself is visible here.
MKGRAPH_PATH = get_binary("mkgraph")
1112
def make_bigram_lm_fst(word_sequence):
    '''
    Use the given token sequence to make a bigram language model
    in OpenFST plain text format.

    Parameters:
        word_sequence: iterable of token strings. It is padded with
            '[oov]' sentinels: two leading tokens provide a start
            context, one trailing token guarantees every real word
            has a successor.

    Returns:
        str: the model in AT&T/OpenFST text format — one arc per line
        ("src dst ilabel olabel weight"), followed by a single
        final-state line ("state 0").
    '''
    # list() accepts any iterable (tuple, generator) while preserving
    # the original list-concatenation behavior.
    word_sequence = ['[oov]', '[oov]'] + list(word_sequence) + ['[oov]']

    # Map each word to the set of distinct words observed to follow it.
    bigrams = {}
    for prev_word, word in zip(word_sequence, word_sequence[1:]):
        bigrams.setdefault(prev_word, set()).add(word)

    # Assign 1-based state ids lazily, in first-use order.
    node_ids = {}
    def get_node_id(word):
        node_id = node_ids.get(word, len(node_ids) + 1)
        node_ids[word] = node_id
        return node_id

    # Collect arc lines and join once at the end instead of the original
    # quadratic `output += ...` accumulation.
    lines = []
    for from_word in sorted(bigrams.keys()):
        from_id = get_node_id(from_word)

        successors = bigrams[from_word]
        # Uniform distribution over observed successors, as a -log weight.
        # Kept as -log(1/n) (not log(n)) so the n == 1 case still prints
        # "-0.000000", exactly as before. The else branch is defensive:
        # by construction every key in `bigrams` has >= 1 successor.
        if len(successors) > 0:
            weight = -math.log(1.0 / len(successors))
        else:
            weight = 0

        for to_word in sorted(successors):
            to_id = get_node_id(to_word)
            lines.append('%d %d %s %s %f\n' % (from_id, to_id, to_word, to_word, weight))

    # Mark the highest-numbered state as final with weight 0.
    lines.append("%d 0\n" % (len(node_ids)))

    return "".join(lines)
50+
1251def get_language_model (kaldi_seq , proto_langdir = 'PROTO_LANGDIR' ):
1352 """Generates a language model to fit the text
1453
@@ -32,7 +71,7 @@ def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
3271 os .symlink (abspath , dstpath )
3372
3473 # Generate a textual FST
35- txt_fst = language_model_from_word_sequence (kaldi_seq )
74+ txt_fst = make_bigram_lm_fst (kaldi_seq )
3675 txt_fst_file = os .path .join (lang_model_dir , 'G.txt' )
3776 open (txt_fst_file , 'w' ).write (txt_fst )
3877
0 commit comments