Skip to content

Commit 105413d

Browse files
committed
move language_model_from_word_sequence into language_model.py
1 parent 3149bb9 commit 105413d

File tree

2 files changed

+41
-62
lines changed

2 files changed

+41
-62
lines changed

gentle/generate_wp.py

Lines changed: 0 additions & 60 deletions
This file was deleted.

gentle/language_model.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,53 @@
11
import logging
2+
import math
23
import os
34
import subprocess
45
import sys
56
import tempfile
67

78
from paths import get_binary
8-
from generate_wp import language_model_from_word_sequence
9+
from metasentence import MetaSentence
910

1011
MKGRAPH_PATH = get_binary("mkgraph")
1112

13+
def make_bigram_lm_fst(word_sequence):
    """
    Use the given token sequence to make a bigram language model
    in OpenFST plain text format.

    The sequence is padded with two leading and one trailing '[oov]'
    token. Every distinct word becomes one node, and every observed
    (previous word, word) pair becomes one arc whose input and output
    labels are both the destination word. All arcs leaving a node share
    the same weight: the negative log of a uniform probability over that
    node's distinct successors.

    word_sequence may be any iterable of string tokens (list, tuple,
    generator, ...); it is not modified.

    Returns the FST as a single string: one "from to ilabel olabel weight"
    line per arc, followed by a final-state line naming the highest node
    id that was assigned.
    """
    # Materialise the input so that non-list iterables can be padded too.
    words = ['[oov]', '[oov]'] + list(word_sequence) + ['[oov]']

    # Map each word to the set of distinct words observed right after it.
    bigrams = {}
    for prev_word, word in zip(words, words[1:]):
        bigrams.setdefault(prev_word, set()).add(word)

    node_ids = {}

    def get_node_id(word):
        # Ids are handed out 1, 2, 3, ... in order of first request.
        return node_ids.setdefault(word, len(node_ids) + 1)

    lines = []
    # Sorted iteration keeps node numbering and arc order deterministic.
    for from_word in sorted(bigrams):
        from_id = get_node_id(from_word)

        # A successor set only exists once a word has been added to it,
        # so it is never empty and the division below is safe.
        successors = bigrams[from_word]
        weight = -math.log(1.0 / len(successors))

        for to_word in sorted(successors):
            to_id = get_node_id(to_word)
            lines.append('%d %d %s %s %f\n'
                         % (from_id, to_id, to_word, to_word, weight))

    lines.append("%d 0\n" % len(node_ids))

    return "".join(lines)
50+
1251
def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
1352
"""Generates a language model to fit the text
1453
@@ -32,7 +71,7 @@ def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
3271
os.symlink(abspath, dstpath)
3372

3473
# Generate a textual FST
35-
txt_fst = language_model_from_word_sequence(kaldi_seq)
74+
txt_fst = make_bigram_lm_fst(kaldi_seq)
3675
txt_fst_file = os.path.join(lang_model_dir, 'G.txt')
3776
open(txt_fst_file, 'w').write(txt_fst)
3877

0 commit comments

Comments
 (0)