Skip to content

Commit d386f8c

Browse files
committed
language model: fewer symlinks
1 parent 105413d commit d386f8c

File tree

2 files changed

+29
-36
lines changed

2 files changed

+29
-36
lines changed

gentle/language_model.py

Lines changed: 20 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import logging
22
import math
33
import os
4+
import shutil
45
import subprocess
56
import sys
67
import tempfile
@@ -55,36 +56,28 @@ def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
5556
`kaldi_seq` is a list of words within kaldi's vocabulary.
5657
"""
5758

58-
# Create a language model directory
59-
lang_model_dir = tempfile.mkdtemp()
60-
logging.info('saving language model to %s', lang_model_dir)
61-
62-
# Symlink in necessary files from the prototype directory
63-
for dirpath, dirnames, filenames in os.walk(proto_langdir, followlinks=True):
64-
for dirname in dirnames:
65-
relpath = os.path.relpath(os.path.join(dirpath, dirname), proto_langdir)
66-
os.makedirs(os.path.join(lang_model_dir, relpath))
67-
for filename in filenames:
68-
abspath = os.path.abspath(os.path.join(dirpath, filename))
69-
relpath = os.path.relpath(os.path.join(dirpath, filename), proto_langdir)
70-
dstpath = os.path.join(lang_model_dir, relpath)
71-
os.symlink(abspath, dstpath)
72-
7359
# Generate a textual FST
7460
txt_fst = make_bigram_lm_fst(kaldi_seq)
75-
txt_fst_file = os.path.join(lang_model_dir, 'G.txt')
76-
open(txt_fst_file, 'w').write(txt_fst)
61+
txt_fst_file = tempfile.NamedTemporaryFile(delete=False)
62+
txt_fst_file.write(txt_fst)
63+
txt_fst_file.close()
7764

78-
words_file = os.path.join(proto_langdir, "graphdir/words.txt")
79-
subprocess.check_output([MKGRAPH_PATH,
80-
os.path.join(lang_model_dir, 'langdir'),
81-
os.path.join(lang_model_dir, 'modeldir'),
82-
txt_fst_file,
83-
words_file,
84-
os.path.join(lang_model_dir, 'graphdir', 'HCLG.fst')])
85-
86-
# Return the language model directory
87-
return lang_model_dir
65+
out_dir = tempfile.mkdtemp()
66+
67+
try:
68+
subprocess.check_output([MKGRAPH_PATH,
69+
os.path.join(proto_langdir, 'langdir'),
70+
os.path.join(proto_langdir, 'modeldir'),
71+
txt_fst_file.name,
72+
os.path.join(proto_langdir, "graphdir/words.txt"),
73+
os.path.join(out_dir, 'HCLG.fst')])
74+
except Exception, e:
75+
shutil.rmtree(out_dir)
76+
raise e
77+
finally:
78+
os.unlink(txt_fst_file.name)
79+
80+
return out_dir
8881

8982
if __name__=='__main__':
9083
import sys

gentle/language_model_transcribe.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,17 +28,17 @@ def lm_transcribe(audio_f, transcript, proto_langdir, nnet_dir,
2828
ks = ms.get_kaldi_sequence()
2929

3030
gen_model_dir = language_model.get_language_model(ks, proto_langdir)
31+
try:
32+
gen_hclg_path = os.path.join(gen_model_dir, 'HCLG.fst')
33+
k = standard_kaldi.Kaldi(nnet_dir, gen_hclg_path, proto_langdir)
3134

32-
gen_hclg_path = os.path.join(gen_model_dir, 'graphdir', 'HCLG.fst')
33-
k = standard_kaldi.Kaldi(nnet_dir, gen_hclg_path, proto_langdir)
35+
trans = standard_kaldi.transcribe(k, audio_f,
36+
partial_results_cb=partial_cb,
37+
partial_results_kwargs=partial_kwargs)
3438

35-
trans = standard_kaldi.transcribe(k, audio_f,
36-
partial_results_cb=partial_cb,
37-
partial_results_kwargs=partial_kwargs)
38-
39-
ret = diff_align.align(trans["words"], ms)
40-
41-
shutil.rmtree(gen_model_dir)
39+
ret = diff_align.align(trans["words"], ms)
40+
finally:
41+
shutil.rmtree(gen_model_dir)
4242

4343
return {
4444
"transcript": transcript,

0 commit comments

Comments
 (0)