language model: fewer symlinks (pass proto_langdir paths straight to mkgraph; only HCLG.fst is written to a temp dir)

maxhawkins · maxhawkins · commit d386f8c64f1a · 2015-11-26T23:48:37.000+01:00
diff --git a/gentle/language_model.py b/gentle/language_model.py
@@ -1,6 +1,7 @@
 import logging
 import math
 import os
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -55,36 +56,28 @@ def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
     `kaldi_seq` is a list of words within kaldi's vocabulary.
     """
 
-    # Create a language model directory
-    lang_model_dir = tempfile.mkdtemp()
-    logging.info('saving language model to %s', lang_model_dir)
-
-    # Symlink in necessary files from the prototype directory
-    for dirpath, dirnames, filenames in os.walk(proto_langdir, followlinks=True):
-        for dirname in dirnames:
-            relpath = os.path.relpath(os.path.join(dirpath, dirname), proto_langdir)
-            os.makedirs(os.path.join(lang_model_dir, relpath))
-        for filename in filenames:
-            abspath = os.path.abspath(os.path.join(dirpath, filename))
-            relpath = os.path.relpath(os.path.join(dirpath, filename), proto_langdir)
-            dstpath = os.path.join(lang_model_dir, relpath)
-            os.symlink(abspath, dstpath)
-
     # Generate a textual FST
     txt_fst = make_bigram_lm_fst(kaldi_seq)
-    txt_fst_file = os.path.join(lang_model_dir, 'G.txt')
-    open(txt_fst_file, 'w').write(txt_fst)
+    txt_fst_file = tempfile.NamedTemporaryFile(delete=False)
+    txt_fst_file.write(txt_fst)
+    txt_fst_file.close()
     
-    words_file = os.path.join(proto_langdir, "graphdir/words.txt")
-    subprocess.check_output([MKGRAPH_PATH,
-                     os.path.join(lang_model_dir, 'langdir'),
-                     os.path.join(lang_model_dir, 'modeldir'),
-                     txt_fst_file,
-                     words_file,
-                     os.path.join(lang_model_dir, 'graphdir', 'HCLG.fst')])
-
-    # Return the language model directory
-    return lang_model_dir
+    out_dir = tempfile.mkdtemp()
+
+    try:
+        subprocess.check_output([MKGRAPH_PATH,
+                         os.path.join(proto_langdir, 'langdir'),
+                         os.path.join(proto_langdir, 'modeldir'),
+                         txt_fst_file.name,
+                         os.path.join(proto_langdir, "graphdir/words.txt"),
+                         os.path.join(out_dir, 'HCLG.fst')])
+    except Exception, e:
+        shutil.rmtree(out_dir)
+        raise e
+    finally:
+        os.unlink(txt_fst_file.name)
+
+    return out_dir
 
 if __name__=='__main__':
     import sys
diff --git a/gentle/language_model_transcribe.py b/gentle/language_model_transcribe.py
@@ -28,17 +28,17 @@ def lm_transcribe(audio_f, transcript, proto_langdir, nnet_dir,
     ks = ms.get_kaldi_sequence()
 
     gen_model_dir = language_model.get_language_model(ks, proto_langdir)
+    try:
+        gen_hclg_path = os.path.join(gen_model_dir, 'HCLG.fst')
+        k = standard_kaldi.Kaldi(nnet_dir, gen_hclg_path, proto_langdir)
 
-    gen_hclg_path = os.path.join(gen_model_dir, 'graphdir', 'HCLG.fst')
-    k = standard_kaldi.Kaldi(nnet_dir, gen_hclg_path, proto_langdir)
+        trans = standard_kaldi.transcribe(k, audio_f,
+                                          partial_results_cb=partial_cb,
+                                          partial_results_kwargs=partial_kwargs)
 
-    trans = standard_kaldi.transcribe(k, audio_f,
-                                      partial_results_cb=partial_cb,
-                                      partial_results_kwargs=partial_kwargs)
-
-    ret = diff_align.align(trans["words"], ms)
-
-    shutil.rmtree(gen_model_dir)
+        ret = diff_align.align(trans["words"], ms)
+    finally:
+        shutil.rmtree(gen_model_dir)
 
     return {
         "transcript": transcript,