|
| 1 | +__all__ = ["CreateBPELexiconJob"] |
| 2 | + |
| 3 | +import subprocess as sp |
| 4 | +import os |
| 5 | +import sys |
| 6 | +import xml.etree.ElementTree as ET |
| 7 | + |
| 8 | +from sisyphus import * |
| 9 | + |
| 10 | +Path = setup_path(__package__) |
| 11 | + |
| 12 | +from i6_core.lib.lexicon import Lexicon, Lemma |
| 13 | +import i6_core.util as util |
| 14 | + |
| 15 | + |
class CreateBPELexiconJob(Job):
    """
    Create a Bliss lexicon from BPE transcriptions.

    The phoneme inventory of the resulting lexicon consists of the BPE symbols
    (with ``.`` renamed to ``_``, since ``.`` is not a valid phoneme character),
    and each lemma maps a word of the base lexicon to its BPE segmentation.
    """

    def __init__(
        self,
        base_lexicon_path,
        bpe_codes,
        bpe_vocab,
        subword_nmt_repo=None,
        unk_label="UNK",
    ):
        """
        :param Path base_lexicon_path: base lexicon to take the lemma tokens (orth, synt, eval) from
        :param Path bpe_codes: BPE codes file, as produced by subword-nmt ``learn_bpe.py``
        :param Path|None bpe_vocab: BPE vocab file (ReturnN-style JSON-like "symbol: index" lines)
            used to restrict the merges applied by ``apply_bpe.py``
        :param Path|str|None subword_nmt_repo: checkout of the subword-nmt repository;
            falls back to ``gs.SUBWORD_NMT_PATH`` if not given
        :param str unk_label: label written into a lemma for BPE tokens that are not in the vocab
        """
        self.base_lexicon_path = base_lexicon_path
        self.bpe_codes = bpe_codes
        self.bpe_vocab = bpe_vocab
        self.subword_nmt_repo = subword_nmt_repo if subword_nmt_repo is not None else gs.SUBWORD_NMT_PATH
        self.unk_label = unk_label

        self.out_lexicon = self.output_path("lexicon.xml.gz", cached=True)

    def tasks(self):
        yield Task("run", resume="run", mini_task=True)

    def run(self):
        lexicon = Lexicon()

        lm_tokens = set()

        # collect every surface token the base lexicon can emit:
        # orthographies plus syntactic and evaluation token sequences
        base_lexicon = Lexicon()
        base_lexicon.load(self.base_lexicon_path)
        for l in base_lexicon.lemmata:
            if l.special is None:
                for orth in l.orth:
                    lm_tokens.add(orth)
                for token in l.synt or []:  # l.synt can be None
                    lm_tokens.add(token)
                for eval in l.eval:
                    for t in eval:
                        lm_tokens.add(t)

        lm_tokens = list(lm_tokens)

        with util.uopen("words", "wt") as f:
            for t in lm_tokens:
                f.write(f"{t}\n")

        vocab = set()
        with util.uopen(self.bpe_vocab.get_path(), "rt") as f, util.uopen("fake_count_vocab.txt", "wt") as vocab_file:
            for line in f:
                # skip the surrounding JSON braces and the sentence-boundary symbols
                if "{" in line or "<s>" in line or "</s>" in line or "}" in line:
                    continue
                symbol = line.split(":")[0][1:-1]  # strip the quotes around the symbol
                if symbol != self.unk_label:
                    vocab_file.write(symbol + " -1\n")
                    # keep the ORIGINAL symbol for the membership test below:
                    # apply_bpe.py outputs unmodified symbols, so testing against the
                    # "."->"_" renamed form would wrongly map them to the unk label.
                    # Only the phoneme inventory uses the renamed form.
                    vocab.add(symbol)
                    lexicon.add_phoneme(symbol.replace(".", "_"))

        apply_binary = os.path.join(tk.uncached_path(self.subword_nmt_repo), "apply_bpe.py")
        args = [
            sys.executable,
            apply_binary,
            "--input",
            "words",
            "--codes",
            self.bpe_codes.get_path(),
            "--vocabulary",
            "fake_count_vocab.txt",
            "--output",
            "bpes",
        ]
        sp.run(args, check=True)

        with util.uopen("bpes", "rt") as f:
            bpe_tokens = [l.strip() for l in f]

        # "words" and "bpes" are aligned line by line, so zipping restores the mapping
        w2b = {w: b for w, b in zip(lm_tokens, bpe_tokens)}

        for w, b in w2b.items():
            # replace out-of-vocab BPE tokens with the unk label, then apply the
            # "."->"_" rename so the phon tokens match the phoneme inventory
            # NOTE(review): the unk label itself is not added as a phoneme — confirm
            # it is covered by the base setup if OOV tokens can actually occur.
            b = " ".join([token if token in vocab else self.unk_label for token in b.split()])
            lexicon.add_lemma(Lemma([w], [b.replace(".", "_")]))

        elem = lexicon.to_xml()
        tree = ET.ElementTree(elem)
        util.write_xml(self.out_lexicon.get_path(), tree)
0 commit comments