|
| 1 | +__all__ = ["CreateBPELexiconJob"] |
| 2 | + |
| 3 | +import subprocess as sp |
| 4 | +import os |
| 5 | +import sys |
| 6 | +import xml.etree.ElementTree as ET |
| 7 | + |
| 8 | +from sisyphus import * |
| 9 | + |
| 10 | +Path = setup_path(__package__) |
| 11 | + |
| 12 | +from i6_core.lib.lexicon import Lexicon, Lemma |
| 13 | +import i6_core.util as util |
| 14 | + |
| 15 | + |
class CreateBPELexiconJob(Job):
    """
    Create a Bliss lexicon from BPE transcriptions.

    The phoneme inventory of the resulting lexicon consists of the BPE symbols
    (with ``.`` renamed to ``_``, since ``.`` is not a valid phoneme character),
    and each lemma maps a word of the base lexicon to its BPE segmentation.
    """

    def __init__(
        self,
        base_lexicon_path,
        bpe_codes,
        bpe_vocab,
        subword_nmt_repo=None,
        unk_label="UNK",
    ):
        """
        :param Path base_lexicon_path: base lexicon to take the lemma tokens (orth, synt, eval) from
        :param Path bpe_codes: BPE codes file, as produced by subword-nmt ``learn_bpe.py``
        :param Path|None bpe_vocab: BPE vocab file (ReturnN-style JSON-like "symbol: index" lines)
            used to restrict the merges applied by ``apply_bpe.py``
        :param Path|str|None subword_nmt_repo: checkout of the subword-nmt repository;
            falls back to ``gs.SUBWORD_NMT_PATH`` if not given
        :param str unk_label: label written into a lemma for BPE tokens that are not in the vocab
        """
        self.base_lexicon_path = base_lexicon_path
        self.bpe_codes = bpe_codes
        self.bpe_vocab = bpe_vocab
        self.subword_nmt_repo = subword_nmt_repo if subword_nmt_repo is not None else gs.SUBWORD_NMT_PATH
        self.unk_label = unk_label

        self.out_lexicon = self.output_path("lexicon.xml.gz", cached=True)

    def tasks(self):
        yield Task("run", resume="run", mini_task=True)

    def run(self):
        lexicon = Lexicon()

        lm_tokens = set()

        # collect every surface token the base lexicon can emit:
        # orthographies plus syntactic and evaluation token sequences
        base_lexicon = Lexicon()
        base_lexicon.load(self.base_lexicon_path)
        for l in base_lexicon.lemmata:
            if l.special is None:
                for orth in l.orth:
                    lm_tokens.add(orth)
                for token in l.synt or []:  # l.synt can be None
                    lm_tokens.add(token)
                for eval in l.eval:
                    for t in eval:
                        lm_tokens.add(t)

        lm_tokens = list(lm_tokens)

        with util.uopen("words", "wt") as f:
            for t in lm_tokens:
                f.write(f"{t}\n")

        vocab = set()
        with util.uopen(self.bpe_vocab.get_path(), "rt") as f, util.uopen("fake_count_vocab.txt", "wt") as vocab_file:
            for line in f:
                # skip the surrounding JSON braces and the sentence-boundary symbols
                if "{" in line or "<s>" in line or "</s>" in line or "}" in line:
                    continue
                symbol = line.split(":")[0][1:-1]  # strip the quotes around the symbol
                if symbol != self.unk_label:
                    vocab_file.write(symbol + " -1\n")
                    # keep the ORIGINAL symbol for the membership test below:
                    # apply_bpe.py outputs unmodified symbols, so testing against the
                    # "."->"_" renamed form would wrongly map them to the unk label.
                    # Only the phoneme inventory uses the renamed form.
                    vocab.add(symbol)
                    lexicon.add_phoneme(symbol.replace(".", "_"))

        apply_binary = os.path.join(tk.uncached_path(self.subword_nmt_repo), "apply_bpe.py")
        args = [
            sys.executable,
            apply_binary,
            "--input",
            "words",
            "--codes",
            self.bpe_codes.get_path(),
            "--vocabulary",
            "fake_count_vocab.txt",
            "--output",
            "bpes",
        ]
        sp.run(args, check=True)

        with util.uopen("bpes", "rt") as f:
            bpe_tokens = [l.strip() for l in f]

        # "words" and "bpes" are aligned line by line, so zipping restores the mapping
        w2b = {w: b for w, b in zip(lm_tokens, bpe_tokens)}

        for w, b in w2b.items():
            # replace out-of-vocab BPE tokens with the unk label, then apply the
            # "."->"_" rename so the phon tokens match the phoneme inventory
            # NOTE(review): the unk label itself is not added as a phoneme — confirm
            # it is covered by the base setup if OOV tokens can actually occur.
            b = " ".join([token if token in vocab else self.unk_label for token in b.split()])
            lexicon.add_lemma(Lemma([w], [b.replace(".", "_")]))

        elem = lexicon.to_xml()
        tree = ET.ElementTree(elem)
        util.write_xml(self.out_lexicon.get_path(), tree)
0 commit comments