Skip to content

Commit ef5b3e6

Browse files
committed
ctc-bpe-lexicon jobs
1 parent 2cdbc4b commit ef5b3e6

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
__all__ = ["CreateBPELexiconJob"]
2+
3+
import subprocess as sp
4+
import os
5+
import sys
6+
import xml.etree.ElementTree as ET
7+
8+
from sisyphus import *
9+
10+
Path = setup_path(__package__)
11+
12+
from i6_core.lib.lexicon import Lexicon, Lemma
13+
import i6_core.util as util
14+
15+
16+
class CreateBPELexiconJob(Job):
    """
    Create a Bliss lexicon whose pronunciations are BPE segmentations.

    The orth, synt and eval tokens of all non-special lemmata of a base lexicon
    are collected, segmented with the ``apply_bpe.py`` script found in
    ``subword_nmt_repo``, and written to a new lexicon that maps every token to
    its sequence of BPE symbols.
    """

    def __init__(
        self,
        base_lexicon_path,
        bpe_codes,
        bpe_vocab,
        subword_nmt_repo=None,
        unk_label="UNK",
    ):
        """
        :param Path base_lexicon_path: Bliss lexicon providing the tokens to segment
        :param Path bpe_codes: BPE codes file, handed to ``apply_bpe.py --codes``
        :param Path bpe_vocab: BPE vocabulary file; it is required, because ``run``
            calls ``get_path()`` on it. Presumably a dict-like dump with one
            ``'<symbol>': <value>`` entry per line -- confirm against the producer.
        :param Path|str|None subword_nmt_repo: directory containing ``apply_bpe.py``,
            falls back to ``gs.SUBWORD_NMT_PATH`` when None
        :param str unk_label: label substituted for BPE tokens that are not part of
            the vocabulary; a vocabulary entry with this name is skipped
        """
        self.base_lexicon_path = base_lexicon_path
        self.bpe_codes = bpe_codes
        self.bpe_vocab = bpe_vocab
        self.subword_nmt_repo = subword_nmt_repo if subword_nmt_repo is not None else gs.SUBWORD_NMT_PATH
        self.unk_label = unk_label

        self.out_lexicon = self.output_path("lexicon.xml.gz", cached=True)

    def tasks(self):
        yield Task("run", resume="run", mini_task=True)

    def run(self):
        """
        Build the BPE lexicon and write it to ``self.out_lexicon``.

        Intermediate files (``words``, ``fake_count_vocab.txt``, ``bpes``) are
        created in the current working directory.

        :raises RuntimeError: if ``apply_bpe.py`` returns a different number of
            lines than tokens were passed in
        :raises subprocess.CalledProcessError: if ``apply_bpe.py`` exits non-zero
        """
        lexicon = Lexicon()

        base_lexicon = Lexicon()
        # uncached_path resolves a sisyphus Path and leaves other values untouched
        base_lexicon.load(tk.uncached_path(self.base_lexicon_path))

        lm_tokens = set()
        for lemma in base_lexicon.lemmata:
            if lemma.special is not None:
                continue
            lm_tokens.update(lemma.orth)
            lm_tokens.update(lemma.synt or [])  # lemma.synt can be None
            for eval_sequence in lemma.eval:
                lm_tokens.update(eval_sequence)

        # Sort so that the order of the written lemmata does not depend on set iteration order.
        lm_tokens = sorted(lm_tokens)

        with util.uopen("words", "wt") as f:
            for token in lm_tokens:
                f.write(f"{token}\n")

        # Symbols are stored with "." replaced by "_", the same form used for the phoneme inventory.
        vocab = set()
        with util.uopen(self.bpe_vocab.get_path(), "rt") as f, util.uopen("fake_count_vocab.txt", "wt") as vocab_file:
            for line in f:
                if "{" in line or "<s>" in line or "</s>" in line or "}" in line:
                    continue
                # Split at the last colon so that symbols containing ":" stay intact,
                # then drop the first and last character of the key (presumably its quotes).
                symbol = line.rsplit(":", 1)[0][1:-1]
                if symbol == self.unk_label:
                    continue
                # apply_bpe.py expects "<symbol> <count>" lines; every entry gets the count -1
                vocab_file.write(symbol + " -1\n")
                symbol = symbol.replace(".", "_")
                vocab.add(symbol)
                lexicon.add_phoneme(symbol)

        apply_binary = os.path.join(tk.uncached_path(self.subword_nmt_repo), "apply_bpe.py")
        args = [
            sys.executable,
            apply_binary,
            "--input",
            "words",
            "--codes",
            self.bpe_codes.get_path(),
            "--vocabulary",
            "fake_count_vocab.txt",
            "--output",
            "bpes",
        ]
        sp.run(args, check=True)

        with util.uopen("bpes", "rt") as f:
            bpe_tokens = [line.strip() for line in f]

        # zip() would silently truncate and pair tokens with the wrong segmentation.
        if len(bpe_tokens) != len(lm_tokens):
            raise RuntimeError(
                f"apply_bpe.py returned {len(bpe_tokens)} lines for {len(lm_tokens)} input tokens"
            )

        for word, segmentation in zip(lm_tokens, bpe_tokens):
            # Compare in the escaped form, because that is what `vocab` contains.
            pron = " ".join(
                token if token.replace(".", "_") in vocab else self.unk_label for token in segmentation.split()
            )
            lexicon.add_lemma(Lemma([word], [pron.replace(".", "_")]))

        elem = lexicon.to_xml()
        tree = ET.ElementTree(elem)
        util.write_xml(self.out_lexicon.get_path(), tree)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import collections
2+
import gzip
3+
import os.path
4+
import xml.etree.ElementTree as ET
5+
import xml.dom.minidom as minidom
6+
from i6_core.lib.lexicon import Lexicon
7+
8+
from sisyphus import *
9+
10+
Path = setup_path(__package__)
11+
12+
import i6_core.lib.lexicon as lexicon
13+
from i6_core.util import uopen, write_xml
14+
15+
16+
class BlissLexiconToWordLexicon(Job):
    """
    Convert a Bliss lexicon into a plain-text lexicon.

    For every lemma, one ``<orth> <phon>`` line is written for each combination
    of its orths and phons.
    """

    def __init__(self, bliss_lexicon: Path, apply_filter: bool = True):
        """
        :param bliss_lexicon: Bliss lexicon to convert
        :param apply_filter: if True (default), combinations with an empty orth or
            an empty phon are skipped; if False, every combination is written
        """
        self.set_vis_name("Lexicon to Word List")

        self.bliss_lexicon = bliss_lexicon
        self.apply_filter = apply_filter

        self.out_lexicon = self.output_path("lexicon.txt")

    def tasks(self):
        yield Task("run", mini_task=True)

    def run(self):
        """Load the Bliss lexicon and write the text lexicon to ``self.out_lexicon``."""
        lex = Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        # Explicit encoding so that non-ASCII entries do not depend on the locale.
        with open(self.out_lexicon.get_path(), "w", encoding="utf-8") as word_file:
            for lemma in lex.lemmata:
                for orth in lemma.orth:
                    for phon in lemma.phon:
                        # we can have empty phonemes or orth
                        if self.apply_filter and (len(phon) == 0 or len(orth) == 0):
                            continue
                        word_file.write(f"{orth} {phon}\n")

0 commit comments

Comments
 (0)