Skip to content

Commit 3603013

Browse files
authored
Add check for orths with space (#602)
1 parent 8638dcf commit 3603013

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

g2p/convert.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,8 @@ def tasks(self):
4444
def run(self):
4545
with uopen(self.bliss_lexicon, "rt") as f:
4646
tree = ET.parse(f)
47+
48+
orths_with_space = []
4749
with uopen(self.out_g2p_lexicon, "wt") as out:
4850
all_lemmas = tree.findall(".//lemma")
4951
assert len(all_lemmas) > 0, "No lemma tag found in the lexicon file! Wrong format file?"
@@ -58,6 +60,8 @@ def run(self):
5860
orths = [lemma.find("orth").text.strip()]
5961

6062
for orth in orths:
63+
if " " in orth:
64+
orths_with_space.append(orth)
6165
if self.include_pronunciation_variants:
6266
phons = lemma.findall("phon")
6367
phon_single = []
@@ -70,6 +74,15 @@ def run(self):
7074
phon = lemma.find("phon").text.strip()
7175
out.write("%s %s\n" % (orth, phon))
7276

77+
assert not orths_with_space, (
78+
"The G2P training requires a file with the syntax `<orth><space><phonemes>` per line, "
79+
"but some lemmas have a space in their orths. "
80+
"As a consequence, some of the parts in the orth might be picked up as phonemes, which is not desirable.\n"
81+
"The offending lemmas are:"
82+
+ "\n".join(orths_with_space)
83+
+ "\nAs a suggestion, substitute the spaces by an alternative token that the G2P can disregard, like '_'.\n"
84+
)
85+
7386

7487
class G2POutputToBlissLexiconJob(Job):
7588
"""

0 commit comments

Comments
 (0)