Add check for orths with space (#602)

Icemole · web-flow · commit 3603013da06e · 2025-05-14T12:10:56.000+02:00
diff --git a/g2p/convert.py b/g2p/convert.py
@@ -44,6 +44,8 @@ def tasks(self):
     def run(self):
         with uopen(self.bliss_lexicon, "rt") as f:
             tree = ET.parse(f)
+
+        orths_with_space = []
         with uopen(self.out_g2p_lexicon, "wt") as out:
             all_lemmas = tree.findall(".//lemma")
             assert len(all_lemmas) > 0, "No lemma tag found in the lexicon file! Wrong format file?"
@@ -58,6 +60,8 @@ def run(self):
                     orths = [lemma.find("orth").text.strip()]
 
                 for orth in orths:
+                    if " " in orth:
+                        orths_with_space.append(orth)
                     if self.include_pronunciation_variants:
                         phons = lemma.findall("phon")
                         phon_single = []
@@ -70,6 +74,15 @@ def run(self):
                         phon = lemma.find("phon").text.strip()
                         out.write("%s %s\n" % (orth, phon))
 
+        # Report every offending orth at once, one per line, instead of stopping at the first one found.
+        assert not orths_with_space, (
+            "The G2P training requires a file with the syntax `<orth><space><phonemes>` per line, "
+            "but some lemmas have a space in their orths. "
+            "As a consequence, some of the parts in the orth might be picked up as phonemes, which is not desirable.\n"
+            "The offending lemmas are:\n" + "\n".join(orths_with_space) + "\n"
+            "As a suggestion, substitute the spaces by an alternative token that the G2P can disregard, like '_'.\n"
+        )
+
 
 class G2POutputToBlissLexiconJob(Job):
     """