@@ -44,6 +44,8 @@ def tasks(self):
4444 def run (self ):
4545 with uopen (self .bliss_lexicon , "rt" ) as f :
4646 tree = ET .parse (f )
47+
48+ orths_with_space = []
4749 with uopen (self .out_g2p_lexicon , "wt" ) as out :
4850 all_lemmas = tree .findall (".//lemma" )
4951 assert len (all_lemmas ) > 0 , "No lemma tag found in the lexicon file! Wrong format file?"
@@ -58,6 +60,8 @@ def run(self):
5860 orths = [lemma .find ("orth" ).text .strip ()]
5961
6062 for orth in orths :
63+ if " " in orth :
64+ orths_with_space .append (orth )
6165 if self .include_pronunciation_variants :
6266 phons = lemma .findall ("phon" )
6367 phon_single = []
@@ -70,6 +74,15 @@ def run(self):
7074 phon = lemma .find ("phon" ).text .strip ()
7175 out .write ("%s %s\n " % (orth , phon ))
7276
77+ assert not orths_with_space , (
78+ "The G2P training requires a file with the syntax `<orth><space><phonemes>` per line, "
79+ "but some lemmas have a space in their orths. "
80+ "As a consequence, some of the parts in the orth might be picked up as phonemes, which is not desirable.\n "
81+ "The offending lemmas are:"
82+ + "\n " .join (orths_with_space )
83+ + "\n As a suggestion, substitute the spaces by an alternative token that the G2P can disregard, like '_'.\n "
84+ )
85+
7386
7487class G2POutputToBlissLexiconJob (Job ):
7588 """
0 commit comments