Fix Foreign Lemma Prefixes (fix #56) (#57)

polm · web-flow · commit e89b90705403 · 2024-12-20T20:31:50.000+09:00
* Add tests for prefix issue

* Fix handling of foreign lemmas that are prefixes
diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
@@ -45,7 +45,7 @@ def has_foreign_lemma(word):
     if not '-' in lemma:
         return False
 
-    cand = lemma.split('-')[-1]
+    cand = lemma.split('-', 1)[-1]
     # NOTE: some words have 外国 instead of a foreign spelling. ジル
     # (Jill?) is an example. Unclear why this is the case.
     # There are other hyphenated lemmas, like 私-代名詞.
@@ -257,6 +257,8 @@ def romaji_tokens(self, words, capitalize=True, title=False):
             if nw and nw.feature.pos1 in ('補助記号', '接尾辞'): continue
             # special case for half-width commas
             if nw and nw.surface == ',': continue
+            # special case for prefixes
+            if foreign and roma[-1] == "-": continue
             # 思えば -> omoeba
             if nw and nw.feature.pos2 in ('接続助詞'): continue
             # 333 -> 333 ; this should probably be handled in mecab
@@ -348,7 +350,7 @@ def romaji_word(self, word):
         elif (self.use_foreign_spelling and
                 has_foreign_lemma(word)):
             # this is a foreign word with known spelling
-            return word.feature.lemma.split('-')[-1]
+            return word.feature.lemma.split('-', 1)[-1]
         elif word.feature.kana:
             # for known words
             kana = jaconv.kata2hira(word.feature.kana)
diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
@@ -101,6 +101,9 @@
     ("くヽる", "Ku ru"),
     ("今度クヾペへ行こう", "Kondo kugupe e ikou"),  # made up word
     ("彁々", "?"),
+    # prefixes, see #56
+    ("ビオハザード", "Bio-hazard"),
+    ("イントラワード", "Intra-word"),
 ]
 
 SENTENCES_KUNREI = [

Original file line number	Diff line number	Diff line change
`@@ -101,6 +101,9 @@`
`101`	`101`	`("くヽる", "Ku ru"),`
`102`	`102`	`("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word`
`103`	`103`	`("彁々", "?"),`
	`104`	`+ # prefixes, see #56`
	`105`	`+ ("ビオハザード", "Bio-hazard"),`
	`106`	`+ ("イントラワード", "Intra-word"),`
`104`	`107`	`]`
`105`	`108`
`106`	`109`	`SENTENCES_KUNREI = [`