Skip to content

Commit e89b907

Browse files
authored
Fix Foreign Lemma Prefixes (fix #56) (#57)
* Add tests for prefix issue
* Fix handling of foreign lemmas that are prefixes
1 parent 9574fad commit e89b907

File tree

2 files changed: 7 additions and 2 deletions (+7 −2 lines changed)

cutlet/cutlet.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ def has_foreign_lemma(word):
4545
if not '-' in lemma:
4646
return False
4747

48-
cand = lemma.split('-')[-1]
48+
cand = lemma.split('-', 1)[-1]
4949
# NOTE: some words have 外国 instead of a foreign spelling. ジル
5050
# (Jill?) is an example. Unclear why this is the case.
5151
# There are other hyphenated lemmas, like 私-代名詞.
@@ -257,6 +257,8 @@ def romaji_tokens(self, words, capitalize=True, title=False):
257257
if nw and nw.feature.pos1 in ('補助記号', '接尾辞'): continue
258258
# special case for half-width commas
259259
if nw and nw.surface == ',': continue
260+
# special case for prefixes
261+
if foreign and roma[-1] == "-": continue
260262
# 思えば -> omoeba
261263
if nw and nw.feature.pos2 in ('接続助詞'): continue
262264
# 333 -> 333 ; this should probably be handled in mecab
@@ -348,7 +350,7 @@ def romaji_word(self, word):
348350
elif (self.use_foreign_spelling and
349351
has_foreign_lemma(word)):
350352
# this is a foreign word with known spelling
351-
return word.feature.lemma.split('-')[-1]
353+
return word.feature.lemma.split('-', 1)[-1]
352354
elif word.feature.kana:
353355
# for known words
354356
kana = jaconv.kata2hira(word.feature.kana)

cutlet/test/test_basic.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,9 @@
101101
("くヽる", "Ku ru"),
102102
("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word
103103
("彁々", "?"),
104+
# prefixes, see #56
105+
("ビオハザード", "Bio-hazard"),
106+
("イントラワード", "Intra-word"),
104107
]
105108

106109
SENTENCES_KUNREI = [

0 commit comments

Comments
 (0)