Skip to content

Commit 13088ed

Browse files
committed
Lemmatization of interrogative verbs in Chinese.
1 parent 74445e4 commit 13088ed

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

udapi/block/ud/zh/lemmatize.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -47,25 +47,32 @@ def process_node(self, node):
4747
return
4848
# Lemmatize negated verbs to their affirmative forms.
4949
# 不是 bùshì = not be
50-
# 没有 méiyǒu = not exist
51-
# 沒能 méinéng = cannot
50+
# 沒有 没有 méiyǒu = not exist
51+
# 沒能 没能 méinéng = cannot
5252
# 未能 wèinéng = cannot
53+
# Lemmatize question verbs to their base forms.
54+
# 要不要 yàobùyào = do (you) want?
55+
# 有没有 yǒuméiyǒu = do (you) have?
5356
# Verbs that are derived from the copula and tagged as the copula need
5457
# to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi).
55-
# 亦為 亦为 Yì wèi také
56-
# 則為 则为 Zé wèi potom
57-
# 更為 更为 Gèng wèi více
58-
# 認為 认为 Rènwéi myslet, věřit
59-
# 以為 以为 Yǐwéi myslet, věřit
60-
# 以爲 以为 Yǐwéi myslet, věřit
58+
# 亦為 亦为 yìwèi = také
59+
# 則為 则为 zéwèi = potom
60+
# 更為 更为 gèngwèi = více
61+
# 認為 认为 rènwéi = myslet, věřit
62+
# 以為 以为 yǐwéi = myslet, věřit
63+
# 以爲 以为 yǐwéi = myslet, věřit
6164
if re.match(r'^(AUX|VERB)$', node.upos):
6265
m1 = re.match(r'^([不没沒未])(.+)$', node.form)
63-
m2 = re.search(r'([是爲為为])', node.form)
66+
m2 = re.match(r'^(.+)([不没沒未])\1$', node.form)
67+
m3 = re.search(r'([是爲為为])', node.form)
6468
if m1:
6569
node.lemma = m1.group(2)
6670
node.feats['Polarity'] = 'Neg'
6771
elif m2:
6872
node.lemma = m2.group(1)
73+
node.feats['Mood'] = 'Int'
74+
elif m3:
75+
node.lemma = m3.group(1)
6976
if node.lemma == '爲':
7077
node.lemma = '為'
7178
elif node.form in self.lemma:

0 commit comments

Comments
 (0)