Skip to content

Commit 35d4561

Browse files
committed
ud.pt.AddHyphenMwt
For example, "ex - juiz" (tokenized as three words) is converted into a single MWT "ex-juiz" consisting of two words ("ex" and "juiz"). Questionable compound phrases are skipped, but they are marked with ToDo in MISC.
1 parent f84147c commit 35d4561

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

udapi/block/ud/pt/addhyphenmwt.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""Block ud.pt.AddHyphenMwt for transforming hyphen compounds into multiword tokens in Portuguese-GSD.
2+
3+
See https://github.com/UniversalDependencies/UD_Portuguese-GSD/issues/39
4+
"""
5+
from udapi.core.block import Block
6+
7+
class AddHyphenMwt(Block):
    """Turn hyphen-separated token sequences into multiword tokens (MWTs).

    A sequence such as "ex - juiz" (token, hyphen, token, optionally followed
    by more hyphen+token pairs) is replaced by one MWT whose form is the
    concatenation of all member forms; the hyphen tokens are removed.
    A sequence is left unmerged and only flagged with a ``ToDo`` attribute in
    MISC when more than one of its words has a parent outside the sequence,
    or when a non-head word has a child outside the sequence.
    """

    def _ok(self, token):
        """Return True if `token` may stand next to a hyphen inside a compound.

        The form has to be alphanumeric and must differ from "al"
        (compared case-insensitively).
        """
        form = token.form
        if not form.isalnum():
            return False
        # The hyphen in "al-Assad" perhaps should be kept as a separate word.
        return form.lower() != 'al'

    def process_tree(self, root):
        """Scan the tokens of `root` and merge (or flag) every hyphen compound."""
        tokens = root.token_descendants
        # `pos` is the index of the candidate hyphen; its neighbours are
        # at pos-1 and pos+1, hence the start at 1 and the `pos + 1` bound.
        pos = 1
        while pos + 1 < len(tokens):
            first = pos - 1
            if not (tokens[pos].form == "-"
                    and self._ok(tokens[first])
                    and self._ok(tokens[pos + 1])):
                pos += 1
                continue

            # Extend the compound over every following "- token" pair,
            # so that e.g. "a - b - c" is handled as one unit.
            while (pos + 3 < len(tokens)
                   and tokens[pos + 2].form == "-"
                   and self._ok(tokens[pos + 3])):
                pos += 2

            compound = tokens[first:pos + 2]
            words = []
            for member in compound:
                words.extend(member.words)

            # Words whose parent lies outside the compound.
            heads = [w for w in words if w.parent not in words]
            # Non-head words that govern something outside the compound.
            outside_governors = [
                w for w in words
                if w not in heads and any(kid not in words for kid in w.children)
            ]

            if len(heads) > 1:
                flagged, reason = heads, 'NonCatenaCompound'
            elif outside_governors:
                flagged, reason = outside_governors, 'HasChildrenOutsideCompound'
            else:
                flagged, reason = (), None
                surface = "".join(member.form for member in compound)
                # Hyphens occupy the odd positions of the compound slice.
                for idx in range(1, len(compound), 2):
                    compound[idx].remove()
                kept = [w for w in words if w.form != '-']
                root.create_multiword_token(kept, surface)
                # NOTE(review): presumably clears the stored sentence text
                # because it no longer matches the tokens -- confirm.
                root.text = None

            # Questionable compounds are not merged, only marked in MISC.
            for node in flagged:
                node.misc["ToDo"] = reason
            pos += 1

0 commit comments

Comments
 (0)