Skip to content

Commit 50aab19

Browse files
committed
AddMwt now merges any already existing MWTs with the newly created ones
Also, some MISC attributes (SpaceAfter, SpacesAfter, SpacesBefore) are required to stay on the token level, other MISC attributes (Entity) are required to stay on the word level, and the remaining MISC attributes can be on either the word level or the token level (but not both), depending on whether `multiword_analysis(node)['misc'] == '*'`.
1 parent 06877ad commit 50aab19

File tree

1 file changed

+32
-2
lines changed

1 file changed

+32
-2
lines changed

udapi/block/ud/addmwt.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ def process_node(self, node):
1515
orig_attr[attr] = getattr(node, attr)
1616
orig_attr['feats'] = node.feats.copy()
1717
orig_attr['misc'] = node.misc.copy()
18+
# Defaults for the newly created MWT
19+
mwt_misc = node.misc.copy()
20+
mwt_form = node.form
1821

1922
forms = analysis['form'].split()
2023
main = analysis.get('main', 0)
@@ -37,6 +40,7 @@ def process_node(self, node):
3740
elif orig_attr['form'][0].isupper():
3841
nodes[0].form = nodes[0].form.title()
3942

43+
node.misc = None
4044
for attr in 'lemma upos xpos feats deprel misc'.split():
4145
if attr in analysis:
4246
values = analysis[attr].split()
@@ -47,6 +51,17 @@ def process_node(self, node):
4751
logging.warning("%s = %s" % (attr, analysis.get(attr, '')))
4852
if values[i] == '*':
4953
setattr(new_node, attr, orig_attr[attr])
54+
# No MISC attribute should be duplicated on the word level and token level,
55+
# so if copying MISC to a new_node, delete mwt_misc.
56+
# However, SpaceAfter should be annotated only on the token level,
57+
# so make sure it is not accidentally copied on the word level.
58+
if attr == 'misc':
59+
orig_attr['misc'].clear()
60+
for a in 'SpaceAfter SpacesAfter SpacesBefore'.split():
61+
if new_node.misc[a]:
62+
orig_attr['misc'][a] = new_node.misc[a]
63+
del new_node.misc[a]
64+
5065
elif attr == 'feats' and '*' in values[i]:
5166
new_node.feats = values[i]
5267
for feat_name, feat_value in list(new_node.feats.items()):
@@ -55,8 +70,23 @@ def process_node(self, node):
5570
else:
5671
setattr(new_node, attr, values[i])
5772

58-
mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc'])
59-
node.misc = None
73+
# Entity (coreference) annotation should be only on the word level,
74+
# so make sure it does not stay on the token level.
75+
if mwt_misc['Entity']:
76+
nodes[0].misc['Entity'] = mwt_misc['Entity']
77+
del mwt_misc['Entity']
78+
79+
# If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
80+
if node.multiword_token:
81+
mwt_words = node.multiword_token.words
82+
mwt_form = node.multiword_token.form
83+
if node.multiword_token.misc:
84+
mwt_misc.update(node.multiword_token.misc)
85+
node.multiword_token.remove()
86+
mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes
87+
nodes = mwt_words
88+
89+
mwt = node.root.create_multiword_token(nodes, mwt_form, mwt_misc)
6090
self.postprocess_mwt(mwt)
6191

6292
def multiword_analysis(self, node):

0 commit comments

Comments
 (0)