@@ -15,6 +15,9 @@ def process_node(self, node):
15
15
orig_attr [attr ] = getattr (node , attr )
16
16
orig_attr ['feats' ] = node .feats .copy ()
17
17
orig_attr ['misc' ] = node .misc .copy ()
18
+ # Defaults for the newly created MWT
19
+ mwt_misc = node .misc .copy ()
20
+ mwt_form = node .form
18
21
19
22
forms = analysis ['form' ].split ()
20
23
main = analysis .get ('main' , 0 )
@@ -37,6 +40,7 @@ def process_node(self, node):
37
40
elif orig_attr ['form' ][0 ].isupper ():
38
41
nodes [0 ].form = nodes [0 ].form .title ()
39
42
43
+ node .misc = None
40
44
for attr in 'lemma upos xpos feats deprel misc' .split ():
41
45
if attr in analysis :
42
46
values = analysis [attr ].split ()
@@ -47,6 +51,17 @@ def process_node(self, node):
47
51
logging .warning ("%s = %s" % (attr , analysis .get (attr , '' )))
48
52
if values [i ] == '*' :
49
53
setattr (new_node , attr , orig_attr [attr ])
54
+ # No MISC attribute should be duplicated on the word level and token level,
55
+ # so if copying MISC to a new_node, delete mwt_misc.
56
+ # However, SpaceAfter should be annotated only on the token level,
57
+ # so make sure it is not accidentally copied on the word level.
58
+ if attr == 'misc' :
59
+ orig_attr ['misc' ].clear ()
60
+ for a in 'SpaceAfter SpacesAfter SpacesBefore' .split ():
61
+ if new_node .misc [a ]:
62
+ orig_attr ['misc' ][a ] = new_node .misc [a ]
63
+ del new_node .misc [a ]
64
+
50
65
elif attr == 'feats' and '*' in values [i ]:
51
66
new_node .feats = values [i ]
52
67
for feat_name , feat_value in list (new_node .feats .items ()):
@@ -55,8 +70,23 @@ def process_node(self, node):
55
70
else :
56
71
setattr (new_node , attr , values [i ])
57
72
58
- mwt = node .root .create_multiword_token (nodes , orig_attr ['form' ], orig_attr ['misc' ])
59
- node .misc = None
73
+ # Entity (coreference) annotation should be only on the word level,
74
+ # so make sure it does not stay on the token level.
75
+ if mwt_misc ['Entity' ]:
76
+ nodes [0 ].misc ['Entity' ] = mwt_misc ['Entity' ]
77
+ del mwt_misc ['Entity' ]
78
+
79
+ # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
80
+ if node .multiword_token :
81
+ mwt_words = node .multiword_token .words
82
+ mwt_form = node .multiword_token .form
83
+ if node .multiword_token .misc :
84
+ mwt_misc .update (node .multiword_token .misc )
85
+ node .multiword_token .remove ()
86
+ mwt_words [mwt_words .index (node ):mwt_words .index (node )+ 1 ] = nodes
87
+ nodes = mwt_words
88
+
89
+ mwt = node .root .create_multiword_token (nodes , mwt_form , mwt_misc )
60
90
self .postprocess_mwt (mwt )
61
91
62
92
def multiword_analysis (self , node ):
0 commit comments