@@ -9,7 +9,8 @@ class MarkDiff(Block):
9
9
"""Mark differences between parallel trees."""
10
10
11
11
def __init__ (self , gold_zone , attributes = 'form,lemma,upos,xpos,deprel,feats,misc' ,
12
- mark = 1 , mark_attr = "Mark" , add = False , print_stats = 0 , ignore_parent = False , ** kwargs ):
12
+ mark = 1 , mark_attr = 'Mark' , add = False , print_stats = 0 , ignore_parent = False ,
13
+ align = False , align_attr = 'Align' , ** kwargs ):
13
14
"""Create the Mark block object.
14
15
Params:
15
16
gold_zone: Which of the zones should be treated as gold?
@@ -18,10 +19,19 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
18
19
The tree topology, i.e. node parent is always considered.
19
20
mark: What value should be used in `node.misc['Mark']` of the differing nodes?
20
21
mark_attr: use this MISC attribute name instead of "Mark".
22
+ Use mark_attr=0 to prevent marking diffs in MISC.
21
23
add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block,
22
24
so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block.
23
25
print_stats: How many lines of statistics should be printed? -1 means all.
24
26
ignore_parent: ignore differences in dependency parents
27
+ align: store word alignment, possible values are False (no alignment stored, the default),
28
+ "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord,
29
+ "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and
30
+ "both", i.e. both from-pred and from-gold.
31
+ If only forms should be considered for inducing the word alignment,
32
+ you should use "util.MarkDiff attributes='form' ignore_parent=1 align=both".
33
+ Only one-to-one alignment is supported.
34
+ align_attr: use this MISC attribute name instead of "Align".
25
35
"""
26
36
super ().__init__ (** kwargs )
27
37
self .gold_zone = gold_zone
@@ -31,7 +41,11 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
31
41
self .add = add
32
42
self .print_stats = print_stats
33
43
self .ignore_parent = ignore_parent
44
+ self .align = align
45
+ self .align_attr = align_attr
34
46
self .stats = collections .Counter ()
47
+ if not mark_attr and not align and not print_stats :
48
+ raise ValueError ('mark_attr=0 does not make sense without align or print_stats' )
35
49
36
50
def process_tree (self , tree ):
37
51
gold_tree = tree .bundle .get_tree (self .gold_zone )
@@ -45,7 +59,7 @@ def process_tree(self, tree):
45
59
46
60
pred_nodes , gold_nodes = tree .descendants , gold_tree .descendants
47
61
# Make sure both pred and gold trees are marked, even if one has just deleted nodes.
48
- if len (pred_nodes ) != len (gold_nodes ):
62
+ if len (pred_nodes ) != len (gold_nodes ) and self . mark_attr :
49
63
tree .add_comment (f'{ self .mark_attr } = { self .mark } ' )
50
64
gold_tree .add_comment (f'{ self .mark_attr } = { self .mark } ' )
51
65
pred_tokens = ['_' .join (n .get_attrs (self .attrs )) for n in pred_nodes ]
@@ -59,18 +73,24 @@ def process_tree(self, tree):
59
73
if edit in {'equal' , 'replace' }:
60
74
for i in range (pred_lo , pred_hi ):
61
75
alignment [i ] = i - pred_lo + gold_lo
76
+ if self .align in ("both" , "from-pred" ):
77
+ pred_nodes [i ].misc [self .align_attr ] = i - pred_lo + gold_lo + 1
78
+ if self .align in ("both" , "from-gold" ):
79
+ gold_nodes [i - pred_lo + gold_lo ].misc [self .align_attr ] = i + 1
62
80
63
81
for diff in diffs :
64
82
edit , pred_lo , pred_hi , gold_lo , gold_hi = diff
65
83
if edit == 'equal' :
66
84
for p_node , g_node in zip (pred_nodes [pred_lo :pred_hi ], gold_nodes [gold_lo :gold_hi ]):
67
85
if not self .ignore_parent and alignment .get (p_node .parent .ord - 1 ) != g_node .parent .ord - 1 :
68
- p_node .misc [self .mark_attr ] = self .mark
69
- g_node .misc [self .mark_attr ] = self .mark
70
86
self .stats ['ONLY-PARENT-CHANGED' ] += 1
87
+ if self .mark_attr :
88
+ p_node .misc [self .mark_attr ] = self .mark
89
+ g_node .misc [self .mark_attr ] = self .mark
71
90
else :
72
- for node in pred_nodes [pred_lo :pred_hi ] + gold_nodes [gold_lo :gold_hi ]:
73
- node .misc [self .mark_attr ] = self .mark
91
+ if self .mark_attr :
92
+ for node in pred_nodes [pred_lo :pred_hi ] + gold_nodes [gold_lo :gold_hi ]:
93
+ node .misc [self .mark_attr ] = self .mark
74
94
if self .print_stats :
75
95
if edit == 'replace' :
76
96
# first n nodes are treated as aligned, the rest is treated as ADDED/DELETED
0 commit comments