Skip to content

Commit be3068b

Browse files
committed
util.MarkDiff align=both
1 parent 39fe8ad commit be3068b

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

udapi/block/util/markdiff.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ class MarkDiff(Block):
99
"""Mark differences between parallel trees."""
1010

1111
def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc',
12-
mark=1, mark_attr="Mark", add=False, print_stats=0, ignore_parent=False, **kwargs):
12+
mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False,
13+
align=False, align_attr='Align', **kwargs):
1314
"""Create the Mark block object.
1415
Params:
1516
gold_zone: Which of the zones should be treated as gold?
@@ -18,10 +19,19 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
1819
The tree topology, i.e. node parent is always considered.
1920
mark: What value should be used in `node.misc['Mark']` of the differing nodes?
2021
mark_attr: use this MISC attribute name instead of "Mark".
22+
Use mark_attr=0 to prevent marking diffs in MISC.
2123
add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block,
2224
so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block.
2325
print_stats: How many lines of statistics should be printed? -1 means all.
2426
ignore_parent: ignore differences in dependency parents
27+
align: store word alignment, possible values are False (no alignment stored, the default),
28+
"from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord,
29+
"from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and
30+
"both", i.e. both from-pred and from-gold.
31+
If only forms should be considered for inducing the word alignment,
32+
you should use "util.MarkDiff attributes='form' ignore_parent=1 align=both".
33+
Only one-to-one alignment is supported.
34+
align_attr: use this MISC attribute name instead of "Align".
2535
"""
2636
super().__init__(**kwargs)
2737
self.gold_zone = gold_zone
@@ -31,7 +41,11 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc
3141
self.add = add
3242
self.print_stats = print_stats
3343
self.ignore_parent = ignore_parent
44+
self.align = align
45+
self.align_attr = align_attr
3446
self.stats = collections.Counter()
47+
if not mark_attr and not align and not print_stats:
48+
raise ValueError('mark_attr=0 does not make sense without align or print_stats')
3549

3650
def process_tree(self, tree):
3751
gold_tree = tree.bundle.get_tree(self.gold_zone)
@@ -45,7 +59,7 @@ def process_tree(self, tree):
4559

4660
pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants
4761
# Make sure both pred and gold trees are marked, even if one has just deleted nodes.
48-
if len(pred_nodes) != len(gold_nodes):
62+
if len(pred_nodes) != len(gold_nodes) and self.mark_attr:
4963
tree.add_comment(f'{self.mark_attr} = {self.mark}')
5064
gold_tree.add_comment(f'{self.mark_attr} = {self.mark}')
5165
pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in pred_nodes]
@@ -59,18 +73,24 @@ def process_tree(self, tree):
5973
if edit in {'equal', 'replace'}:
6074
for i in range(pred_lo, pred_hi):
6175
alignment[i] = i - pred_lo + gold_lo
76+
if self.align in ("both", "from-pred"):
77+
pred_nodes[i].misc[self.align_attr] = i - pred_lo + gold_lo + 1
78+
if self.align in ("both", "from-gold"):
79+
gold_nodes[i - pred_lo + gold_lo].misc[self.align_attr] = i + 1
6280

6381
for diff in diffs:
6482
edit, pred_lo, pred_hi, gold_lo, gold_hi = diff
6583
if edit == 'equal':
6684
for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]):
6785
if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1:
68-
p_node.misc[self.mark_attr] = self.mark
69-
g_node.misc[self.mark_attr] = self.mark
7086
self.stats['ONLY-PARENT-CHANGED'] += 1
87+
if self.mark_attr:
88+
p_node.misc[self.mark_attr] = self.mark
89+
g_node.misc[self.mark_attr] = self.mark
7190
else:
72-
for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]:
73-
node.misc[self.mark_attr] = self.mark
91+
if self.mark_attr:
92+
for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]:
93+
node.misc[self.mark_attr] = self.mark
7494
if self.print_stats:
7595
if edit == 'replace':
7696
# first n nodes are treated as aligned, the rest is treated as ADDED/DELETED

0 commit comments

Comments
 (0)