Skip to content

Commit 2bd74ad

Browse files
authored
Merge pull request #1 from udapi/master
update
2 parents 327bb6f + 35d4561 commit 2bd74ad

35 files changed

+1419
-161
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
colorama
1+
colorama>=0.4.6
22
termcolor
33
ufal.udpipe

udapi/block/corefud/countgaps.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
from udapi.core.block import Block
2+
from collections import defaultdict, Counter
3+
4+
class CountGaps(Block):
    """Block corefud.CountGaps searches for sentence sequences with no coref annotation.

    Two kinds of gaps are counted within each newdoc section:

    * "seq": maximal runs of consecutive sentences without any coreference mention,
    * "par": paragraphs (delimited by ``tree.newpar``) in which no sentence
      has a coreference mention.

    For both kinds, a histogram "gap length in sentences -> number of gaps"
    is printed to stdout.

    Args:
        report_per_newdoc: print the histograms after each newdoc section.
        report_per_file: print the histograms after each processed document (file).
        report_total: print the histograms accumulated over all documents at the end.
    """

    def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs):
        super().__init__(**kwargs)
        self.report_per_newdoc = report_per_newdoc
        self.report_per_file = report_per_file
        self.report_total = report_total
        # Histograms accumulated over all processed documents, keyed by "seq"/"par".
        self._total_counter = defaultdict(Counter)

    def _report_stats(self, counter, header_id=None):
        """Print one histogram (length -> count) and the total number of sentences in it."""
        if header_id:
            print(f"============ {header_id} ============")
        for key in sorted(counter):
            print(f"{key:2d}: {counter[key]}")
        print("-------")
        print(f"SUM: {sum(k * counter[k] for k in counter)}")

    def _count_empty_seqs(self, empty_seqs):
        """Return a Counter mapping sequence length to the number of sequences of that length."""
        return Counter(len(seq) for seq in empty_seqs)

    def _close_newdoc(self, file_counters, empty_seqs, empty_pars, seq_header, par_header):
        """Add the gaps of one finished newdoc section to file_counters and optionally report them."""
        newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
        file_counters["seq"].update(newdoc_seq_counter)
        newdoc_par_counter = self._count_empty_seqs(empty_pars)
        file_counters["par"].update(newdoc_par_counter)
        if self.report_per_newdoc:
            self._report_stats(newdoc_seq_counter, header_id=seq_header)
            self._report_stats(newdoc_par_counter, header_id=par_header)

    def process_document(self, doc):
        """Collect the gap histograms for all newdoc sections of `doc` and report them."""
        file_counters = defaultdict(Counter)
        empty_seqs, empty_pars = [], []
        curr_seq, curr_par = [], []
        is_empty_par = True
        newdoc = None
        for i, tree in enumerate(doc.trees):
            if tree.newdoc:
                # Close the previous newdoc section (there is none before the first tree).
                if i:
                    if curr_seq:
                        empty_seqs.append(curr_seq)
                    if is_empty_par and curr_par:
                        empty_pars.append(curr_par)
                    self._close_newdoc(file_counters, empty_seqs, empty_pars,
                                       f"SEQ STATS in {newdoc}", f"PAR STATS in {newdoc}")
                newdoc = tree.newdoc
                empty_seqs, empty_pars = [], []
                curr_seq, curr_par = [], []
                is_empty_par = True
            if tree.newpar:
                # Close the previous paragraph. It is skipped if a newdoc was just
                # handled above or if no sentence has been collected yet
                # (a newpar on the very first tree must not yield a zero-length paragraph).
                if not tree.newdoc and is_empty_par and curr_par:
                    empty_pars.append(curr_par)
                curr_par = []
                is_empty_par = True

            has_mention = any(node.coref_mentions for node in tree.descendants)
            if not has_mention:
                curr_seq.append(tree.sent_id)
                curr_par.append(tree.sent_id)
            else:
                if curr_seq:
                    empty_seqs.append(curr_seq)
                curr_seq = []
                is_empty_par = False

        # Close the last newdoc section.
        if curr_seq:
            empty_seqs.append(curr_seq)
        # The last paragraph counts only if it contains no mention at all,
        # i.e. the same condition as at the newdoc and newpar boundaries above.
        if is_empty_par and curr_par:
            empty_pars.append(curr_par)
        self._close_newdoc(file_counters, empty_seqs, empty_pars,
                           f"SEQ STATS, {newdoc}", f"PAR STATS, {newdoc}")

        if self.report_per_file:
            self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE")
            self._report_stats(file_counters["par"], header_id="PAR STATS, FILE")

        self._total_counter["seq"].update(file_counters["seq"])
        self._total_counter["par"].update(file_counters["par"])

    def process_end(self):
        """Print the histograms accumulated over all processed documents."""
        if self.report_total:
            self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL")
            self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL")

udapi/block/corefud/fixinterleaved.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import itertools
44

55
class FixInterleaved(Block):
6-
"""Fix mentions with interleaved or crossing spans."""
6+
"""Fix mentions with interleaved or crossing spans.
7+
https://github.com/ufal/corefUD/issues/25
8+
"""
79

810
def __init__(self, same_entity_only=True, both_discontinuous=False,
911
crossing_only=False, nested_same_subspan=True, **kwargs):
@@ -58,8 +60,8 @@ def process_tree(self, tree):
5860
pass
5961
deleted.add(mB)
6062

61-
# By changing the mA.words, we could have create another error:
62-
# making the span same as another mention. Let's fix it
63+
# By changing the mA.words, we could have created another error:
64+
# making the span same as another mention. Let's fix it.
6365
sA = set(mA.words)
6466
for mC in mentions:
6567
if mC in deleted or mC is mA or mC is mB:

udapi/block/corefud/guessspan.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from udapi.core.block import Block
2+
3+
class GuessSpan(Block):
    """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head"""

    def process_coref_mention(self, mention):
        """Set mention.words to the subtree of mention.head plus selected empty nodes."""
        head = mention.head
        span = head.descendants(add_self=True)
        # TODO add heuristics from corefud.PrintMentions almost_forest=1

        # Empty nodes that would cause gaps in the span are added to it:
        # - an empty node "within the span" (including empty nodes "on the boundary")
        #   whose enhanced parent is in the span,
        # - an empty node strictly inside the span whose both non-empty neighbours
        #   are in the span.
        # Empty nodes lying in a gap caused by non-empty nodes are not added.
        first, last = span[0], span[-1]
        lower = int(first.ord) if first.is_empty() else first.ord - 1
        upper = int(last.ord) + 1
        tree_root = head.root
        extra = []
        for enode in tree_root.empty_nodes:
            if enode in span:
                continue
            if enode.ord > upper:
                break
            if enode.ord <= lower:
                continue
            if any(dep['parent'] in span for dep in enode.deps):
                extra.append(enode)
                continue
            if enode.ord > lower + 1 and enode.ord < upper - 1:
                left_neighbour = tree_root.descendants[int(enode.ord) - 1]
                right_neighbour = tree_root.descendants[int(enode.ord)]
                if left_neighbour in span and right_neighbour in span:
                    extra.append(enode)
                #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}'
        mention.words = sorted(span + extra)

udapi/block/corefud/printmentions.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ class PrintMentions(Block):
1010
def __init__(self, continuous='include', almost_continuous='include', treelet='include',
1111
forest='include', almost_forest='include', oneword='include', singleton='include',
1212
empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
13-
print_total=True,
13+
print_total=True, print_should=True,
1414
print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
15-
minimize_cross=True, color=True, attributes='form,upos,deprel',
15+
minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
1616
print_undef_as='_', print_doc_meta=True, print_comments=False,
1717
mark='(Mark)', hints=True, layout='classic',
1818
**kwargs):
@@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i
3333
random.seed(42)
3434
self.print_other_forms = print_other_forms
3535
self.print_total = print_total,
36+
self.print_should = print_should,
3637
print_class = TextModeTreesHtml if html else TextModeTrees
3738
self.print_block = print_class(
3839
print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent,
@@ -61,7 +62,9 @@ def _ok(self, condition, value):
6162
return (condition and value == 'only') or (not condition and value=='exclude')
6263

6364
def _is_auxiliary_etc(self, node):
64-
if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}:
65+
if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}:
66+
return True
67+
if node.deprel == 'advmod:emph':
6568
return True
6669
if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
6770
return True
@@ -79,8 +82,25 @@ def _is_forest(self, mention, mwords, almost):
7982
for ch in w.children:
8083
if ch not in mwords:
8184
if not almost:
85+
if self.print_should:
86+
ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
8287
return False
88+
# Punctuation before or after the mention span can depend on any of the mwords
89+
# without breaking the almost_forest property.
90+
# According to the UD guidelines, it should depend on the highest node within the phrase,
91+
# i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines.
92+
if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]):
93+
continue
94+
# Some auxiliary words (e.g. prepositions) may be excluded from the mention span
95+
# without breaking the almost_forest property, but they need to depend
96+
# on the mention head (or if the mention is not a catena, they need to depend
97+
# on one of the potential heads, i.e. a node from mwords whose parent is not in mwords).
98+
# For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head),
99+
# but "(e1[1/2] John) with (e1[2/2] Mary)" is not almost_forest
100+
# because "with" depends on "Mary", which is not the mention head (nor a potential mention head).
83101
if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
102+
if self.print_should:
103+
ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
84104
return False
85105
return True
86106

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from udapi.core.block import Block
2+
import udapi.core.coref
3+
import re
4+
import logging
5+
6+
class RemoveNoCorefEntities(Block):
    """
    Remove entities whose ID starts with 'NOCOREF' from the document.

    Some corpora (e.g., AnCora) include annotation of named entities that are
    not annotated for coreference. To distinguish them, their cluster ID starts
    with 'NOCOREF' (optionally followed by entity type, so that one cluster
    still has just one type). We may want to remove such entities from datasets
    that are used to train coreference resolvers, to prevent the resolvers from
    thinking that all members of a NOCOREF cluster are coreferential. That is
    what this block does.
    """

    def process_document(self, doc):
        """Rebuild doc._eid_to_entity without the entities whose eid starts with 'NOCOREF'."""
        entities = doc.coref_entities
        if not entities:
            return
        kept = {}
        for entity in entities:
            if re.match(r'^NOCOREF', entity.eid):
                continue
            kept[entity._eid] = entity
        doc._eid_to_entity = kept

udapi/block/read/conllu.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,16 +73,34 @@ def parse_comment_line(self, line, root):
7373
if entity_match is not None:
7474
global_entity = entity_match.group(1)
7575
if self._global_entity and self._global_entity != global_entity:
76-
logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity))
76+
logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}")
7777
self._global_entity = global_entity
7878
root.comment += '$GLOBAL.ENTITY\n'
7979
return
8080

8181
root.comment += line[1:] + "\n"
8282

8383
def read_trees(self):
    """Read trees from self.filehandle and return them as a list.

    If self.max_docs is not set, the whole file is read at once.
    Otherwise reading stops as soon as a tree starting document number
    max_docs + 1 is found; that tree is not included in the result.

    Returns:
        A list of trees (possibly empty), never None.
    """
    if not self.max_docs:
        return [self.read_tree_from_lines(s.split('\n')) for s in
                self.filehandle.read().split('\n\n') if s]
    # udapi.core.basereader takes care of the max_docs parameter.
    # However, we can make the loading much faster by not reading
    # the whole file if the user wants just first N documents.
    trees, lines, loaded_docs = [], [], 0
    for line in self.filehandle:
        line = line.rstrip()
        if line:
            lines.append(line)
            continue
        if not lines:
            # Repeated empty lines: there is no sentence to build
            # (the branch above filters out empty chunks as well).
            continue
        tree = self.read_tree_from_lines(lines)
        lines = []
        if tree.newdoc:
            if loaded_docs == self.max_docs:
                return trees
            loaded_docs += 1
        trees.append(tree)
    # The file may end without a final empty line; do not lose the last sentence.
    if lines:
        tree = self.read_tree_from_lines(lines)
        if not (tree.newdoc and loaded_docs == self.max_docs):
            trees.append(tree)
    # The file contained at most max_docs documents: return everything loaded.
    return trees
86104

87105
def read_tree(self):
88106
if self.filehandle is None:
@@ -193,7 +211,11 @@ def read_tree_from_lines(self, lines):
193211

194212
# Create multi-word tokens.
195213
for fields in mwts:
196-
range_start, range_end = fields[0].split('-')
214+
try:
215+
range_start, range_end = fields[0].split('-')
216+
except ValueError:
217+
logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}")
218+
raise
197219
words = nodes[int(range_start):int(range_end) + 1]
198220
root.create_multiword_token(words, form=fields[1], misc=fields[-1])
199221

udapi/block/read/sentences.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,21 @@ class Sentences(BaseReader):
99
Args:
1010
ignore_empty_lines: if True, delete empty lines from the input.
1111
Default=False.
12+
newdoc_if_empty_line: if True, empty lines mark document boundaries,
13+
which are marked with `root.newdoc`. Default=False.
1214
rstrip: a set of characters to be stripped from the end of each line.
1315
Default='\r\n '. You can use rstrip='\n' if you want to preserve
1416
any space or '\r' (Carriage Return) at end of line,
1517
so that `udpipe.Base` keeps these characters in `SpacesAfter`.
1618
As most blocks do not expect whitespace other than a space to appear
1719
in the processed text, using this feature is at your own risk.
1820
"""
19-
def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs):
21+
def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False,
             rstrip='\r\n ', **kwargs):
    """Store the reader options and pass the remaining arguments to the parent class.

    Raises:
        ValueError: if both ignore_empty_lines and newdoc_if_empty_line are set.
    """
    if newdoc_if_empty_line and ignore_empty_lines:
        raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line")
    self.rstrip = rstrip
    self.newdoc_if_empty_line = newdoc_if_empty_line
    self.ignore_empty_lines = ignore_empty_lines
    super().__init__(**kwargs)
2329

@@ -38,11 +44,20 @@ def read_tree(self, document=None):
3844
# (or '\r\n' if reading a Windows file on Unix machine).
3945
if line == '':
4046
return None
41-
if self.ignore_empty_lines:
47+
preceded_by_empty_line = False
48+
if self.ignore_empty_lines or self.newdoc_if_empty_line:
4249
while line in {'\n', '\r\n'}:
50+
preceded_by_empty_line = True
4351
line = self.filehandle.readline()
4452
if line == '':
4553
return None
4654
root = Root()
4755
root.text = line.rstrip(self.rstrip)
56+
if self.newdoc_if_empty_line and preceded_by_empty_line:
57+
root.newdoc = True
4858
return root
59+
60+
# The first line in a file also marks a start of new document
61+
def after_process_document(self, document):
    """Mark the first tree of `document` as a start of a new document.

    Nothing is done unless self.newdoc_if_empty_line is set.
    A document with no bundles (or whose first bundle has no trees),
    e.g. one read from an empty input, is left untouched instead of
    raising IndexError.
    """
    if not self.newdoc_if_empty_line:
        return
    bundles = document.bundles
    if bundles and bundles[0].trees:
        bundles[0].trees[0].newdoc = True

0 commit comments

Comments
 (0)