udapi
diff --git a/‎requirements.txt
Lines changed: 1 addition & 1 deletion b/‎requirements.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎udapi/block/corefud/countgaps.py
Lines changed: 94 additions & 0 deletions b/‎udapi/block/corefud/countgaps.py
Lines changed: 94 additions & 0 deletions
diff --git a/‎udapi/block/corefud/fixinterleaved.py
Lines changed: 5 additions & 3 deletions b/‎udapi/block/corefud/fixinterleaved.py
Lines changed: 5 additions & 3 deletions
diff --git a/‎udapi/block/corefud/guessspan.py
Lines changed: 33 additions & 0 deletions b/‎udapi/block/corefud/guessspan.py
Lines changed: 33 additions & 0 deletions
diff --git a/‎udapi/block/corefud/printmentions.py
Lines changed: 23 additions & 3 deletions b/‎udapi/block/corefud/printmentions.py
Lines changed: 23 additions & 3 deletions
diff --git a/‎udapi/block/corefud/removenocorefentities.py
Lines changed: 21 additions & 0 deletions b/‎udapi/block/corefud/removenocorefentities.py
Lines changed: 21 additions & 0 deletions
diff --git a/‎udapi/block/read/conllu.py
Lines changed: 26 additions & 4 deletions b/‎udapi/block/read/conllu.py
Lines changed: 26 additions & 4 deletions
diff --git a/‎udapi/block/read/sentences.py
Lines changed: 17 additions & 2 deletions b/‎udapi/block/read/sentences.py
Lines changed: 17 additions & 2 deletions
@@ -1,3 +1,3 @@
-colorama
+colorama>=0.4.6
 termcolor
 ufal.udpipe
@@ -0,0 +1,94 @@
+from udapi.core.block import Block
+from collections import defaultdict, Counter
+
+class CountGaps(Block):
+    """Block corefud.CountGaps counts gaps, i.e. sequences of sentences with no coref mention.
+
+    Gaps are counted as sentence sequences ("seq") and as whole mention-less paragraphs ("par").
+    """
+
+    def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs):
+        super().__init__(**kwargs)
+        self.report_per_newdoc = report_per_newdoc
+        self.report_per_file = report_per_file
+        self.report_total = report_total
+        self._total_counter = defaultdict(Counter)
+
+    def _report_stats(self, counter, header_id=None):
+        """Print the histogram `counter` (length: count) and the sum of length*count."""
+        if header_id:
+            print(f"============ {header_id} ============")
+        for key in sorted(counter):
+            print(f"{key:2d}: {counter[key]}")
+        print("-------")
+        print(f"SUM: {sum(k * counter[k] for k in counter)}")
+
+    def _count_empty_seqs(self, empty_seqs):
+        """Return a Counter mapping each sequence length to the number of such sequences."""
+        return Counter(len(seq) for seq in empty_seqs)
+
+    def process_document(self, doc):
+        """Count gaps in each newdoc of `doc`, print the requested reports, update the totals."""
+        file_counters = defaultdict(Counter)
+        empty_seqs, empty_pars = [], []  # finished gaps of the current newdoc
+        curr_seq, curr_par = [], []  # sent_ids of mention-less sentences in the open gap/paragraph
+        is_empty_par = True
+        newdoc = None
+        for i, tree in enumerate(doc.trees):
+            if tree.newdoc:
+                if i:
+                    if curr_seq:
+                        empty_seqs.append(curr_seq)
+                    newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
+                    file_counters["seq"].update(newdoc_seq_counter)
+                    if is_empty_par:
+                        empty_pars.append(curr_par)
+                    newdoc_par_counter = self._count_empty_seqs(empty_pars)
+                    file_counters["par"].update(newdoc_par_counter)
+                    if self.report_per_newdoc:
+                        self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}")
+                        self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}")
+                newdoc = tree.newdoc
+                empty_seqs, empty_pars = [], []
+                curr_seq, curr_par = [], []
+                is_empty_par = True
+            if tree.newpar:
+                # curr_par is empty only if no sentence precedes this newpar; nothing to count then.
+                if not tree.newdoc and is_empty_par and curr_par:
+                    empty_pars.append(curr_par)
+                curr_par = []
+                is_empty_par = True
+            # A sentence without mentions extends the open gap; a sentence with a mention closes it.
+            has_mention = any(node.coref_mentions for node in tree.descendants)
+            if not has_mention:
+                curr_seq.append(tree.sent_id)
+                curr_par.append(tree.sent_id)
+            else:
+                if curr_seq:
+                    empty_seqs.append(curr_seq)
+                    curr_seq = []
+                is_empty_par = False
+        # No newdoc tree follows the last newdoc, so its counts must be flushed here.
+        if curr_seq:
+            empty_seqs.append(curr_seq)
+        newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
+        file_counters["seq"].update(newdoc_seq_counter)
+        # Count the last paragraph only if none of its sentences has a mention (as is done above).
+        if is_empty_par and curr_par:
+            empty_pars.append(curr_par)
+        newdoc_par_counter = self._count_empty_seqs(empty_pars)
+        file_counters["par"].update(newdoc_par_counter)
+        if self.report_per_newdoc:
+            self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}")
+            self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}")
+        if self.report_per_file:
+            self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE")
+            self._report_stats(file_counters["par"], header_id="PAR STATS, FILE")
+        # Accumulate the per-file counts; process_end reports them if report_total is set.
+        self._total_counter["seq"].update(file_counters["seq"])
+        self._total_counter["par"].update(file_counters["par"])
+
+    def process_end(self):
+        if self.report_total:
+            self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL")
+            self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL")
@@ -3,7 +3,9 @@
 import itertools
 
 class FixInterleaved(Block):
-    """Fix mentions with interleaved or crossing spans."""
+    """Fix mentions with interleaved or crossing spans.
+       https://github.com/ufal/corefUD/issues/25
+    """
 
     def __init__(self, same_entity_only=True, both_discontinuous=False,
                  crossing_only=False, nested_same_subspan=True, **kwargs):
@@ -58,8 +60,8 @@ def process_tree(self, tree):
                 pass
             deleted.add(mB)
 
-            # By changing the mA.words, we could have create another error:
-            # making the span same as another mention. Let's fix it
+            # By changing the mA.words, we could have created another error:
+            # making the span same as another mention. Let's fix it.
             sA = set(mA.words)
             for mC in mentions:
                 if mC in deleted or mC is mA or mC is mB:
 
@@ -0,0 +1,33 @@
+from udapi.core.block import Block
+
+class GuessSpan(Block):
+    """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head"""
+
+    def process_coref_mention(self, mention):
+        mwords = mention.head.descendants(add_self=True)
+        # TODO add heuristics from corefud.PrintMentions almost_forest=1
+
+        # Add empty nodes that are causing gaps.
+        # A node "within the span" whose enhanced parent is in the mention
+        # must be added to the mention as well.
+        # "within the span" includes also empty nodes "on the boundary".
+        # However, don't add empty nodes which are in a gap caused by non-empty nodes.
+        to_add = []
+        min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1
+        max_ord = int(mwords[-1].ord) + 1
+        root = mention.head.root
+        for empty in root.empty_nodes:
+            if empty in mwords:
+                continue
+            if empty.ord > max_ord:
+                break
+            if empty.ord > min_ord:
+                if any(enh['parent'] in mwords for enh in empty.deps):
+                    to_add.append(empty)
+                elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1:
+                    prev_nonempty = root.descendants[int(empty.ord) - 1]
+                    next_nonempty = root.descendants[int(empty.ord)]
+                    if prev_nonempty in mwords and next_nonempty in mwords:
+                        to_add.append(empty)
+                    #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}'
+        mention.words = sorted(mwords + to_add)
@@ -10,9 +10,9 @@ class PrintMentions(Block):
     def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                  forest='include', almost_forest='include', oneword='include', singleton='include',
                  empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
-                 print_total=True,
+                 print_total=True, print_should=True,
                  print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
-                 minimize_cross=True, color=True, attributes='form,upos,deprel',
+                 minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
                  print_undef_as='_', print_doc_meta=True, print_comments=False,
                  mark='(Mark)', hints=True, layout='classic',
                  **kwargs):
@@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i
             random.seed(42)
         self.print_other_forms = print_other_forms
         self.print_total = print_total,
+        self.print_should = print_should
         print_class = TextModeTreesHtml if html else TextModeTrees
         self.print_block = print_class(
                 print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent,
@@ -61,7 +62,9 @@ def _ok(self, condition, value):
         return (condition and value == 'only') or (not condition and value=='exclude')
 
     def _is_auxiliary_etc(self, node):
-        if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}:
+        if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}:
+            return True
+        if node.deprel == 'advmod:emph':
             return True
         if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
             return True
@@ -79,8 +82,25 @@ def _is_forest(self, mention, mwords, almost):
             for ch in w.children:
                 if ch not in mwords:
                     if not almost:
+                        if self.print_should:
+                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                         return False
+                    # Punctuation before or after the mention span can depend on any of the mwords
+                    # without breaking the almost_forest property.
+                    # According to the UD guidelines, it should depend on the highest node within the phrase,
+                    # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines.
+                    if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]):
+                        continue
+                    # Some auxiliary words (e.g. prepositions) may be excluded from the mention span
+                    # without breaking the almost_forest property, but they need to depend
+                    # on the mention head (or if the mention is not a catena, they need to depend
+                    # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords).
+                    # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head),
+                    # but          "(e1[1/2] John) with (e1[2/2] Mary)" is not almost_forest
+                    # because "with" depends on "Mary", which is not the mention head (nor a potential mention head).
                     if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
+                        if self.print_should:
+                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                         return False
         return True
 
 
@@ -0,0 +1,21 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import re
+import logging
+
+class RemoveNoCorefEntities(Block):
+    """
+    Some corpora (e.g., AnCora) include annotation of named entities that are
+    not annotated for coreference. To distinguish them, their cluster ID starts
+    with 'NOCOREF' (optionally followed by entity type, so that one cluster
+    still has just one type). We may want to remove such entities from datasets
+    that are used to train coreference resolvers, to prevent the resolvers from
+    thinking that all members of a NOCOREF cluster are coreferential. That is
+    what this block does.
+    """
+
+    def process_document(self, doc):
+        entities = doc.coref_entities
+        if not entities:
+            return
+        doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)}
@@ -73,16 +73,34 @@ def parse_comment_line(self, line, root):
         if entity_match is not None:
             global_entity = entity_match.group(1)
             if self._global_entity and self._global_entity != global_entity:
-                logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity))
+                logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}")
             self._global_entity = global_entity
             root.comment += '$GLOBAL.ENTITY\n'
             return
 
         root.comment += line[1:] + "\n"
 
     def read_trees(self):
-        return [self.read_tree_from_lines(s.split('\n')) for s in
-                self.filehandle.read().split('\n\n') if s]
+        if not self.max_docs:
+            return [self.read_tree_from_lines(s.split('\n')) for s in
+                    self.filehandle.read().split('\n\n') if s]
+        # udapi.core.basereader takes care of the max_docs parameter, but loading is much
+        # faster if we stop reading once the first max_docs documents are complete.
+        # If the file ends earlier, all the trees read so far are returned.
+        trees, lines, loaded_docs = [], [], 0
+        for line in self.filehandle:
+            line = line.rstrip()
+            if line == '':
+                tree = self.read_tree_from_lines(lines)
+                lines = []
+                if tree.newdoc:
+                    if loaded_docs == self.max_docs:
+                        return trees
+                    loaded_docs += 1
+                trees.append(tree)
+            else:
+                lines.append(line)
+        return trees
 
     def read_tree(self):
         if self.filehandle is None:
@@ -193,7 +211,11 @@ def read_tree_from_lines(self, lines):
 
         # Create multi-word tokens.
         for fields in mwts:
-            range_start, range_end = fields[0].split('-')
+            try:
+                range_start, range_end = fields[0].split('-')
+            except ValueError:
+                logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}")
+                raise
             words = nodes[int(range_start):int(range_end) + 1]
             root.create_multiword_token(words, form=fields[1], misc=fields[-1])
 
 
@@ -9,15 +9,21 @@ class Sentences(BaseReader):
     Args:
     ignore_empty_lines: if True, delete empty lines from the input.
         Default=False.
+    newdoc_if_empty_line: if True, empty lines mark document boundaries,
+        which are marked with `root.newdoc`. Default=False.
     rstrip: a set of characters to be stripped from the end of each line.
         Default='\r\n '. You can use rstrip='\n' if you want to preserve
         any space or '\r' (Carriage Return) at end of line,
         so that `udpipe.Base` keeps these characters in `SpacesAfter`.
         As most blocks do not expect whitespace other than a space to appear
         in the processed text, using this feature is at your own risk.
     """
-    def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs):
+    def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False,
+                 rstrip='\r\n ', **kwargs):
+        if ignore_empty_lines and newdoc_if_empty_line:
+            raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line")
         self.ignore_empty_lines = ignore_empty_lines
+        self.newdoc_if_empty_line = newdoc_if_empty_line
         self.rstrip = rstrip
         super().__init__(**kwargs)
 
@@ -38,11 +44,20 @@ def read_tree(self, document=None):
         # (or '\r\n' if reading a Windows file on Unix machine).
         if line == '':
             return None
-        if self.ignore_empty_lines:
+        preceded_by_empty_line = False
+        if self.ignore_empty_lines or self.newdoc_if_empty_line:
             while line in {'\n', '\r\n'}:
+                preceded_by_empty_line = True
                 line = self.filehandle.readline()
                 if line == '':
                     return None
         root = Root()
         root.text = line.rstrip(self.rstrip)
+        if self.newdoc_if_empty_line and preceded_by_empty_line:
+            root.newdoc = True
         return root
+
+    # The first line in a file also starts a new document (nothing to mark if no tree was read).
+    def after_process_document(self, document):
+        if self.newdoc_if_empty_line and document.bundles:
+            document.bundles[0].trees[0].newdoc = True