udapi
diff --git a/‎docs/conf.py
Lines changed: 4 additions & 4 deletions b/‎docs/conf.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/requirements.txt
Lines changed: 1 addition & 0 deletions b/‎docs/requirements.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎udapi/block/corefud/stats.py
Lines changed: 3 additions & 0 deletions b/‎udapi/block/corefud/stats.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎udapi/block/read/conllu.py
Lines changed: 13 additions & 8 deletions b/‎udapi/block/read/conllu.py
Lines changed: 13 additions & 8 deletions
diff --git a/‎udapi/block/read/text.py
Lines changed: 57 additions & 0 deletions b/‎udapi/block/read/text.py
Lines changed: 57 additions & 0 deletions
diff --git a/‎udapi/block/ud/addmwt.py
Lines changed: 32 additions & 2 deletions b/‎udapi/block/ud/addmwt.py
Lines changed: 32 additions & 2 deletions
diff --git a/‎udapi/block/ud/fixpunct.py
Lines changed: 33 additions & 20 deletions b/‎udapi/block/ud/fixpunct.py
Lines changed: 33 additions & 20 deletions
diff --git a/‎udapi/block/ud/markbugs.py
Lines changed: 2 additions & 2 deletions b/‎udapi/block/ud/markbugs.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎udapi/block/udpipe/base.py
Lines changed: 11 additions & 4 deletions b/‎udapi/block/udpipe/base.py
Lines changed: 11 additions & 4 deletions
@@ -51,7 +51,7 @@
 
 # General information about the project.
 project = 'Udapi'
-copyright = '2017, Martin Popel'
+copyright = '2023, Martin Popel'
 author = 'Martin Popel'
 
 # The version info for the project you're documenting, acts as replacement for
@@ -61,14 +61,14 @@
 # The short X.Y version.
 version = '0'
 # The full version, including alpha/beta/rc tags.
-release = '2'
+release = '3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -167,7 +167,7 @@ def run_apidoc(_):
     module = os.path.abspath(os.path.join(cur_dir, "..", "udapi"))
     print(module)
 
-    from sphinx.apidoc import main
+    from sphinx.ext.apidoc import main
     main(['--separate', '-o', cur_dir, module, '--force'])
 
 def setup(app):
 
@@ -1,3 +1,4 @@
 colorama>=0.4.6
 termcolor
 ufal.udpipe
+sphinx_rtd_theme
@@ -22,6 +22,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entiti
         self.counter = Counter()
         self.mentions = 0
         self.entities = 0
+        self.singletons = 0
         self.total_nodes = 0
         self.longest_mention = 0
         self.longest_entity = 0
@@ -32,6 +33,8 @@ def process_document(self, doc):
         self.total_nodes += len(list(doc.nodes))
         for entity in doc.coref_entities:
             len_mentions = len(entity.mentions)
+            if len_mentions == 1:
+                self.singletons += 1
             if len_mentions == 1 and self.exclude_singletons:
                 continue
             elif len_mentions > 1 and self.exclude_nonsingletons:
 
@@ -82,22 +82,27 @@ def parse_comment_line(self, line, root):
 
     def read_trees(self):
         if not self.max_docs:
+            # Valid CoNLL-U files must have sentences separated by a single empty line.
+            # However, some users have to work with invalid files e.g. ending with two empty lines.
+            # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow
+            # as s.split('\n\n') and this time is negligible
+            # relative to the main CoNLL-U parsing in read_tree_from_lines().
             return [self.read_tree_from_lines(s.split('\n')) for s in
-                    self.filehandle.read().split('\n\n') if s]
+                    re.split(r'\n\n+', self.filehandle.read()) if s]
         # udapi.core.basereader takes care about the max_docs parameter.
         # However, we can make the loading much faster by not reading
         # the whole file if the user wants just first N documents.
         trees, lines, loaded_docs = [], [], 0
         for line in self.filehandle:
             line = line.rstrip()
             if line == '':
-               tree = self.read_tree_from_lines(lines)
-               lines = []
-               if tree.newdoc:
-                   if loaded_docs == self.max_docs:
-                       return trees
-                   loaded_docs += 1
-               trees.append(tree)
+                tree = self.read_tree_from_lines(lines)
+                lines = []
+                if tree.newdoc:
+                    if loaded_docs == self.max_docs:
+                        return trees
+                    loaded_docs += 1
+                trees.append(tree)
             else:
                 lines.append(line)
         return
 
@@ -0,0 +1,57 @@
+"""Text class is a reader for word-wrapped plain-text files."""
+from udapi.core.basereader import BaseReader
+from udapi.core.root import Root
+
+
+class Text(BaseReader):
+    r"""A reader for plain-text files with sentences on one or more lines.
+    
+    Sentences are separated by one or more empty lines.
+    Newlines within sentences are substituted by a space.
+
+    Args:
+    rstrip: a set of characters to be stripped from the end of each line.
+        Default='\r\n '. You can use rstrip='\n' if you want to preserve
+        any space or '\r' (Carriage Return) at end of line,
+        so that `udpipe.Base` keeps these characters in `SpacesAfter`.
+        As most blocks do not expect whitespace other than a space to appear
+        in the processed text, using this feature is at your own risk.
+    """
+    def __init__(self, rstrip='\r\n ', **kwargs):
+        self.rstrip = rstrip
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def is_multizone_reader():
+        """Can this reader read bundles which contain more than one zone?
+
+        This implementation always returns False.
+        """
+        return False
+
+    def read_tree(self, document=None):
+        if self.filehandle is None:
+            return None
+        lines = []
+        line = None
+        while True:
+            line = self.filehandle.readline()
+            # If readline() returns an empty string, the end of the file has been
+            # reached, while a blank line is represented by '\n'
+            # (or '\r\n' if reading a Windows file on a Unix machine).
+            if line == '':
+                if not lines:
+                    return None
+                else:
+                    break
+            elif line in {'\n', '\r\n'}:
+                if not lines:
+                    continue
+                else:
+                    break
+            else:
+                lines.append(line.rstrip(self.rstrip))
+
+        root = Root()
+        root.text = " ".join(lines)
+        return root
@@ -15,6 +15,9 @@ def process_node(self, node):
             orig_attr[attr] = getattr(node, attr)
         orig_attr['feats'] = node.feats.copy()
         orig_attr['misc'] = node.misc.copy()
+        # Defaults for the newly created MWT
+        mwt_misc = node.misc.copy()
+        mwt_form = node.form
 
         forms = analysis['form'].split()
         main = analysis.get('main', 0)
@@ -37,6 +40,7 @@ def process_node(self, node):
         elif orig_attr['form'][0].isupper():
             nodes[0].form = nodes[0].form.title()
 
+        node.misc = None
         for attr in 'lemma upos xpos feats deprel misc'.split():
             if attr in analysis:
                 values = analysis[attr].split()
@@ -47,6 +51,17 @@ def process_node(self, node):
                             logging.warning("%s = %s" % (attr, analysis.get(attr, '')))
                     if values[i] == '*':
                         setattr(new_node, attr, orig_attr[attr])
+                        # No MISC attribute should be duplicated on the word level and token level,
+                        # so if copying MISC to a new_node, delete mwt_misc.
+                        # However, SpaceAfter should be annotated only on the token level,
+                        # so make sure it is not accidentally copied on the word level.
+                        if attr == 'misc':
+                            orig_attr['misc'].clear()
+                            for a in 'SpaceAfter SpacesAfter SpacesBefore'.split():
+                                if new_node.misc[a]:
+                                    orig_attr['misc'][a] = new_node.misc[a]
+                                    del new_node.misc[a]
+
                     elif attr == 'feats' and '*' in values[i]:
                         new_node.feats = values[i]
                         for feat_name, feat_value in list(new_node.feats.items()):
@@ -55,8 +70,23 @@ def process_node(self, node):
                     else:
                         setattr(new_node, attr, values[i])
 
-        mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc'])
-        node.misc = None
+        # Entity (coreference) annotation should be only on the word level,
+        # so make sure it does not stay on the token level.
+        if mwt_misc['Entity']:
+            nodes[0].misc['Entity'] = mwt_misc['Entity']
+            del mwt_misc['Entity']
+
+        # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
+        if node.multiword_token:
+            mwt_words = node.multiword_token.words
+            mwt_form = node.multiword_token.form
+            if node.multiword_token.misc:
+                mwt_misc.update(node.multiword_token.misc)
+            node.multiword_token.remove()
+            mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes
+            nodes = mwt_words
+
+        mwt = node.root.create_multiword_token(nodes, mwt_form, mwt_misc)
         self.postprocess_mwt(mwt)
 
     def multiword_analysis(self, node):
 
@@ -50,22 +50,34 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar
         Args:
         check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT.
             The default is false, which means that fixed punctuation is detected only
-            based on the form with the exception of single quote / apostrophe character,
-            which is frequently ambiguous, so UPOS=PUNCT is checked always.
-        copy_to_enhanced: for all PUNCT nodes, let the enhanced depencies be the same
-            as the basic dependencies.
+            based on the form, with the exception of the single and double quote characters,
+            which are frequently ambiguous*, so their UPOS=PUNCT is always checked.
+            *) A single quote can be an apostrophe. A double quote as a NOUN can be the inch symbol.
+        copy_to_enhanced: for all nodes with upos=PUNCT, let the enhanced dependencies
+            be the same as the basic dependencies.
         """
         super().__init__(**kwargs)
         self._punct_type = None
         self.check_paired_punct_upos = check_paired_punct_upos
         self.copy_to_enhanced = copy_to_enhanced
 
+    def _is_punct(self, node):
+        if node.upos == 'PUNCT':
+            return True
+        if self.check_paired_punct_upos:
+            return False
+        if node.form in "'\"":
+            return False
+        if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values():
+            return True
+        return False
+
     def process_tree(self, root):
         # First, make sure no PUNCT has children.
         # This may introduce multiple subroots, which will be fixed later on
         # (preventing to temporarily create multiple subroots here would prevent fixing some errors).
         for node in root.descendants:
-            while node.parent.upos == 'PUNCT':
+            while self._is_punct(node.parent):
                 node.parent = node.parent.parent
 
         # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type.
@@ -77,7 +89,7 @@ def process_tree(self, root):
         self._punct_type = [None] * (1 + len(root.descendants))
         for node in root.descendants:
             if self._punct_type[node.ord] != 'closing':
-                closing_punct = PAIRED_PUNCT.get(node.form, None)
+                closing_punct = PAIRED_PUNCT.get(node.form)
                 if closing_punct is not None:
                     self._fix_paired_punct(root, node, closing_punct)
 
@@ -99,6 +111,8 @@ def process_tree(self, root):
         # This may not hold if the original subroot was a paired punctuation, which was rehanged.
         if root.children[0].udeprel != 'root':
             root.children[0].udeprel = 'root'
+            if self.copy_to_enhanced:
+                root.children[0].deps = [{'parent': root, 'deprel': 'root'}]
             for another_node in root.children[0].descendants:
                 if another_node.udeprel == 'root':
                     another_node.udeprel = 'punct'
@@ -107,7 +121,7 @@ def process_tree(self, root):
         if self.copy_to_enhanced:
             for node in root.descendants:
                 if node.upos == 'PUNCT':
-                    node.deps = [{'parent': node.parent, 'deprel': 'punct'}]
+                    node.deps = [{'parent': node.parent, 'deprel': node.deprel}]
 
     def _fix_subord_punct(self, node):
         # Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
@@ -148,13 +162,13 @@ def _fix_subord_punct(self, node):
         if l_cand is None or l_cand.is_root():
             l_cand, l_path = None, []
         else:
-            while (not l_cand.parent.is_root() and l_cand.parent.precedes(node)
-                   and not node.precedes(l_cand.descendants(add_self=1)[-1])):
+            while (not l_cand.parent.is_root() and l_cand.parent < node
+                   and not node < l_cand.descendants(add_self=1)[-1]):
                 l_cand = l_cand.parent
                 l_path.append(l_cand)
         if r_cand is not None:
-            while (not r_cand.parent.is_root() and node.precedes(r_cand.parent)
-                   and not r_cand.descendants(add_self=1)[0].precedes(node)):
+            while (not r_cand.parent.is_root() and node < r_cand.parent
+                   and not r_cand.descendants(add_self=1)[0] < node):
                 r_cand = r_cand.parent
                 r_path.append(r_cand)
 
@@ -203,7 +217,7 @@ def _causes_gap(self, node):
 
     def _fix_paired_punct(self, root, opening_node, closing_punct):
         if (self.check_paired_punct_upos
-            or opening_node.form == "'") and opening_node.upos != 'PUNCT':
+            or opening_node.form in "'\"") and opening_node.upos != 'PUNCT':
             return
         nested_level = 0
         for node in root.descendants[opening_node.ord:]:
@@ -226,8 +240,8 @@ def _fix_pair(self, root, opening_node, closing_node):
             if node == opening_node or node == closing_node:
                 continue
             # If this is a node inside of the pair, is its parent outside?
-            if opening_node.precedes(node) and node.precedes(closing_node):
-                if node.parent.precedes(opening_node) or closing_node.precedes(node.parent):
+            if node > opening_node and node < closing_node:
+                if node.parent < opening_node or node.parent > closing_node:
                     if node.upos == 'PUNCT':
                         punct_heads.append(node)
                     else:
@@ -236,12 +250,11 @@ def _fix_pair(self, root, opening_node, closing_node):
             # they also must not cause non-projectivity of other relations. This could
             # happen if an outside node is attached to an inside node. To account for
             # this, mark the inside parent as a head, too.
-            else:
-                if opening_node.precedes(node.parent) and node.parent.precedes(closing_node):
-                    if node.parent.upos == 'PUNCT':
-                        punct_heads.append(node.parent)
-                    else:
-                        heads.append(node.parent)
+            elif node.parent > opening_node and node.parent < closing_node:
+                if node.parent.upos == 'PUNCT':
+                    punct_heads.append(node.parent)
+                else:
+                    heads.append(node.parent)
 
         # Punctuation should not have children, but if there is no other head candidate,
         # let's break this rule.
 
@@ -131,9 +131,9 @@ def process_node(self, node):
             self.log(node, 'degree-upos',
                      'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos))
 
-        subject_children = [n for n in node.children if 'subj' in n.udeprel]
+        subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer']
         if len(subject_children) > 1:
-            self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child')
+            self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child')
 
         object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')]
         if len(object_children) > 1:
 
@@ -1,6 +1,7 @@
 """Block udpipe.Base for tagging and parsing using UDPipe."""
 from udapi.core.block import Block
 from udapi.tool.udpipe import UDPipe
+from udapi.tool.udpipeonline import UDPipeOnline
 from udapi.core.bundle import Bundle
 
 KNOWN_MODELS = {
@@ -118,11 +119,11 @@ class Base(Block):
     """Base class for all UDPipe blocks."""
 
     # pylint: disable=too-many-arguments
-    def __init__(self, model=None, model_alias=None,
+    def __init__(self, model=None, model_alias=None, online=False,
                  tokenize=True, tag=True, parse=True, resegment=False, **kwargs):
         """Create the udpipe.En block object."""
         super().__init__(**kwargs)
-        self.model, self.model_alias = model, model_alias
+        self.model, self.model_alias, self.online = model, model_alias, online
         self._tool = None
         self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
 
@@ -134,8 +135,14 @@ def tool(self):
         if not self.model:
             if not self.model_alias:
                 raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!')
-            self.model = KNOWN_MODELS[self.model_alias]
-        self._tool = UDPipe(model=self.model)
+            if self.online:
+                self.model = self.model_alias
+            else:
+                self.model = KNOWN_MODELS[self.model_alias]
+        if self.online:
+            self._tool = UDPipeOnline(model=self.model)
+        else:
+            self._tool = UDPipe(model=self.model)
         return self._tool
 
     def process_document(self, doc):