Skip to content

Commit 9af81d5

Browse files
authored
Merge branch 'master' into fix_shift_edeps
2 parents a4a183d + 28dbce9 commit 9af81d5

File tree

19 files changed

+366
-63
lines changed

19 files changed

+366
-63
lines changed

docs/conf.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151

5252
# General information about the project.
5353
project = 'Udapi'
54-
copyright = '2017, Martin Popel'
54+
copyright = '2023, Martin Popel'
5555
author = 'Martin Popel'
5656

5757
# The version info for the project you're documenting, acts as replacement for
@@ -61,14 +61,14 @@
6161
# The short X.Y version.
6262
version = '0'
6363
# The full version, including alpha/beta/rc tags.
64-
release = '2'
64+
release = '3'
6565

6666
# The language for content autogenerated by Sphinx. Refer to documentation
6767
# for a list of supported languages.
6868
#
6969
# This is also used if you do content translation via gettext catalogs.
7070
# Usually you set "language" from the command line for these cases.
71-
language = None
71+
language = "en"
7272

7373
# List of patterns, relative to source directory, that match files and
7474
# directories to ignore when looking for source files.
@@ -167,7 +167,7 @@ def run_apidoc(_):
167167
module = os.path.abspath(os.path.join(cur_dir, "..", "udapi"))
168168
print(module)
169169

170-
from sphinx.apidoc import main
170+
from sphinx.ext.apidoc import main
171171
main(['--separate', '-o', cur_dir, module, '--force'])
172172

173173
def setup(app):

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
colorama>=0.4.6
22
termcolor
33
ufal.udpipe
4+
sphinx_rtd_theme

udapi/block/corefud/stats.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entiti
2222
self.counter = Counter()
2323
self.mentions = 0
2424
self.entities = 0
25+
self.singletons = 0
2526
self.total_nodes = 0
2627
self.longest_mention = 0
2728
self.longest_entity = 0
@@ -32,6 +33,8 @@ def process_document(self, doc):
3233
self.total_nodes += len(list(doc.nodes))
3334
for entity in doc.coref_entities:
3435
len_mentions = len(entity.mentions)
36+
if len_mentions == 1:
37+
self.singletons += 1
3538
if len_mentions == 1 and self.exclude_singletons:
3639
continue
3740
elif len_mentions > 1 and self.exclude_nonsingletons:

udapi/block/read/conllu.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,22 +82,27 @@ def parse_comment_line(self, line, root):
8282

8383
def read_trees(self):
8484
if not self.max_docs:
85+
# Valid CoNLL-U files must have sentences separated by a single empty line.
86+
# However, some users have to work with invalid files e.g. ending with two empty lines.
87+
# It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow
88+
# as s.split('\n\n') and this time is negligible
89+
# relative to the main CoNLL-U parsing in read_tree_from_lines().
8590
return [self.read_tree_from_lines(s.split('\n')) for s in
86-
self.filehandle.read().split('\n\n') if s]
91+
re.split(r'\n\n+', self.filehandle.read()) if s]
8792
# udapi.core.basereader takes care of the max_docs parameter.
8893
# However, we can make the loading much faster by not reading
8994
# the whole file if the user wants just first N documents.
9095
trees, lines, loaded_docs = [], [], 0
9196
for line in self.filehandle:
9297
line = line.rstrip()
9398
if line == '':
94-
tree = self.read_tree_from_lines(lines)
95-
lines = []
96-
if tree.newdoc:
97-
if loaded_docs == self.max_docs:
98-
return trees
99-
loaded_docs += 1
100-
trees.append(tree)
99+
tree = self.read_tree_from_lines(lines)
100+
lines = []
101+
if tree.newdoc:
102+
if loaded_docs == self.max_docs:
103+
return trees
104+
loaded_docs += 1
105+
trees.append(tree)
101106
else:
102107
lines.append(line)
103108
return

udapi/block/read/text.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Text class is a reader for word-wrapped plain-text files."""
2+
from udapi.core.basereader import BaseReader
3+
from udapi.core.root import Root
4+
5+
6+
class Text(BaseReader):
7+
r"""A reader for plain-text files with sentences on one or more lines.
8+
9+
Sentences are separated by one or more empty lines.
10+
Newlines within sentences are substituted by a space.
11+
12+
Args:
13+
rstrip: a set of characters to be stripped from the end of each line.
14+
Default='\r\n '. You can use rstrip='\n' if you want to preserve
15+
any space or '\r' (Carriage Return) at end of line,
16+
so that `udpipe.Base` keeps these characters in `SpacesAfter`.
17+
As most blocks do not expect whitespace other than a space to appear
18+
in the processed text, using this feature is at your own risk.
19+
"""
20+
def __init__(self, rstrip='\r\n ', **kwargs):
21+
self.rstrip = rstrip
22+
super().__init__(**kwargs)
23+
24+
@staticmethod
25+
def is_multizone_reader():
26+
"""Can this reader read bundles which contain more zones?
27+
28+
This implementation always returns False.
29+
"""
30+
return False
31+
32+
def read_tree(self, document=None):
33+
if self.filehandle is None:
34+
return None
35+
lines = []
36+
line = None
37+
while True:
38+
line = self.filehandle.readline()
39+
# if readline() returns an empty string, the end of the file has been
40+
# reached, while a blank line is represented by '\n'
41+
# (or '\r\n' if reading a Windows file on Unix machine).
42+
if line == '':
43+
if not lines:
44+
return None
45+
else:
46+
break
47+
elif line in {'\n', '\r\n'}:
48+
if not lines:
49+
continue
50+
else:
51+
break
52+
else:
53+
lines.append(line.rstrip(self.rstrip))
54+
55+
root = Root()
56+
root.text = " ".join(lines)
57+
return root

udapi/block/ud/addmwt.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ def process_node(self, node):
1515
orig_attr[attr] = getattr(node, attr)
1616
orig_attr['feats'] = node.feats.copy()
1717
orig_attr['misc'] = node.misc.copy()
18+
# Defaults for the newly created MWT
19+
mwt_misc = node.misc.copy()
20+
mwt_form = node.form
1821

1922
forms = analysis['form'].split()
2023
main = analysis.get('main', 0)
@@ -37,6 +40,7 @@ def process_node(self, node):
3740
elif orig_attr['form'][0].isupper():
3841
nodes[0].form = nodes[0].form.title()
3942

43+
node.misc = None
4044
for attr in 'lemma upos xpos feats deprel misc'.split():
4145
if attr in analysis:
4246
values = analysis[attr].split()
@@ -47,6 +51,17 @@ def process_node(self, node):
4751
logging.warning("%s = %s" % (attr, analysis.get(attr, '')))
4852
if values[i] == '*':
4953
setattr(new_node, attr, orig_attr[attr])
54+
# No MISC attribute should be duplicated on the word level and token level,
55+
# so if copying MISC to a new_node, delete mwt_misc.
56+
# However, SpaceAfter should be annotated only on the token level,
57+
# so make sure it is not accidentally copied on the word level.
58+
if attr == 'misc':
59+
orig_attr['misc'].clear()
60+
for a in 'SpaceAfter SpacesAfter SpacesBefore'.split():
61+
if new_node.misc[a]:
62+
orig_attr['misc'][a] = new_node.misc[a]
63+
del new_node.misc[a]
64+
5065
elif attr == 'feats' and '*' in values[i]:
5166
new_node.feats = values[i]
5267
for feat_name, feat_value in list(new_node.feats.items()):
@@ -55,8 +70,23 @@ def process_node(self, node):
5570
else:
5671
setattr(new_node, attr, values[i])
5772

58-
mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc'])
59-
node.misc = None
73+
# Entity (coreference) annotation should be only on the word level,
74+
# so make sure it does not stay on the token level.
75+
if mwt_misc['Entity']:
76+
nodes[0].misc['Entity'] = mwt_misc['Entity']
77+
del mwt_misc['Entity']
78+
79+
# If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
80+
if node.multiword_token:
81+
mwt_words = node.multiword_token.words
82+
mwt_form = node.multiword_token.form
83+
if node.multiword_token.misc:
84+
mwt_misc.update(node.multiword_token.misc)
85+
node.multiword_token.remove()
86+
mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes
87+
nodes = mwt_words
88+
89+
mwt = node.root.create_multiword_token(nodes, mwt_form, mwt_misc)
6090
self.postprocess_mwt(mwt)
6191

6292
def multiword_analysis(self, node):

udapi/block/ud/fixpunct.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,34 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar
5050
Args:
5151
check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT.
5252
The default is false, which means that fixed punctuation is detected only
53-
based on the form with the exception of single quote / apostrophe character,
54-
which is frequently ambiguous, so UPOS=PUNCT is checked always.
55-
copy_to_enhanced: for all PUNCT nodes, let the enhanced depencies be the same
56-
as the basic dependencies.
53+
based on the form with the exception of single & double quote characters,
54+
which are frequently ambiguous*, so UPOS=PUNCT is always checked.
55+
*) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol.
56+
copy_to_enhanced: for all upos=PUNCT, let the enhanced dependencies
57+
be the same as the basic dependencies.
5758
"""
5859
super().__init__(**kwargs)
5960
self._punct_type = None
6061
self.check_paired_punct_upos = check_paired_punct_upos
6162
self.copy_to_enhanced = copy_to_enhanced
6263

64+
def _is_punct(self, node):
65+
if node.upos == 'PUNCT':
66+
return True
67+
if self.check_paired_punct_upos:
68+
return False
69+
if node.form in "'\"":
70+
return False
71+
if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values():
72+
return True
73+
return False
74+
6375
def process_tree(self, root):
6476
# First, make sure no PUNCT has children.
6577
# This may introduce multiple subroots, which will be fixed later on
6678
# (preventing the temporary creation of multiple subroots here would prevent fixing some errors).
6779
for node in root.descendants:
68-
while node.parent.upos == 'PUNCT':
80+
while self._is_punct(node.parent):
6981
node.parent = node.parent.parent
7082

7183
# Second, fix paired punctuations: quotes and brackets, marking them in _punct_type.
@@ -77,7 +89,7 @@ def process_tree(self, root):
7789
self._punct_type = [None] * (1 + len(root.descendants))
7890
for node in root.descendants:
7991
if self._punct_type[node.ord] != 'closing':
80-
closing_punct = PAIRED_PUNCT.get(node.form, None)
92+
closing_punct = PAIRED_PUNCT.get(node.form)
8193
if closing_punct is not None:
8294
self._fix_paired_punct(root, node, closing_punct)
8395

@@ -99,6 +111,8 @@ def process_tree(self, root):
99111
# This may not hold if the original subroot was a paired punctuation, which was rehanged.
100112
if root.children[0].udeprel != 'root':
101113
root.children[0].udeprel = 'root'
114+
if self.copy_to_enhanced:
115+
root.children[0].deps = [{'parent': root, 'deprel': 'root'}]
102116
for another_node in root.children[0].descendants:
103117
if another_node.udeprel == 'root':
104118
another_node.udeprel = 'punct'
@@ -107,7 +121,7 @@ def process_tree(self, root):
107121
if self.copy_to_enhanced:
108122
for node in root.descendants:
109123
if node.upos == 'PUNCT':
110-
node.deps = [{'parent': node.parent, 'deprel': 'punct'}]
124+
node.deps = [{'parent': node.parent, 'deprel': node.deprel}]
111125

112126
def _fix_subord_punct(self, node):
113127
# Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
@@ -148,13 +162,13 @@ def _fix_subord_punct(self, node):
148162
if l_cand is None or l_cand.is_root():
149163
l_cand, l_path = None, []
150164
else:
151-
while (not l_cand.parent.is_root() and l_cand.parent.precedes(node)
152-
and not node.precedes(l_cand.descendants(add_self=1)[-1])):
165+
while (not l_cand.parent.is_root() and l_cand.parent < node
166+
and not node < l_cand.descendants(add_self=1)[-1]):
153167
l_cand = l_cand.parent
154168
l_path.append(l_cand)
155169
if r_cand is not None:
156-
while (not r_cand.parent.is_root() and node.precedes(r_cand.parent)
157-
and not r_cand.descendants(add_self=1)[0].precedes(node)):
170+
while (not r_cand.parent.is_root() and node < r_cand.parent
171+
and not r_cand.descendants(add_self=1)[0] < node):
158172
r_cand = r_cand.parent
159173
r_path.append(r_cand)
160174

@@ -203,7 +217,7 @@ def _causes_gap(self, node):
203217

204218
def _fix_paired_punct(self, root, opening_node, closing_punct):
205219
if (self.check_paired_punct_upos
206-
or opening_node.form == "'") and opening_node.upos != 'PUNCT':
220+
or opening_node.form in "'\"") and opening_node.upos != 'PUNCT':
207221
return
208222
nested_level = 0
209223
for node in root.descendants[opening_node.ord:]:
@@ -226,8 +240,8 @@ def _fix_pair(self, root, opening_node, closing_node):
226240
if node == opening_node or node == closing_node:
227241
continue
228242
# If this is a node inside of the pair, is its parent outside?
229-
if opening_node.precedes(node) and node.precedes(closing_node):
230-
if node.parent.precedes(opening_node) or closing_node.precedes(node.parent):
243+
if node > opening_node and node < closing_node:
244+
if node.parent < opening_node or node.parent > closing_node:
231245
if node.upos == 'PUNCT':
232246
punct_heads.append(node)
233247
else:
@@ -236,12 +250,11 @@ def _fix_pair(self, root, opening_node, closing_node):
236250
# they also must not cause non-projectivity of other relations. This could
237251
# happen if an outside node is attached to an inside node. To account for
238252
# this, mark the inside parent as a head, too.
239-
else:
240-
if opening_node.precedes(node.parent) and node.parent.precedes(closing_node):
241-
if node.parent.upos == 'PUNCT':
242-
punct_heads.append(node.parent)
243-
else:
244-
heads.append(node.parent)
253+
elif node.parent > opening_node and node.parent < closing_node:
254+
if node.parent.upos == 'PUNCT':
255+
punct_heads.append(node.parent)
256+
else:
257+
heads.append(node.parent)
245258

246259
# Punctuation should not have children, but if there is no other head candidate,
247260
# let's break this rule.

udapi/block/ud/markbugs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,9 @@ def process_node(self, node):
131131
self.log(node, 'degree-upos',
132132
'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos))
133133

134-
subject_children = [n for n in node.children if 'subj' in n.udeprel]
134+
subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer']
135135
if len(subject_children) > 1:
136-
self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child')
136+
self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child')
137137

138138
object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')]
139139
if len(object_children) > 1:

udapi/block/udpipe/base.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Block udpipe.Base for tagging and parsing using UDPipe."""
22
from udapi.core.block import Block
33
from udapi.tool.udpipe import UDPipe
4+
from udapi.tool.udpipeonline import UDPipeOnline
45
from udapi.core.bundle import Bundle
56

67
KNOWN_MODELS = {
@@ -118,11 +119,11 @@ class Base(Block):
118119
"""Base class for all UDPipe blocks."""
119120

120121
# pylint: disable=too-many-arguments
121-
def __init__(self, model=None, model_alias=None,
122+
def __init__(self, model=None, model_alias=None, online=False,
122123
tokenize=True, tag=True, parse=True, resegment=False, **kwargs):
123124
"""Create the udpipe.Base block object."""
124125
super().__init__(**kwargs)
125-
self.model, self.model_alias = model, model_alias
126+
self.model, self.model_alias, self.online = model, model_alias, online
126127
self._tool = None
127128
self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
128129

@@ -134,8 +135,14 @@ def tool(self):
134135
if not self.model:
135136
if not self.model_alias:
136137
raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!')
137-
self.model = KNOWN_MODELS[self.model_alias]
138-
self._tool = UDPipe(model=self.model)
138+
if self.online:
139+
self.model = self.model_alias
140+
else:
141+
self.model = KNOWN_MODELS[self.model_alias]
142+
if self.online:
143+
self._tool = UDPipeOnline(model=self.model)
144+
else:
145+
self._tool = UDPipe(model=self.model)
139146
return self._tool
140147

141148
def process_document(self, doc):

0 commit comments

Comments
 (0)