Skip to content

Commit b27a213

Browse files
committed
udpipe.Cs online=1 #uses the Lindat web service
1 parent 16fc5e5 commit b27a213

File tree

3 files changed

+130
-7
lines changed

3 files changed

+130
-7
lines changed

udapi/block/udpipe/base.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Block udpipe.Base for tagging and parsing using UDPipe."""
22
from udapi.core.block import Block
33
from udapi.tool.udpipe import UDPipe
4+
from udapi.tool.udpipeonline import UDPipeOnline
45
from udapi.core.bundle import Bundle
56

67
KNOWN_MODELS = {
@@ -118,11 +119,11 @@ class Base(Block):
118119
"""Base class for all UDPipe blocks."""
119120

120121
# pylint: disable=too-many-arguments
121-
def __init__(self, model=None, model_alias=None,
122+
def __init__(self, model=None, model_alias=None, online=False,
122123
tokenize=True, tag=True, parse=True, resegment=False, **kwargs):
123124
"""Create the udpipe.En block object."""
124125
super().__init__(**kwargs)
125-
self.model, self.model_alias = model, model_alias
126+
self.model, self.model_alias, self.online = model, model_alias, online
126127
self._tool = None
127128
self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
128129

@@ -134,8 +135,14 @@ def tool(self):
134135
if not self.model:
135136
if not self.model_alias:
136137
raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!')
137-
self.model = KNOWN_MODELS[self.model_alias]
138-
self._tool = UDPipe(model=self.model)
138+
if self.online:
139+
self.model = self.model_alias
140+
else:
141+
self.model = KNOWN_MODELS[self.model_alias]
142+
if self.online:
143+
self._tool = UDPipeOnline(model=self.model)
144+
else:
145+
self._tool = UDPipe(model=self.model)
139146
return self._tool
140147

141148
def process_document(self, doc):

udapi/tool/udpipe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ def tag_parse_tree(self, root):
3434
raise IOError("UDPipe error " + self.error.message)
3535
self.conllu_reader.files.filehandle = io.StringIO(out_data)
3636
parsed_root = self.conllu_reader.read_tree()
37-
nodes = [root] + descendants
37+
root.flatten()
3838
for parsed_node in parsed_root.descendants:
39-
node = nodes[parsed_node.ord]
40-
node.parent = nodes[parsed_node.parent.ord]
39+
node = descendants[parsed_node.ord - 1]
40+
node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root
4141
for attr in 'upos xpos lemma feats deprel'.split():
4242
setattr(node, attr, getattr(parsed_node, attr))
4343

udapi/tool/udpipeonline.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""Wrapper for UDPipe online web service."""
2+
import io
3+
import sys
4+
import email.mime.multipart
5+
import email.mime.nonmultipart
6+
import email.policy
7+
import json
8+
import os
9+
import sys
10+
import urllib.error
11+
import urllib.request
12+
13+
from udapi.block.read.conllu import Conllu as ConlluReader
14+
from udapi.core.root import Root
15+
16+
class UDPipeOnline:
    """Wrapper for UDPipe online web service.

    Every request is sent to ``<server>/<method>``; the ``result`` field of
    the JSON answer is returned to the caller or parsed as CoNLL-U.
    """

    def __init__(self, model, server="https://lindat.mff.cuni.cz/services/udpipe/api"):
        """Create the UDPipeOnline tool object.

        Args:
            model: value sent as the ``model`` parameter of the requests.
            server: base URL of the REST API, without a trailing slash.
        """
        self.model = model
        self.server = server

    def list_models(self):
        """Return the names of the models listed by ``<server>/models``."""
        with urllib.request.urlopen(self.server + "/models") as http_response:
            response = json.loads(http_response.read())
        return list(response["models"])

    def perform_request(self, params, method="process"):
        """Call the REST `method` and return the ``result`` field of its answer.

        Args:
            params: dict mapping form-field names to string values. If it is
                empty or None, the request is sent without any body.
            method: name of the REST method, appended to the server URL.

        Returns:
            The value stored under ``"result"`` in the JSON response.

        Raises:
            urllib.error.HTTPError: re-raised after the body of the error
                response has been printed to stderr.
            json.JSONDecodeError: re-raised if the response is not valid JSON.
            ValueError: if the response lacks the ``model`` or ``result`` key.
        """
        if not params:
            request_headers, request_data = {}, None
        else:
            message = email.mime.multipart.MIMEMultipart("form-data", policy=email.policy.HTTP)

            for name, value in params.items():
                payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain")
                payload.add_header("Content-Disposition", "form-data; name=\"{}\"".format(name))
                payload.add_header("Content-Transfer-Encoding", "8bit")
                payload.set_payload(value, charset="utf-8")
                message.attach(payload)

            # Serialize first: the multipart boundary is assigned while the
            # message is being flattened, so the Content-Type header must be
            # read only afterwards. The split drops the MIME headers and keeps
            # the body only.
            request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1]
            request_headers = {"Content-Type": message["Content-Type"]}

        try:
            with urllib.request.urlopen(urllib.request.Request(
                    url=f"{self.server}/{method}", headers=request_headers, data=request_data
            )) as http_response:
                response = json.loads(http_response.read())
        except urllib.error.HTTPError as e:
            # errors="replace": a badly encoded error body must not mask the HTTPError.
            print("An exception was raised during UDPipe '{}' REST request.\n"
                  "The service returned the following error:\n"
                  "  {}".format(method, e.fp.read().decode("utf-8", errors="replace")),
                  file=sys.stderr)
            raise
        except json.JSONDecodeError as e:
            print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
                  "  {}".format(method, e.msg), file=sys.stderr)
            raise

        if "model" not in response or "result" not in response:
            raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method))

        return response["result"]

    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).

        The forms of the nodes are sent space-separated in the "horizontal"
        input format; upos, xpos, lemma, feats, deprel and the parent of each
        node are then copied from the returned tree. Nothing is done (and no
        request is sent) for a tree without nodes.
        """
        descendants = root.descendants
        if not descendants:
            return
        in_data = " ".join(n.form for n in descendants)
        # "model" has to be sent explicitly (as in the other requests of this
        # class), otherwise self.model would not influence the analysis at all.
        params = {"model": self.model, "data": in_data, "input": "horizontal",
                  "tagger": "", "parser": ""}
        out_data = self.perform_request(params=params)
        conllu_reader = ConlluReader()
        conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = conllu_reader.read_tree()
        # NOTE(review): presumably flattening first prevents cycles while the
        # nodes are re-attached one by one -- confirm against Root.flatten().
        root.flatten()
        copied_attrs = ('upos', 'xpos', 'lemma', 'feats', 'deprel')
        for parsed_node in parsed_root.descendants:
            # ord is 1-based, while `descendants` is a 0-based list.
            node = descendants[parsed_node.ord - 1]
            parent_ord = parsed_node.parent.ord
            # ord 0 denotes the technical root, which is not in `descendants`.
            node.parent = descendants[parent_ord - 1] if parent_ord else root
            for attr in copied_attrs:
                setattr(node, attr, getattr(parsed_node, attr))

    def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.

        Returns:
            The list of trees read from the service response; its first item
            is the `root` object passed in, filled with the first tree.

        Raises:
            ValueError: for parse=True with tag=False, or if `root` already
                has children.
        """
        if parse and not tag:
            raise ValueError('Combination parse=True tag=False is not allowed.')
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # Tokenize and possibly segment the input text
        params = {"model": self.model, "data": root.text,
                  "tokenizer": "" if resegment else "presegmented"}
        if tag:
            params["tagger"] = ""
        if parse:
            params["parser"] = ""
        out_data = self.perform_request(params=params)
        conllu_reader = ConlluReader(empty_parent="ignore")
        conllu_reader.files.filehandle = io.StringIO(out_data)
        trees = conllu_reader.read_trees()

        # The input "root" object must be the first item in "trees",
        # so move the content of the first parsed tree into it.
        for attr in ('_children', '_descendants', '_mwts', 'text', 'comment'):
            setattr(root, attr, getattr(trees[0], attr))
        for node in root._children:
            node._parent = root
        for node in root._descendants:
            node._root = root
        trees[0] = root
        return trees

    def segment_text(self, text):
        """Segment the provided text into sentences returned as a Python list."""
        params = {"model": self.model, "data": text, "tokenizer": "",
                  "output": "plaintext=normalized_spaces"}
        return self.perform_request(params=params).rstrip().split("\n")

0 commit comments

Comments
 (0)