Commit f7a6e9a

new: Add prodigy to tsv command
1 parent 8f5c24f commit f7a6e9a

File tree

2 files changed: +514 -0 lines changed
Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
#!/usr/bin/env python3
# coding: utf-8

"""
Class used in scripts/prodigy_to_tsv.py, which converts token-annotated jsonl
files to tab-separated-values files for use in the deep reference parser
"""

import csv
import re

import numpy as np
import plac

from ..io import read_jsonl
from ..logger import logger


class TokenLabelPairs:
    """
    Convert prodigy format docs or list of lists into tuples of (token, label).
    """

    def __init__(self, line_limit=73, respect_line_endings=True,
                 respect_doc_endings=True):
        """
        Args:
            line_limit(int): Maximum number of tokens allowed per training
                example. If you are planning to use this data for making
                predictions, then this should correspond to the max_words
                attribute of the DeepReferenceParser class used to train the
                model.
            respect_line_endings(bool): If True, line endings appearing in the
                text will be respected, leading to much shorter line lengths,
                usually <10. Typically this results in a much worse performing
                model, but follows the convention set by Rodrigues et al.
            respect_doc_endings(bool): If True, a line ending is added at the
                end of each document. If False, the end of a document flows
                into the beginning of the next document.
        """

        self.line_count = 0
        self.line_lengths = []
        self.line_limit = line_limit
        self.respect_doc_endings = respect_doc_endings
        self.respect_line_endings = respect_line_endings
    def run(self, docs):
        """
        Convert a list of prodigy docs into a flat list of (token, label)
        pairs, with (None, None) pairs marking line and document endings.
        """

        out = []

        for doc in docs:
            out.extend(self.yield_token_label_pair(doc))

        self.stats(out)

        return out

    def stats(self, out):

        avg_line_len = np.round(np.mean(self.line_lengths), 2)

        logger.debug("Returning %s examples", self.line_count)
        logger.debug("Average line length: %s", avg_line_len)
def yield_token_label_pair(self, doc, lists=False):
70+
"""
71+
Expect list of jsons loaded from a jsonl
72+
73+
Args:
74+
doc (dict): Document in prodigy format or list of lists
75+
lists (bool): Expect a list of lists rather than a prodigy format
76+
dict?
77+
78+
NOTE: Makes the assumption that every token has been labelled in spans. This
79+
assumption will be true if the data has been labelled with prodigy, then
80+
spans covering entire references have been converted to token spans. OR that
81+
there are no spans at all, and this is being used to prepare data for
82+
prediction.
83+
"""
84+
85+
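        # For illustration, the doc shape assumed here (inferred from the
        # access patterns below, not from prodigy's documentation) is e.g.:
        #
        #   {"tokens": [{"text": "Smith", "id": 0}, ...],
        #    "spans": [{"token_start": 0, "label": "author"}, ...]}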
        # Ensure that spans and tokens are sorted (they should be)

        if lists:
            tokens = doc
        else:
            tokens = sorted(doc["tokens"], key=lambda k: k["id"])

        # For prediction, documents may not yet have spans. If they do, sort
        # them too based on token_start, which is equivalent to id in
        # doc["tokens"].

        spans = doc.get("spans")

        if spans:
            spans = sorted(doc["spans"], key=lambda k: k["token_start"])

        # Set a token counter that is used to limit the number of tokens to
        # line_limit.

        token_counter = 0

        doc_len = len(tokens)

        for i, token in enumerate(tokens, 1):

            label = None

            # For the case when tokens have been labelled with spans (for
            # training data).

            if spans:
                # Need to subtract one from the index as enumerate starts at 1!
                label = spans[i - 1].get("label")

            text = token["text"]

            # If the token is empty, even if it has been labelled, pass it

            if text == "":

                pass

            # If the token is a newline (and possibly other characters) and we
            # want to respect line endings in the text, then yield a
            # (None, None) tuple, which will be converted to a blank line when
            # the resulting tsv file is read.

            elif re.search(r"\n", text) and self.respect_line_endings:

                # Is it blank after whitespace is removed?

                if text.strip() == "":

                    yield (None, None)

                    self.line_lengths.append(token_counter)
                    self.line_count += 1

                    token_counter = 0

            elif token_counter == self.line_limit:

                # Yield (None, None) to signify a line ending, then yield the
                # next token.

                yield (None, None)
                yield (text.strip(), label)

                self.line_lengths.append(token_counter)
                self.line_count += 1

                # Set to one to account for the first token having been added.

                token_counter = 1

            elif i == doc_len and self.respect_doc_endings:

                # Case when the end of the document has been reached, but the
                # line is shorter than self.line_limit. This assumes that we
                # want to retain a line ending which denotes the end of a
                # document and the start of a new one.

                yield (text.strip(), label)
                yield (None, None)

                self.line_lengths.append(token_counter)
                self.line_count += 1

            else:

                # Yield the stripped token with its label.

                yield (text.strip(), label)

                token_counter += 1

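# A minimal usage sketch (the doc below is hypothetical, shaped to match the
# access patterns in yield_token_label_pair):
#
#   tlp = TokenLabelPairs()
#   doc = {
#       "tokens": [{"text": "Smith", "id": 0}, {"text": "\n", "id": 1}],
#       "spans": [
#           {"token_start": 0, "label": "author"},
#           {"token_start": 1, "label": "author"},
#       ],
#   }
#   tlp.run([doc])
#   # -> [("Smith", "author"), (None, None)]
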
@plac.annotations(
    input_file=(
        "Path to jsonl file containing prodigy docs.",
        "positional",
        None,
        str,
    ),
    output_file=(
        "Path to output tsv file.",
        "positional",
        None,
        str,
    ),
)
def prodigy_to_tsv(input_file, output_file):
    """
    Convert token-annotated jsonl to token-annotated tsv, ready for use in the
    Rodrigues model.
    """

    annotated_data = read_jsonl(input_file)

    logger.info("Loaded %s prodigy docs", len(annotated_data))

    tlp = TokenLabelPairs()
    token_label_pairs = list(tlp.run(annotated_data))

    with open(output_file, "w") as fb:
        writer = csv.writer(fb, delimiter="\t")
        # Write DOCSTART and a blank line
        writer.writerows([("DOCSTART", None), (None, None)])
        writer.writerows(token_label_pairs)

    logger.info("Wrote %s token/label pairs to %s", len(token_label_pairs),
                output_file)
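
Note: csv.writer serialises None as an empty field, so each (None, None) pair
becomes an empty row in the output tsv, and the file opens with a DOCSTART row
and one empty row before the token/label pairs. With two hypothetical tokens
"Smith" and "," both labelled author, the tab-delimited output would begin:

    DOCSTART

    Smith	author
    ,	author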

0 commit comments