Commit 8f5c24f

new: Add reach_to_prodigy
1 parent 947a12f commit 8f5c24f

2 files changed: 315 additions & 0 deletions
deep_reference_parser/prodigy/reach_to_prodigy.py
Lines changed: 275 additions & 0 deletions
@@ -0,0 +1,275 @@
#!/usr/bin/env python3
# coding: utf-8

import copy
import itertools

import en_core_web_sm as model
import plac

from ..io import read_jsonl, write_jsonl
from ..logger import logger


class ReachToProdigy:
    """
    Converts json of a scraped reference section into prodigy style json.

    The resulting json can then be loaded into prodigy if required.

    Expects dicts in the following format:

    ```
    {
        ...,
        "sections": {
            "Reference": ["References\n1. Upson. M. (2018) ..."]
        }
    }
    ```

    Returns references in the following format:

    ```
    [{
        'text': ' This is an example with a linebreak\n',
        'meta': {'doc_hash': None, 'provider': None, 'line_number': 3},
        'tokens': [
            {'text': ' ', 'start': 0, 'end': 1, 'id': 0},
            {'text': 'This', 'start': 1, 'end': 5, 'id': 1},
            {'text': 'is', 'start': 6, 'end': 8, 'id': 2},
            {'text': 'an', 'start': 9, 'end': 11, 'id': 3},
            {'text': 'example', 'start': 12, 'end': 19, 'id': 4},
            {'text': 'with', 'start': 20, 'end': 24, 'id': 5},
            {'text': 'a', 'start': 25, 'end': 26, 'id': 6},
            {'text': 'linebreak', 'start': 27, 'end': 36, 'id': 7},
            {'text': '\n', 'start': 36, 'end': 37, 'id': 8}]
        },
        ...
    ]
    ```
    """

    def __init__(self, ref_sections, lines=10, split_char="\n",
                 add_linebreak=True, join_char=" "):
        """
        Args:
            ref_sections(list): List of dicts extracted in the scrape.
            lines(int): Number of lines to combine into one chunk.
            split_char(str): Character to split lines on.
            add_linebreak(bool): Should a linebreak be re-added so that it is
                clear where a break was made?
            join_char(str): Which character will be used to join lines at the
                point at which they are merged into a chunk.
        """

        self.ref_sections = ref_sections
        self.lines = lines
        self.split_char = split_char
        self.add_linebreak = add_linebreak
        self.join_char = join_char

        self.nlp = model.load()

    def run(self):
        """
        Main method of the class.
        """

        prodigy_format = []

        for refs in self.ref_sections:

            one_record = self.one_record_to_prodigy_format(
                refs, self.nlp, self.lines, self.split_char,
                self.add_linebreak, self.join_char)

            # If something is returned (i.e. there is a ref section)
            # then append it to prodigy_format.

            if one_record:
                prodigy_format.append(one_record)

        out = list(itertools.chain.from_iterable(prodigy_format))

        logger.info("Returned %s reference sections", len(out))

        return out

    def one_record_to_prodigy_format(self, input_dict, nlp, lines=10,
                                     split_char="\n", add_linebreak=True,
                                     join_char=" "):
        """
        Convert one dict produced by the scrape to a list of prodigy dicts.

        Args:
            input_dict(dict): One reference section dict from the scrape.
            nlp: A spacy model, for example loaded with spacy.load("en_core_web_sm").
            lines(int): Number of lines to combine into one chunk.
            split_char(str): Character to split lines on.
            add_linebreak(bool): Should a linebreak be re-added so that it is
                clear where a break was made?
            join_char(str): Which character will be used to join lines at the
                point at which they are merged into a chunk.
        """

        out = []

        # Only continue if references are found.

        if input_dict:

            sections = input_dict.get("sections")

            # If there is something in sections, it is keyed by the section
            # keyword, for example "Reference" or "Bibliography".

            if sections:

                # In case there is more than one keyword, cycle through them.

                for _, refs in sections.items():

                    # refs will be a list, so cycle through it in case more
                    # than one section was found with the same keyword.

                    for ref in refs:

                        if ref:

                            refs_lines = self.split_lines(
                                ref, split_char=split_char,
                                add_linebreak=add_linebreak)
                            refs_grouped = self.combine_n_rows(
                                refs_lines, n=lines, join_char=join_char)

                            _meta = {
                                "doc_hash": input_dict.get("file_hash"),
                                "provider": input_dict.get("provider"),
                            }

                            for i, chunk in enumerate(refs_grouped):

                                meta = copy.deepcopy(_meta)
                                meta["line_number"] = i

                                tokens = nlp.tokenizer(chunk)
                                formatted_tokens = [self.format_token(t) for t in tokens]

                                out.append({"text": chunk, "meta": meta,
                                            "tokens": formatted_tokens})

        return out
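For orientation, a minimal sketch of how one record flows through this method. The field values here are hypothetical, and constructing ReachToProdigy loads the spaCy model:

```
rtp = ReachToProdigy(ref_sections=[])  # model.load() runs here

record = {
    "file_hash": "abc123",   # hypothetical hash
    "provider": "example",   # hypothetical provider
    "sections": {"Reference": ["References\n1. Upson. M. (2018) ..."]},
}

examples = rtp.one_record_to_prodigy_format(record, rtp.nlp, lines=10)
# Each example dict carries "text" (a chunk of up to 10 lines), "meta"
# ("doc_hash", "provider", "line_number") and "tokens" (spaCy offsets).
```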
    def format_token(self, token):
        """
        Converts a spaCy token to a dict of the format:

        {"text": "of", "start": 32, "end": 34, "id": 5}
        """
        out = dict()
        out["text"] = token.text
        out["start"] = token.idx
        out["end"] = token.idx + len(token)
        out["id"] = token.i

        return out

    def combine_n_rows(self, doc, n=5, join_char=" "):
        """
        Splits a document into chunks of `n` lines.

        Args:
            doc(list): A document as a list of lines.
            n(int): The number of lines allowed in each chunk.
            join_char(str): The character used to join lines within a chunk.

        Returns:
            list: A list of chunks of `n` lines, plus a final shorter chunk
                if the document does not divide evenly.
        """

        indices = list(range(len(doc)))

        # Split the document into blocks of n lines.

        groups = list(zip(indices[0::n], indices[n::n]))

        # Iterate through each group of n rows, convert all the items
        # to str, and concatenate them into a single string.

        out = [join_char.join([str(j) for j in doc[beg:end]]) for beg, end in groups]

        # Check whether there is a remainder and concatenate it if so.

        max_index = len(groups) * n

        last_group = join_char.join([str(j) for j in doc[max_index:len(doc)]])

        out.append(last_group)

        return out
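A quick illustration of the chunking behaviour, mirroring the tests in this commit and reusing the `rtp` instance from the sketch above:

```
chunks = rtp.combine_n_rows(["a", "b", "c", "d", "e"], n=2, join_char=" ")
# chunks == ["a b", "c d", "e"]; the remainder "e" becomes a final short chunk
```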
    def split_lines(self, doc, split_char="\n", add_linebreak=True):
        """
        Split a document by `split_char`.

        Args:
            doc(str): A document containing references.
            split_char(str): Character by which `doc` will be split.
            add_linebreak(bool): If `True`, re-adds the split character to the
                end of each line that is split.

        Returns:
            (list): List of split lines (str).
        """

        lines = doc.split(split_char)

        if add_linebreak:
            lines = [i + split_char for i in lines]

        return lines
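Note that with `add_linebreak=True` the split character is re-appended to every line, including the last:

```
lines = rtp.split_lines("ref one\nref two", split_char="\n", add_linebreak=True)
# lines == ["ref one\n", "ref two\n"]
```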


@plac.annotations(
    input_file=(
        "Path to jsonl file produced by the scraper and containing reference sections.",
        "positional", None, str),
    output_file=(
        "Path to jsonl file into which prodigy format references will be saved.",
        "positional", None, str),
    lines=(
        "How many lines to include in an annotation example.",
        "option", "l", int),
    split_char=("Which character to split lines on.", "option", "s", str),
    no_linebreak=(
        "Don't re-add linebreaks to the annotation examples after splitting.",
        "flag", "n"),
    join_char=(
        "Which character should be used to join lines into an annotation example.",
        "option", "j", str),
)
def reach_to_prodigy(input_file, output_file, lines=10, split_char="\\n",
                     no_linebreak=False, join_char=" "):

    scraped_json = read_jsonl(input_file)

    logger.info("Loaded %s scraped examples", len(scraped_json))

    add_linebreak = not no_linebreak

    prodigy_format_references = ReachToProdigy(
        scraped_json, lines=lines, split_char=split_char,
        add_linebreak=add_linebreak, join_char=join_char
    )

    references = prodigy_format_references.run()

    write_jsonl(references, output_file=output_file)

    logger.info("Prodigy format written to %s", output_file)
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# coding: utf-8

import pytest

from deep_reference_parser.prodigy.reach_to_prodigy import ReachToProdigy


@pytest.fixture(scope="function")
def stp():
    ref_sections = [{}, {}, {}]
    return ReachToProdigy(ref_sections)


def test_combine_n_rows(stp):

    doc = list(range(100, 200))
    out = stp.combine_n_rows(doc, n=5, join_char=" ")

    last_in_doc = doc[-1]
    last_in_out = int(out[-1].split(" ")[-1])

    # The final item of the input must survive into the final chunk.
    assert last_in_doc == last_in_out

    assert out[0] == '100 101 102 103 104'
    assert out[-2] == '190 191 192 193 194'
    assert out[-1] == '195 196 197 198 199'


def test_combine_n_rows_uneven_split(stp):

    doc = list(range(100, 200))
    out = stp.combine_n_rows(doc, n=7, join_char=" ")

    last_in_doc = doc[-1]
    last_in_out = int(out[-1].split(" ")[-1])

    assert last_in_doc == last_in_out

    # 100 items split into chunks of 7 leaves a remainder of 2.
    assert len(out[-1].split(" ")) == 2
    assert len(out[-2].split(" ")) == 7

    assert out[0] == '100 101 102 103 104 105 106'
    assert out[-2] == '191 192 193 194 195 196 197'
    assert out[-1] == '198 199'
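As a sanity check on the fixture (a sketch, not part of the commit): dicts without a "sections" key are skipped entirely, so running the converter over the fixture's empty dicts returns an empty list.

```
stp = ReachToProdigy([{}, {}, {}])
assert stp.run() == []  # empty records produce no prodigy examples
```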
