Skip to content

Commit 0d3b7ee

Browse files
new: Add numbered_reference_annotator
1 parent f7a6e9a commit 0d3b7ee

File tree

2 files changed

+359
-0
lines changed

2 files changed

+359
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# coding: utf-8
2+
#!/usr/bin/env python3
3+
4+
import re
5+
6+
import plac
7+
8+
from ..io import read_jsonl, write_jsonl
9+
from ..logger import logger
10+
11+
# Pattern that marks the label at the start of a numbered reference: one or
# two newlines, an optional whitespace character or opening bracket, a one or
# two digit number, a closing delimiter (".)", ".]", "]\n", ".", whitespace,
# "]" or ")"), then optional trailing whitespace. The negative lookahead
# rejects a match that is immediately followed by a month abbreviation.
REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"


class NumberedReferenceAnnotator:
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Note that you must identify numbered reference section first. This can be
    done with a simple textcat model trained in prodigy.
    """

    def __init__(self):
        # Default to the module-level pattern so label_numbered_references()
        # gives sensible output even when it is called without run(); an empty
        # pattern would match at every character offset.
        self.regex = REGEX

    def run(self, docs, regex=REGEX):
        """
        Annotate each doc with reference spans and yield it.

        Args:
            docs: An iterable of prodigy-style dicts with "text" and "tokens"
                keys.
            regex: Pattern used to find the reference labels. It is stored on
                the instance and used for every doc.

        Yields:
            The same doc objects, with a "spans" key set in place.
        """
        self.regex = regex

        for doc in docs:
            doc["spans"] = self.label_numbered_references(
                doc["text"], doc["tokens"]
            )

            yield doc

    def label_numbered_references(self, text, tokens):
        """
        Build one span for each stretch of text between two consecutive
        reference labels found by `self.regex`.

        Args:
            text(str): The text of the reference section.
            tokens: A list of token dicts with "start", "end" and "id" keys.

        Returns:
            list: Span dicts with "start", "end", "token_start", "token_end"
                and "label" keys. Empty when fewer than two labels are found
                or when there are no tokens to snap the offsets to.
        """
        # Search for number reference using regex
        splits = list(re.finditer(self.regex, text))

        if len(splits) < 2 or not tokens:
            return []

        # These lookups do not depend on the reference being processed, so
        # build them once per document rather than once per span.
        start_map = self._token_start_mapper(tokens, "start")
        end_map = self._token_start_mapper(tokens, "end")

        # Same semantics as _get_token_offset: when token ids are duplicated
        # the first token carrying the id wins.
        token_by_id = {}
        for token in tokens:
            token_by_id.setdefault(token["id"], token)

        spans = []

        for label, next_label in zip(splits, splits[1:]):
            # The reference runs from the end of its own label to the start
            # of the following one. Snap both character offsets to the
            # closest token boundary.
            token_start = start_map[
                self._find_closest_number(start_map, label.end())
            ]
            token_end = end_map[
                self._find_closest_number(end_map, next_label.start())
            ]

            # To avoid mismatches between the character offset and the token
            # offset, take the character offsets from the tokens themselves.
            spans.append(
                {
                    "start": token_by_id[token_start]["start"],
                    "end": token_by_id[token_end]["end"],
                    "token_start": token_start,
                    "token_end": token_end,
                    "label": "BE",
                }
            )

        return spans

    def _find_closest_token(self, tokens, char_offset, pos_string):
        """
        Find the id of the token whose start/end is closest to `char_offset`.

        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start
                or the end of a token.
            pos_string(str): One of ["start", "end"] denoting whether
                `char_offset` is a start or the end of a token.

        Returns:
            The "id" of the closest token.
        """
        token_map = self._token_start_mapper(tokens, pos_string)
        token_key = self._find_closest_number(token_map.keys(), char_offset)

        return token_map[token_key]

    def _get_token_offset(self, tokens, token_id, pos_string):
        """
        Return the character offset for the token with id == token_id, or
        None when no token has that id. The first matching token is used.
        """
        token_match = (
            token[pos_string] for token in tokens if token["id"] == token_id
        )

        return next(token_match, None)

    def _find_closest_number(self, numbers, number):
        """
        Find the closest match in an iterable of numbers when presented with
        a number. On a tie the first candidate in iteration order is returned.
        """
        return min(numbers, key=lambda x: abs(x - number))

    def _token_start_mapper(self, tokens, pos_string):
        """
        Map token id by the token start/end position. If two tokens share a
        position, the later token's id is kept.
        """
        return {token[pos_string]: token["id"] for token in tokens}
117+
118+
119+
@plac.annotations(
    input_file=(
        "Path to jsonl file containing numbered reference sections as docs.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to output jsonl file containing prodigy docs with numbered references labelled.",
        "positional",
        None,
        str
    )
)
def annotate_numbered_references(input_file, output_file):
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Args:
        input_file(str): Path to the jsonl file of numbered reference
            sections to read.
        output_file(str): Path of the jsonl file the annotated docs are
            written to.
    """

    numbered_reference_sections = read_jsonl(input_file)

    logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

    nra = NumberedReferenceAnnotator()

    # run() is a generator method: it has to be called (not subscripted) and
    # consumed before the annotated docs exist.
    docs = list(nra.run(numbered_reference_sections))

    # NOTE(review): assumes write_jsonl takes the data first and the target
    # path as `output_file` -- confirm against the io module.
    write_jsonl(docs, output_file=output_file)

    logger.info("Wrote %s annotated references to %s", len(docs),
        output_file)
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
#!/usr/bin/env python3
2+
# coding: utf-8
3+
4+
import pytest
5+
import spacy
6+
from deep_reference_parser.prodigy.numbered_reference_annotator import NumberedReferenceAnnotator
7+
8+
@pytest.fixture(scope="function")
def nra():
    """Provide a fresh NumberedReferenceAnnotator to each test function."""
    annotator = NumberedReferenceAnnotator()
    return annotator
11+
12+
13+
def test_numbered_reference_splitter(nra):
    """
    Test that a section whose labels are preceded by a single line ending is
    split into one span per reference.
    """

    numbered_reference = {
        "text": "References\n 1. \n Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168. \n 2. \n WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126. \n 3. \n Consolidated guidelines on the use of antiretroviral drugs for treating and preventing HIV infection: \n recommendations for a public health approach. Geneva: World Health Organization; 2013:272. \n 4.",
        "tokens": [
            {"text": "References", "start": 0, "end": 10, "id": 0},
            {"text": "\n ", "start": 10, "end": 12, "id": 1},
            {"text": "1", "start": 12, "end": 13, "id": 2},
            {"text": ".", "start": 13, "end": 14, "id": 3},
            {"text": "\n ", "start": 15, "end": 17, "id": 4},
            {"text": "Global", "start": 17, "end": 23, "id": 5},
            {"text": "update", "start": 24, "end": 30, "id": 6},
            {"text": "on", "start": 31, "end": 33, "id": 7},
            {"text": "the", "start": 34, "end": 37, "id": 8},
            {"text": "health", "start": 38, "end": 44, "id": 9},
            {"text": "sector", "start": 45, "end": 51, "id": 10},
            {"text": "response", "start": 52, "end": 60, "id": 11},
            {"text": "to", "start": 61, "end": 63, "id": 12},
            {"text": "HIV", "start": 64, "end": 67, "id": 13},
            {"text": ",", "start": 67, "end": 68, "id": 14},
            {"text": "2014", "start": 69, "end": 73, "id": 15},
            {"text": ".", "start": 73, "end": 74, "id": 16},
            {"text": "Geneva", "start": 75, "end": 81, "id": 17},
            {"text": ":", "start": 81, "end": 82, "id": 18},
            {"text": "World", "start": 83, "end": 88, "id": 19},
            {"text": "Health", "start": 89, "end": 95, "id": 20},
            {"text": "Organization", "start": 96, "end": 108, "id": 21},
            {"text": ";", "start": 108, "end": 109, "id": 22},
            {"text": "\n ", "start": 110, "end": 112, "id": 23},
            {"text": "2014:168", "start": 112, "end": 120, "id": 24},
            {"text": ".", "start": 120, "end": 121, "id": 25},
            {"text": "\n ", "start": 122, "end": 124, "id": 26},
            {"text": "2", "start": 124, "end": 125, "id": 27},
            {"text": ".", "start": 125, "end": 126, "id": 28},
            {"text": "\n ", "start": 127, "end": 129, "id": 29},
            {"text": "WHO", "start": 129, "end": 132, "id": 30},
            {"text": ",", "start": 132, "end": 133, "id": 31},
            {"text": "UNICEF", "start": 134, "end": 140, "id": 32},
            {"text": ",", "start": 140, "end": 141, "id": 33},
            {"text": "UNAIDS", "start": 142, "end": 148, "id": 34},
            {"text": ".", "start": 148, "end": 149, "id": 35},
            {"text": "Global", "start": 150, "end": 156, "id": 36},
            {"text": "update", "start": 157, "end": 163, "id": 37},
            {"text": "on", "start": 164, "end": 166, "id": 38},
            {"text": "HIV", "start": 167, "end": 170, "id": 39},
            {"text": "treatment", "start": 171, "end": 180, "id": 40},
            {"text": "2013", "start": 181, "end": 185, "id": 41},
            {"text": ":", "start": 185, "end": 186, "id": 42},
            {"text": "results", "start": 187, "end": 194, "id": 43},
            {"text": ",", "start": 194, "end": 195, "id": 44},
            {"text": "impact", "start": 196, "end": 202, "id": 45},
            {"text": "and", "start": 203, "end": 206, "id": 46},
            {"text": "\n ", "start": 207, "end": 209, "id": 47},
            {"text": "opportunities", "start": 209, "end": 222, "id": 48},
            {"text": ".", "start": 222, "end": 223, "id": 49},
            {"text": "Geneva", "start": 224, "end": 230, "id": 50},
            {"text": ":", "start": 230, "end": 231, "id": 51},
            {"text": "World", "start": 232, "end": 237, "id": 52},
            {"text": "Health", "start": 238, "end": 244, "id": 53},
            {"text": "Organization", "start": 245, "end": 257, "id": 54},
            {"text": ";", "start": 257, "end": 258, "id": 55},
            {"text": "2013:126", "start": 259, "end": 267, "id": 56},
            {"text": ".", "start": 267, "end": 268, "id": 57},
            {"text": "\n ", "start": 269, "end": 271, "id": 58},
            {"text": "3", "start": 271, "end": 272, "id": 59},
            {"text": ".", "start": 272, "end": 273, "id": 60},
            {"text": "\n ", "start": 274, "end": 276, "id": 61},
            {"text": "Consolidated", "start": 276, "end": 288, "id": 62},
            {"text": "guidelines", "start": 289, "end": 299, "id": 63},
            {"text": "on", "start": 300, "end": 302, "id": 64},
            {"text": "the", "start": 303, "end": 306, "id": 65},
            {"text": "use", "start": 307, "end": 310, "id": 66},
            {"text": "of", "start": 311, "end": 313, "id": 67},
            {"text": "antiretroviral", "start": 314, "end": 328, "id": 68},
            {"text": "drugs", "start": 329, "end": 334, "id": 69},
            {"text": "for", "start": 335, "end": 338, "id": 70},
            {"text": "treating", "start": 339, "end": 347, "id": 71},
            {"text": "and", "start": 348, "end": 351, "id": 72},
            {"text": "preventing", "start": 352, "end": 362, "id": 73},
            {"text": "HIV", "start": 363, "end": 366, "id": 74},
            {"text": "infection", "start": 367, "end": 376, "id": 75},
            {"text": ":", "start": 376, "end": 377, "id": 76},
            {"text": "\n ", "start": 378, "end": 380, "id": 77},
            {"text": "recommendations", "start": 380, "end": 395, "id": 78},
            {"text": "for", "start": 396, "end": 399, "id": 79},
            {"text": "a", "start": 400, "end": 401, "id": 80},
            {"text": "public", "start": 402, "end": 408, "id": 81},
            {"text": "health", "start": 409, "end": 415, "id": 82},
            {"text": "approach", "start": 416, "end": 424, "id": 83},
            {"text": ".", "start": 424, "end": 425, "id": 84},
            {"text": "Geneva", "start": 426, "end": 432, "id": 85},
            {"text": ":", "start": 432, "end": 433, "id": 86},
            {"text": "World", "start": 434, "end": 439, "id": 87},
            {"text": "Health", "start": 440, "end": 446, "id": 88},
            {"text": "Organization", "start": 447, "end": 459, "id": 89},
            {"text": ";", "start": 459, "end": 460, "id": 90},
            {"text": "2013:272", "start": 461, "end": 469, "id": 91},
            {"text": ".", "start": 469, "end": 470, "id": 92},
            # The trailing tokens follow the text (" \n 4.") and carry unique
            # ids, matching the tokenisation used for the earlier labels.
            {"text": "\n ", "start": 471, "end": 473, "id": 93},
            {"text": "4", "start": 473, "end": 474, "id": 94},
            {"text": ".", "start": 474, "end": 475, "id": 95},
        ]
    }

    docs = list(nra.run([numbered_reference]))
    text = docs[0]["text"]
    spans = docs[0]["spans"]

    # Check the count first so a wrong number of spans fails on the assertion
    # rather than with an IndexError below.
    assert len(spans) == 3

    ref_1 = text[spans[0]["start"]:spans[0]["end"]]
    ref_2 = text[spans[1]["start"]:spans[1]["end"]]
    ref_3 = text[spans[2]["start"]:spans[2]["end"]]

    assert ref_1 == "Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168."
    assert ref_2.strip() == "WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126."
    assert ref_3.strip() == "Consolidated guidelines on the use of antiretroviral drugs for treating and preventing HIV infection: \n recommendations for a public health approach. Geneva: World Health Organization; 2013:272."
128+
129+
def test_numbered_reference_splitter_line_endings(nra):
    """
    Test the case where there are two line endings immediately preceding the
    reference index.
    """

    numbered_reference = {
        "text": "References\n\n1. \n Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168. \n\n2. \n WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126.\n\n3.",
        "tokens": [
            {"text": "References", "start": 0, "end": 10, "id": 0},
            {"text": "\n\n", "start": 10, "end": 12, "id": 1},
            {"text": "1", "start": 12, "end": 13, "id": 2},
            {"text": ".", "start": 13, "end": 14, "id": 3},
            {"text": "\n ", "start": 15, "end": 17, "id": 4},
            {"text": "Global", "start": 17, "end": 23, "id": 5},
            {"text": "update", "start": 24, "end": 30, "id": 6},
            {"text": "on", "start": 31, "end": 33, "id": 7},
            {"text": "the", "start": 34, "end": 37, "id": 8},
            {"text": "health", "start": 38, "end": 44, "id": 9},
            {"text": "sector", "start": 45, "end": 51, "id": 10},
            {"text": "response", "start": 52, "end": 60, "id": 11},
            {"text": "to", "start": 61, "end": 63, "id": 12},
            {"text": "HIV", "start": 64, "end": 67, "id": 13},
            {"text": ",", "start": 67, "end": 68, "id": 14},
            {"text": "2014", "start": 69, "end": 73, "id": 15},
            {"text": ".", "start": 73, "end": 74, "id": 16},
            {"text": "Geneva", "start": 75, "end": 81, "id": 17},
            {"text": ":", "start": 81, "end": 82, "id": 18},
            {"text": "World", "start": 83, "end": 88, "id": 19},
            {"text": "Health", "start": 89, "end": 95, "id": 20},
            {"text": "Organization", "start": 96, "end": 108, "id": 21},
            {"text": ";", "start": 108, "end": 109, "id": 22},
            {"text": "\n ", "start": 110, "end": 112, "id": 23},
            {"text": "2014:168", "start": 112, "end": 120, "id": 24},
            {"text": ".", "start": 120, "end": 121, "id": 25},
            {"text": "\n\n", "start": 122, "end": 124, "id": 26},
            {"text": "2", "start": 124, "end": 125, "id": 27},
            {"text": ".", "start": 125, "end": 126, "id": 28},
            {"text": "\n ", "start": 127, "end": 129, "id": 29},
            {"text": "WHO", "start": 129, "end": 132, "id": 30},
            {"text": ",", "start": 132, "end": 133, "id": 31},
            {"text": "UNICEF", "start": 134, "end": 140, "id": 32},
            {"text": ",", "start": 140, "end": 141, "id": 33},
            {"text": "UNAIDS", "start": 142, "end": 148, "id": 34},
            {"text": ".", "start": 148, "end": 149, "id": 35},
            {"text": "Global", "start": 150, "end": 156, "id": 36},
            {"text": "update", "start": 157, "end": 163, "id": 37},
            {"text": "on", "start": 164, "end": 166, "id": 38},
            {"text": "HIV", "start": 167, "end": 170, "id": 39},
            {"text": "treatment", "start": 171, "end": 180, "id": 40},
            {"text": "2013", "start": 181, "end": 185, "id": 41},
            {"text": ":", "start": 185, "end": 186, "id": 42},
            {"text": "results", "start": 187, "end": 194, "id": 43},
            {"text": ",", "start": 194, "end": 195, "id": 44},
            {"text": "impact", "start": 196, "end": 202, "id": 45},
            {"text": "and", "start": 203, "end": 206, "id": 46},
            {"text": "\n ", "start": 207, "end": 209, "id": 47},
            {"text": "opportunities", "start": 209, "end": 222, "id": 48},
            {"text": ".", "start": 222, "end": 223, "id": 49},
            {"text": "Geneva", "start": 224, "end": 230, "id": 50},
            {"text": ":", "start": 230, "end": 231, "id": 51},
            {"text": "World", "start": 232, "end": 237, "id": 52},
            {"text": "Health", "start": 238, "end": 244, "id": 53},
            {"text": "Organization", "start": 245, "end": 257, "id": 54},
            {"text": ";", "start": 257, "end": 258, "id": 55},
            {"text": "2013:126", "start": 259, "end": 267, "id": 56},
            # These offsets continue from the end of "2013:126" so that the
            # tokens do not overlap and agree with the text above.
            {"text": ".", "start": 267, "end": 268, "id": 57},
            {"text": "\n\n", "start": 268, "end": 270, "id": 58},
            {"text": "3", "start": 270, "end": 271, "id": 59},
            {"text": ".", "start": 271, "end": 272, "id": 60},
        ]
    }

    docs = list(nra.run([numbered_reference]))
    text = docs[0]["text"]
    spans = docs[0]["spans"]

    # Check the count first so a wrong number of spans fails on the assertion
    # rather than with an IndexError below.
    assert len(spans) == 2

    ref_1 = text[spans[0]["start"]:spans[0]["end"]]
    ref_2 = text[spans[1]["start"]:spans[1]["end"]]

    assert ref_1.strip() == "Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168."
    # With consistent token offsets the closing full stop belongs to the
    # second reference.
    assert ref_2.strip() == "WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126."

0 commit comments

Comments
 (0)