
Commit 947a12f

new: Add test_reference_to_token_annotations command
1 parent 4c521f5 commit 947a12f

File tree

2 files changed: +523 -0 lines changed
Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
#!/usr/bin/env python3
# coding: utf-8

import itertools

import plac

from ..io import read_jsonl, write_jsonl
from ..logger import logger


class TokenTagger:
    """
    Converts data in prodigy format with full reference spans to per-token spans.

    Expects one of four labels for the spans:

    * BE: A complete reference
    * BI: A fragment of a reference that captures the beginning but not the end
    * IE: A fragment of a reference that captures the end but not the beginning
    * II: A fragment of a reference that captures neither the beginning nor the end.
    """
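
    # Illustrative mapping (an assumed example, not part of the commit): a
    # "BE" span covering tokens 3..5 is split into single-token spans
    # labelled "b-r", "i-r", "e-r"; a "BI" span over the same tokens becomes
    # "b-r", "i-r", "i-r".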

    def __init__(self):
        self.out = []

    def tag_doc(self, doc):
        """
        Tags a document with the appropriate labels.

        Args:
            doc(dict): A single document in prodigy dict format to be labelled.
        """
        bie_spans = self.reference_spans(doc["spans"], doc["tokens"])
        o_spans = self.outside_spans(bie_spans, doc["tokens"])

        # Flatten into one list.
        spans = itertools.chain(bie_spans, o_spans)

        # Sort by token id to ensure it is ordered.
        spans = sorted(spans, key=lambda k: k['token_start'])

        doc["spans"] = spans

        return doc

    def run(self, docs):
        """
        Main method for tagging multiple documents.

        Args:
            docs(list): A list of docs in prodigy dict format to be labelled.
        """
        for doc in docs:
            self.out.append(self.tag_doc(doc))

        return self.out
    def reference_spans(self, spans, tokens):
        """
        Given a whole reference span as labelled in prodigy, break it into
        single-token spans according to the label that was applied to the
        whole reference span.
        """
        split_spans = []

        for span in spans:
            if span["label"] in ["BE", "be"]:
                split_spans.extend(
                    self.split_long_span(tokens, span, "b-r", "e-r")
                )
            elif span["label"] in ["BI", "bi"]:
                split_spans.extend(
                    self.split_long_span(tokens, span, "b-r", "i-r")
                )
            elif span["label"] in ["IE", "ie"]:
                split_spans.extend(
                    self.split_long_span(tokens, span, "i-r", "e-r")
                )
            elif span["label"] in ["II", "ii"]:
                split_spans.extend(
                    self.split_long_span(tokens, span, "i-r", "i-r")
                )

        return split_spans
    def outside_spans(self, spans, tokens):
        """
        Label tokens with `o` if they are outside a reference.

        Args:
            spans(list): Spans in prodigy format.
            tokens(list): Tokens in prodigy format.

        Returns:
            list: A list of spans in prodigy format comprising the tokens
                which are outside of a reference.
        """
        # Get the difference between inside and outside tokens.
        span_indices = set(span["token_start"] for span in spans)
        token_indices = set(token["id"] for token in tokens)

        outside_indices = token_indices - span_indices

        outside_spans = []

        for index in outside_indices:
            outside_spans.append(self.create_span(tokens, index, "o"))

        return outside_spans
    def create_span(self, tokens, index, label):
        """
        Create a single-token span with the given `label` from the token
        selected by `index` in a list of prodigy format tokens.
        """
        # Assumes each token's "id" matches its position in `tokens`.
        token = tokens[index]

        span = {
            "start": token["start"],
            "end": token["end"],
            "token_start": token["id"],
            "token_end": token["id"],
            "label": label,
        }

        return span
    def split_long_span(self, tokens, span, start_label, end_label):
        """
        Split a multi-token span into `n` spans of length `1`, where `n` is
        the number of tokens covered by the span. The first and last tokens
        receive `start_label` and `end_label`; interior tokens are always
        labelled `i-r`.
        """
        spans = []
        spans.append(self.create_span(tokens, span["token_start"], start_label))
        spans.append(self.create_span(tokens, span["token_end"], end_label))

        for index in range(span["token_start"] + 1, span["token_end"]):
            spans.append(self.create_span(tokens, index, "i-r"))

        spans = sorted(spans, key=lambda k: k['token_start'])

        return spans

@plac.annotations(
    input_file=(
        "Path to jsonl file containing chunks of references in prodigy format.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to jsonl file into which fully annotated documents will be saved.",
        "positional",
        None,
        str
    )
)
def reference_to_token_annotations(input_file, output_file):
    """Converts a file output by prodigy (using prodigy db-out) from
    reference-level annotations to token-level annotations. The rationale
    for this is that reference-level annotations are much easier for humans
    to do, but not useful when training a token-level model.

    This function is predominantly useful for tagging reference spans, but
    may also be of use with other reference annotations.
    """
    partially_annotated = read_jsonl(input_file)

    # Only run the tagger on annotated examples.
    partially_annotated = [doc for doc in partially_annotated if doc.get("spans")]

    logger.info("Loaded %s documents with reference annotations", len(partially_annotated))

    annotator = TokenTagger()

    fully_annotated = annotator.run(partially_annotated)

    write_jsonl(fully_annotated, output_file=output_file)

    logger.info("Fully annotated references written to %s", output_file)
