Commit cbf9a85

Merge commit: 2 parents, 94b3a13 + 4948103
4 files changed, +206 -12 lines changed

src/parserindexer/brat_ann_indexer.py

Lines changed: 26 additions & 9 deletions
@@ -5,7 +5,7 @@
 from argparse import ArgumentParser
 from indexer import parse_lpsc_from_path
 import re
-from utils import canonical_name
+from utils import canonical_name, canonical_target_name

 # Functions to perform reference removal (assumes [n] reference style)
 # Written by Karanjeet Singh
@@ -53,7 +53,8 @@ def parse_ann_line(self, ann_line):
         parts = ann_line.strip().split('\t')
         res = {
             'annotation_id_s': parts[0],
-            'source': 'brat',
+            #'source': 'brat',
+            'source': 'reviewed',
         }
         if parts[0][0] == 'T':  # anchors (for targets, components, events)
             args = parts[1].split()[1:]
@@ -122,10 +123,11 @@ def extract_excerpt(self, content, ann):
         if m:
             sent_start = sent_start + m.start()
         # End: next period followed by {space,newline}, or end of document.
-        sent_end = anchor_end + content[anchor_end:].find('. ')+1
-        if sent_end <= anchor_end:
-            sent_end = anchor_end + content[anchor_end:].find('.\n')+1
-            if sent_end <= anchor_end:
+        # Better: skip "wt.", "ig." (for Figure), "(e" or ".g"
+        m = re.search('(?<!(wt|ig|\(e|\.g))\.[ \n]', content[anchor_end:])
+        if m != None:
+            sent_end = anchor_end + m.start() + 1
+        else:
             sent_end = len(content)
         return content[sent_start:sent_end]

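Note on the new sentence-boundary regex: a minimal sketch (not part of this
commit; the sample text is made up) of how the lookbehind rejects the dots in
"wt.", "Fig.", and "e.g." so excerpts are no longer truncated early:

    import re

    # Hypothetical excerpt text following an anchor annotation.
    text = 'contains ~5 wt.% olivine (e.g. in Fig. 3). Next sentence.'
    m = re.search(r'(?<!(wt|ig|\(e|\.g))\.[ \n]', text)
    # The dots in "e.g." and "Fig." fail the lookbehind; the first
    # accepted match is the sentence-final period.
    print(text[:m.start() + 1])
    # -> contains ~5 wt.% olivine (e.g. in Fig. 3).
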
@@ -162,13 +164,18 @@ def read_records(self, in_file):
             targets_anns = ch.get('targets_ss', [])
             cont_anns = ch.get('cont_ss', [])
             ch['target_ids_ss'] = list(map(lambda t: index[t]['id'], targets_anns))
+            ch['target_ann_ids_ss'] = list(map(lambda t: index[t]['annotation_id_s'], targets_anns))
             ch['target_names_ss'] = list(map(lambda t: index[t]['name'], targets_anns))
             ch['cont_ids_ss'] = list(map(lambda c: index[c]['id'], cont_anns))
             ch['cont_names_ss'] = list(map(lambda c: index[c]['name'], cont_anns))
             # extract excerpt from anchor annotation
             anc_doc = index[ch['anchor_s']]
             ch['excerpt_t'] = self.extract_excerpt(txt, anc_doc)

+        # Track aliases
+        targets = [a for a in children if a.get('type') == 'target']
+        aliases = [a for a in children if a.get('type') == 'alias']
+
         # Extract references
         references = extract_references(txt)

@@ -187,11 +194,21 @@ def read_records(self, in_file):
         }
         for child in children:
             if 'name' in child:
-                child['can_name'] = canonical_name(child['name'])
+                if child['type'] == 'target':
+                    child['can_name'] = \
+                        canonical_target_name(child['name'],
+                                              child['annotation_id_s'],
+                                              targets, aliases)
+                else:
+                    child['can_name'] = canonical_name(child['name'])
             if 'target_names_ss' in child:
-                child['target_names_ss'] = map(canonical_name, child['target_names_ss'])
+                child['target_names_ss'] = \
+                    [canonical_target_name(t, i, targets, aliases) \
+                     for (t,i) in zip(child['target_names_ss'],
+                                      child['target_ann_ids_ss'])]
             if 'cont_names_ss' in child:
-                child['cont_names_ss'] = map(canonical_name, child['cont_names_ss'])
+                child['cont_names_ss'] = \
+                    [canonical_name(c) for c in child['cont_names_ss']]
             yield child

     def index(self, solr_url, in_file):
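Note on the zip above: carrying target_ann_ids_ss alongside target_names_ss
means each surface name is canonicalized with its own annotation id, so
aliases resolve per mention rather than by text alone. A hypothetical sketch
(field values are made up, not from the commit):

    # Hypothetical child record from read_records.
    child = {'target_names_ss': ['BS', 'Big Sky'],
             'target_ann_ids_ss': ['T2', 'T1']}
    # zip pairs each name with its own annotation id:
    pairs = zip(child['target_names_ss'], child['target_ann_ids_ss'])
    # -> [('BS', 'T2'), ('Big Sky', 'T1')]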
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+#
+# Read in MTE extractions (.jsonl) and align with expert-vetting (.csv)
+# to filter; write out only those marked 'Y' by expert to new .jsonl.
+#
+# Author: Kiri Wagstaff
+# June 10, 2018
+# Copyright notice at bottom of file.
+
+import sys, os
+from ioutils import read_jsonlines, dump_jsonlines
+import codecs, csv
+
+def read_extractions(extractions):
+    # Get the number of lines (docs) to process
+    # Do this before re-opening the file because read_jsonlines()
+    # returns a generator.
+    with open(extractions) as f:
+        l = f.readlines()
+        ndocs = len(l)
+        f.close()
+
+    # Read in the JSON file (contains, among other things, extractions)
+    docs = read_jsonlines(extractions)
+
+    return docs, ndocs
+
+
+# Read in the expert annotations (.csv)
+def read_expert(expert):
+    judgments = []
+    #nrows = 0
+    with codecs.open(expert, 'r', 'UTF-8') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            judgments.append(row)
+            #nrows += 1
+            #if row['Judgment'] == 'Y':
+            #    approved.append(row)
+    #print len(approved), 'of', nrows, 'relations approved.'
+    print 'Read %d judgments.' % len(judgments)
+
+    return judgments
+
+
+def query_relation(target, cont, sentence):
+    print('<%s> contains <%s>? [y/n]' % (target, cont))
+    print('Sentence: <%s>' % sentence)
+
+    return raw_input()
+
+
+def main(extractions, expert, outfile):
+
+    # Check arguments
+    if not os.path.exists(extractions):
+        print('Could not find extractions file %s.' % extractions)
+        sys.exit(1)
+
+    if not os.path.exists(expert):
+        print('Could not find expert file %s.' % expert)
+        sys.exit(1)
+
+    # Read in the JSON file (contains, among other things, extractions)
+    docs, ndocs = read_extractions(extractions)
+    filtered_docs = []
+
+    # Read in the expert annotations (.csv)
+    judgments = read_expert(expert)
+
+    # Align them. Iterate over the documents.
+    n_rels_keep = 0
+    n_rels_total = 0
+    for (i,d) in enumerate(docs):
+        # If there are no relations, omit this document
+        if 'rel' not in d['metadata']:
+            continue
+
+        docid = d['metadata']['resourceName']
+        rels = d['metadata']['rel']
+        n_rels_total += len(rels)
+
+        doc_judgments = [j for j in judgments if j[' Docid'] == docid]
+
+        # Relations to keep
+        filtered_rels = []
+
+        if len(doc_judgments) == len(rels):
+            # Same number in each set, so we can zip them up
+            for (r, j) in zip(rels, doc_judgments):
+                # Can't do exact string match on target_name because
+                # some are partials.
+                # Can't do exact string match on cont_name because
+                # I helpfully expanded element names in the expert file.
+                # Can do match on sentence at least!
+                if (r['target_names'][0] == j[' Target'] and
+                    #r['cont_names'][0] == j[' Component'] and
+                    r['sentence'] == j[' Sentence']):
+                    # Only keep items judged 'Y'
+                    if j['Judgment'] == 'Y':
+                        filtered_rels.append(r)
+                else:
+                    # Mismatch, so drop into manual review mode
+                    res = query_relation(r['target_names'][0],
+                                         r['cont_names'][0],
+                                         r['sentence'])
+                    if res == 'y' or res == 'Y':
+                        filtered_rels.append(r)
+        else:
+            # Different number of relations in expert vs. system output
+            # so time for manual review
+            print('%d/%d: ****** MANUAL REVIEW MODE (%s) ******' % \
+                  (i, ndocs, docid))
+            for r in rels:
+                res = query_relation(r['target_names'][0],
+                                     r['cont_names'][0],
+                                     r['sentence'])
+                if res == 'y' or res == 'Y':
+                    filtered_rels.append(r)
+
+        print('%s (%d/%d): Kept %d/%d relations.' % \
+              (docid, i, ndocs, len(filtered_rels), len(rels)))
+
+        # Only save this document if it has relations remaining
+        if len(filtered_rels) > 0:
+            n_rels_keep += len(filtered_rels)
+            d['metadata']['rel'] = filtered_rels
+            filtered_docs.append(d)
+
+    # Save filtered JSON content to outfile
+    dump_jsonlines(filtered_docs, outfile)
+    print
+    print('Kept %d/%d relations in %d/%d documents.' % \
+          (n_rels_keep, n_rels_total,
+           len(filtered_docs), ndocs))
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
+
+    parser.add_argument('extractions', help='.jsonl file containing all NER and RE extractions')
+    parser.add_argument('expert', help='.csv file containing expert judgment of all relations')
+    parser.add_argument('outfile', help='.jsonl file to store filtered extractions')
+
+    args = parser.parse_args()
+
+    main(**vars(args))
+
+
+# Copyright 2018, by the California Institute of Technology. ALL
+# RIGHTS RESERVED. United States Government Sponsorship
+# acknowledged. Any commercial use must be negotiated with the Office
+# of Technology Transfer at the California Institute of Technology.
+#
+# This software may be subject to U.S. export control laws and
+# regulations. By accepting this document, the user agrees to comply
+# with all applicable U.S. export laws and regulations. User has the
+# responsibility to obtain export licenses, or other export authority
+# as may be required before exporting such information to foreign
+# countries or providing access to foreign persons.
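
Note on the judgment keys used above: csv.DictReader takes field names
verbatim from the .csv header line, and every column except 'Judgment'
carries a leading space. A hypothetical row as the script sees it (all
values are made up):

    row = {'Judgment': 'Y',
           ' Docid': '1234.pdf',   # matched against metadata['resourceName']
           ' Target': 'Windjana',
           ' Component': 'K',
           ' Sentence': 'Windjana is enriched in K.'}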

src/parserindexer/json2brat.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 # Kiri Wagstaff
 # July 31, 2017

-import sys, os, shutil
+import sys, os, shutil, io
 import json
 from ioutils import read_jsonlines

@@ -31,7 +31,7 @@ def convert_json_to_brat(jsonfile, outdir):
         ners = d['metadata']['ner']
         outfn = os.path.join(outdir,
                              d['metadata']['resourceName'][:-4] + '.ann')
-        outf = open(outfn, 'w')
+        outf = io.open(outfn, 'w', encoding='utf8')
         print 'Writing to', outfn
         for (t, n) in enumerate(ners):
             outf.write('T%d\t%s %s %s\t%s\n' % \
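
Note on the io.open change: under Python 2 (which this module targets, given
its print statements), the builtin open() returns a byte stream, so writing a
unicode annotation containing non-ASCII text raises UnicodeEncodeError;
io.open encodes transparently. A minimal sketch, not from the commit, with a
made-up annotation line:

    import io

    ann = u'T1\tTarget 0 11\tGale\u2013Crater\n'  # hypothetical .ann line
    with io.open('example.ann', 'w', encoding='utf8') as outf:
        outf.write(ann)  # plain open() would raise on the en dash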

src/parserindexer/utils.py

Lines changed: 17 additions & 1 deletion
@@ -135,11 +135,27 @@ def canonical_name(name):
     return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')


-def canonical_target_name(name):
+def canonical_target_name(name, id, targets, aliases):
     """
     Gets canonical target name
     :param name - name whose canonical name is to be looked up
     :return canonical name
     """
     name = name.strip()
+    # Look up 'name' in the aliases; if found, replace with its antecedent
+    # Note: this is super permissive. Exact match on id is safe,
+    # but we're also allowing any exact-text match with any other
+    # known target name.
+    all_targets = [t['annotation_id_s'] for t in targets
+                   if t['name'] == name]
+    name_aliases = [a['arg2_s'] for a in aliases
+                    if ((a['arg1_s'] == id) or
+                        (a['arg1_s'] in all_targets))]
+    if len(name_aliases) > 0:
+        # Ideally there is only one; let's use the first one
+        can_name = [t['name'] for t in targets \
+                    if t['annotation_id_s'] == name_aliases[0]]
+        print('Mapping <%s> to <%s>' % (name, can_name[0]))
+        name = can_name[0]
+
     return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')
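
A hypothetical walk-through of the alias resolution above (the target names
and ids are made up, not from the commit):

    targets = [{'annotation_id_s': 'T1', 'name': 'Big Sky'},
               {'annotation_id_s': 'T2', 'name': 'BS'}]
    # Alias relation as the comprehensions read it:
    # arg1_s is the aliased mention; arg2_s is its antecedent's id.
    aliases = [{'arg1_s': 'T2', 'arg2_s': 'T1'}]

    canonical_target_name('BS', 'T2', targets, aliases)
    # prints:  Mapping <BS> to <Big Sky>
    # returns: 'Big_Sky'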
