Skip to content

Commit effba49

Browse files
committed
Added a script that filters extractions by comparing them against expert review results in a .csv file. The resulting (much smaller) set of extractions is written to a JSON file.
1 parent 963a398 commit effba49

File tree

1 file changed

+161
-0
lines changed

1 file changed

+161
-0
lines changed
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python
2+
#
3+
# Read in MTE extractions (.jsonl) and align with expert-vetting (.csv)
4+
# to filter; write out only those marked 'Y' by expert to new .jsonl.
5+
#
6+
# Author: Kiri Wagstaff
7+
# June 10, 2018
8+
# Copyright notice at bottom of file.
9+
10+
import sys, os
11+
from ioutils import read_jsonlines, dump_jsonlines
12+
import codecs, csv
13+
14+
def read_extractions(extractions):
    """Open an extractions .jsonl file and count the lines it contains.

    Args:
        extractions: Path to the .jsonl extractions file.

    Returns:
        A (docs, ndocs) tuple.  docs is the result of calling
        read_jsonlines() on the file; ndocs is the number of lines in
        the file, used by the caller as the document count.
    """
    # Get the number of lines (docs) to process.
    # Do this in a separate pass because read_jsonlines() returns a
    # generator, whose length is not known without consuming it.
    # Iterating the handle counts lines without holding the whole file
    # in memory, and the with-block closes it (no explicit close needed).
    with open(extractions) as f:
        ndocs = sum(1 for _ in f)

    # Read in the JSON file (contains, among other things, extractions)
    docs = read_jsonlines(extractions)

    return docs, ndocs
27+
28+
29+
# Read in the expert annotations (.csv)
30+
def read_expert(expert):
    """Read every row of the expert-judgment .csv file.

    Args:
        expert: Path to a UTF-8 encoded .csv file with a header row.

    Returns:
        A list with one dict per data row, in file order.  Keys are the
        header names exactly as written in the file (no whitespace is
        stripped from them).
    """
    with codecs.open(expert, 'r', 'UTF-8') as csvfile:
        judgments = list(csv.DictReader(csvfile))

    # Parenthesized single-argument print matches the other functions in
    # this file and behaves identically on Python 2 and Python 3.
    print('Read %d judgments.' % len(judgments))

    return judgments
44+
45+
46+
def query_relation(target, cont, sentence):
    """Ask the user on the console whether a relation should be kept.

    Prints the candidate relation and the sentence it came from, then
    reads one line from standard input.

    Args:
        target: Name of the target in the relation.
        cont: Name of the component the target is said to contain.
        sentence: Sentence the relation was extracted from.

    Returns:
        The line the user typed, without its trailing newline.  No other
        normalization is applied; callers compare it against 'y'/'Y'.
    """
    print('<%s> contains <%s>? [y/n]' % (target, cont))
    print('Sentence: <%s>' % sentence)

    # raw_input() exists only on Python 2; Python 3 renamed it to input().
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    return read_line()
51+
52+
53+
def _group_judgments_by_doc(judgments):
    """Index expert rows by their ' Docid' value, preserving file order."""
    by_doc = {}
    for j in judgments:
        by_doc.setdefault(j[' Docid'], []).append(j)
    return by_doc


def _user_approves(rel):
    """Show one relation to the user; return True on a 'y' or 'Y' answer."""
    res = query_relation(rel['target_names'][0],
                         rel['cont_names'][0],
                         rel['sentence'])
    return res in ('y', 'Y')


def _filter_with_judgments(rels, doc_judgments):
    """Pair relations with expert rows in order and return those to keep."""
    kept = []
    for (r, j) in zip(rels, doc_judgments):
        # The component name is not compared because element names were
        # expanded in the expert file; the sentence is compared instead.
        # NOTE(review): the original comment says target names can be
        # partials, yet the first target name is compared exactly here,
        # so such pairs fall through to manual review.
        if (r['target_names'][0] == j[' Target'] and
                r['sentence'] == j[' Sentence']):
            # Only keep items judged 'Y'
            if j['Judgment'] == 'Y':
                kept.append(r)
        elif _user_approves(r):
            # Mismatch between system output and expert row, so the
            # decision is made by manual review instead.
            kept.append(r)
    return kept


def main(extractions, expert, outfile):
    """Filter extracted relations by expert judgment and save the result.

    Documents without a 'rel' entry in their metadata are skipped.  For
    each remaining document, relations are paired in order with the
    expert rows that have the same docid; a relation is kept when its
    row matches and is judged 'Y'.  When a pair does not match, or the
    document has a different number of relations than expert rows, the
    user is asked interactively.  Only documents that retain at least
    one relation are written out.

    Args:
        extractions: Path to the .jsonl file of extractions.
        expert: Path to the .csv file of expert judgments.
        outfile: Path passed to dump_jsonlines() for the filtered output.

    Exits with status 1 if either input file does not exist.
    """
    # Check arguments
    if not os.path.exists(extractions):
        print('Could not find extractions file %s.' % extractions)
        sys.exit(1)

    if not os.path.exists(expert):
        print('Could not find expert file %s.' % expert)
        sys.exit(1)

    # Read in the JSON file (contains, among other things, extractions)
    docs, ndocs = read_extractions(extractions)
    filtered_docs = []

    # Read in the expert annotations (.csv) and group them by document
    # once, rather than rescanning every row for every document.
    judgments_by_doc = _group_judgments_by_doc(read_expert(expert))

    # Align them. Iterate over the documents, numbering from 1 so the
    # progress messages run from 1/ndocs to ndocs/ndocs.
    n_rels_keep = 0
    n_rels_total = 0
    for (i, d) in enumerate(docs, 1):
        # If there are no relations, omit this document
        if 'rel' not in d['metadata']:
            continue

        docid = d['metadata']['resourceName']
        rels = d['metadata']['rel']
        n_rels_total += len(rels)

        doc_judgments = judgments_by_doc.get(docid, [])

        if len(doc_judgments) == len(rels):
            # Same number in each set, so we can zip them up
            filtered_rels = _filter_with_judgments(rels, doc_judgments)
        else:
            # Different number of relations in expert vs. system output
            # so time for manual review
            print('%d/%d: ****** MANUAL REVIEW MODE (%s) ******' %
                  (i, ndocs, docid))
            filtered_rels = [r for r in rels if _user_approves(r)]

        print('%s (%d/%d): Kept %d/%d relations.' %
              (docid, i, ndocs, len(filtered_rels), len(rels)))

        # Only save this document if it has relations remaining
        if filtered_rels:
            n_rels_keep += len(filtered_rels)
            d['metadata']['rel'] = filtered_rels
            filtered_docs.append(d)

    # Save filtered JSON content to outfile
    dump_jsonlines(filtered_docs, outfile)
    # Blank separator line; print('') does this on Python 2 and 3 alike,
    # whereas a bare `print` is a no-op expression on Python 3.
    print('')
    print('Kept %d/%d relations in %d/%d documents.' %
          (n_rels_keep, n_rels_total,
           len(filtered_docs), ndocs))
136+
137+
138+
if __name__ == '__main__':
    import argparse

    # Three required positional arguments.  Their names match main()'s
    # parameter names, so the parsed namespace can be passed as keywords.
    cli = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    for (arg_name, arg_help) in (
            ('extractions',
             '.jsonl file containing all NER and RE extractions'),
            ('expert',
             '.csv file containing expert judgment of all relations'),
            ('outfile',
             '.jsonl file to store filtered extractions')):
        cli.add_argument(arg_name, help=arg_help)

    main(**vars(cli.parse_args()))
149+
150+
151+
# Copyright 2018, by the California Institute of Technology. ALL
152+
# RIGHTS RESERVED. United States Government Sponsorship
153+
# acknowledged. Any commercial use must be negotiated with the Office
154+
# of Technology Transfer at the California Institute of Technology.
155+
#
156+
# This software may be subject to U.S. export control laws and
157+
# regulations. By accepting this document, the user agrees to comply
158+
# with all applicable U.S. export laws and regulations. User has the
159+
# responsibility to obtain export licenses, or other export authority
160+
# as may be required before exporting such information to foreign
161+
# countries or providing access to foreign persons.

0 commit comments

Comments
 (0)