Commit 3ae2b05

Added script to convert parsed JSON output (relations) to a .csv file for easy manual review.
1 parent 74c109d commit 3ae2b05
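
For reference, a minimal sketch of the kind of JSON-lines record the script consumes (field names are taken from the code below; the document id, character offsets, and target/component names are hypothetical), together with the CSV row it would produce:

{"metadata": {"resourceName": "doc1.pdf",
              "rel": [{"target_ids": ["doc1_25_29"],
                       "target_names": ["Gale"],
                       "cont_names": ["hematite"],
                       "sentence": "Hematite was detected at Gale."}],
              "ner": [{"text": "Gale", "begin": 25, "end": 29}]}}

,doc1.pdf,Gale,hematite,"Hematite was detected at Gale."

(The leading empty field is the Judgment column, left blank for the reviewer.)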

File tree

1 file changed: +108 -0 lines changed


src/parserindexer/json2csv.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
#!/usr/bin/env python
# json2csv.py
# Convert JSON output (from parse_all.py/CoreNLP) to .csv format
# to enable manual review of extracted relations.
#
# Kiri Wagstaff
# August 28, 2017

import sys, os
from ioutils import read_jsonlines
from progressbar import ProgressBar, ETA, Bar, Percentage

def usage():
    print './json2csv.py <JSON file>'
    sys.exit(1)


def convert_json_to_csv(jsonfile):
    # Get the number of lines (docs) to process.
    # Do this before re-opening the file because read_jsonlines()
    # returns a generator.
    with open(jsonfile) as f:
        ndocs = len(f.readlines())

    # Read in the JSON file
    docs = read_jsonlines(jsonfile)

    # Open the output CSV file
    outfn = jsonfile[:jsonfile.rfind('.')] + '.csv'
    outf = open(outfn, 'w')
    print 'Writing to', outfn
    # Header
    outf.write('# Judgment, Docid, Target, Component, Sentence, URL\n')
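    # Note: the Judgment column is left blank for the reviewer to fill in,
    # and the URL column is not yet written (see the comment by the
    # row-writing code below).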

    widgets = ['Docs (of %d): ' % ndocs, Percentage(), ' ',
               Bar('='), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=ndocs).start()
    # Iterate over documents
    i = 1
    for d in docs:
        if 'rel' not in d['metadata']:
            continue

        docid = d['metadata']['resourceName']

        # Output relations into the .csv file
        rels = d['metadata']['rel']
        ners = d['metadata']['ner']
        skip_inds = []
        for (t, r) in enumerate(rels):
            # Special merging step for adjacent Target tokens:
            # if this matches a multi-word NER,
            # expand the target name and skip the next relation.
            start_target = int(r['target_ids'][0].split('_')[1])
            end_target = int(r['target_ids'][0].split('_')[2])
            targ_name = r['target_names'][0]
            if start_target in skip_inds:
                continue
            next_rels = [r2 for r2 in rels if
                         int(r2['target_ids'][0].split('_')[1]) > end_target]
            if len(next_rels) > 0:
                next_rels.sort(key=lambda x:
                               int(x['target_ids'][0].split('_')[1]))
                next_rel = next_rels[0]
                start_next_target = int(next_rel['target_ids'][0].split('_')[1])
                end_next_target = int(next_rel['target_ids'][0].split('_')[2])
                ner_matches = [n for n in ners if
                               n['text'].startswith(targ_name) and
                               n['begin'] == start_target and
                               n['end'] == end_next_target]
                if len(ner_matches) > 0:
                    print('Merging %s and %s' % (targ_name,
                                                 next_rel['target_names'][0]))
                    targ_name += ' ' + next_rel['target_names'][0]
                    skip_inds.append(start_next_target)
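            # Example with hypothetical values: given relations whose targets
            # are 'Gale' (target_ids 'doc1_25_29') and 'Crater' ('doc1_30_36'),
            # plus an NER entry {'text': 'Gale Crater', 'begin': 25, 'end': 36},
            # the targets merge into 'Gale Crater' and the second relation is
            # skipped via skip_inds on a later iteration.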

            outf.write(',%s,%s,%s,"%s"\n' %
                       (docid,
                        # These are arrays, but for auto-annotations,
                        # they will only ever have one item
                        #r['target_names'][0],
                        targ_name,
                        r['cont_names'][0],
                        r['sentence']))
            # build URL manually? It's in the doc->url field.
        pbar.update(i)
        i += 1

    print
    outf.close()


def main():
    if len(sys.argv) != 2:
        usage()

    if not os.path.exists(sys.argv[1]):
        print 'Error: could not find JSON input file %s.' % sys.argv[1]
        usage()

    convert_json_to_csv(sys.argv[1])


if __name__ == '__main__':
    main()
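
A minimal sketch of how the script would be invoked, assuming a JSON-lines file produced by parse_all.py (the filename docs.jsonl is hypothetical); the output file name is the input name with its extension replaced by .csv:

./json2csv.py docs.jsonl
Writing to docs.csv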
