#!/usr/bin/env python
# json2csv.py
# Convert JSON output (from parse_all.py/CoreNLP) to .csv format
# to enable manual review of extracted relations.
#
# Kiri Wagstaff
# August 28, 2017
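
# The input is expected to be one JSON object per line ("JSON lines").
# The sketch below is inferred from the field accesses in this script,
# so treat it as an illustration with hypothetical values, not the
# authoritative schema:
#
#   {"metadata": {"resourceName": "doc0001.pdf",
#                 "rel": [{"target_ids": ["t_3_4"],
#                          "target_names": ["Windjana"],
#                          "cont_names": ["chlorine"],
#                          "sentence": "..."}],
#                 "ner": [{"text": "Windjana", "begin": 3, "end": 4}]}}
#
# Only the second and third '_'-separated fields of each target_id
# (begin/end offsets) are parsed below.
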
import sys, os
from ioutils import read_jsonlines
from progressbar import ProgressBar, ETA, Bar, Percentage

def usage():
    print './json2csv.py <JSON file>'
    sys.exit(1)


def convert_json_to_csv(jsonfile):
    # Get the number of lines (docs) to process.
    # Do this before re-reading the file because read_jsonlines()
    # returns a generator, which has no length.
    with open(jsonfile) as f:
        ndocs = sum(1 for _ in f)

    # Read in the JSON file
    docs = read_jsonlines(jsonfile)

    # Open the output CSV file
    outfn = jsonfile[:jsonfile.rfind('.')] + '.csv'
    outf = open(outfn, 'w')
    print 'Writing to', outfn
    # Header
    outf.write('# Judgment, Docid, Target, Component, Sentence, URL\n')
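    # For illustration (hypothetical values), a data row looks like:
    #   ,doc0001.pdf,Windjana,chlorine,"The Windjana target contains chlorine."
    # The leading comma leaves the Judgment column empty for the reviewer,
    # and the URL column is not yet populated (see the note below).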

    widgets = ['Docs (of %d): ' % ndocs, Percentage(), ' ',
               Bar('='), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=ndocs).start()
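    # Given the widgets above, the bar renders roughly as (illustrative):
    #   Docs (of 120):  45% |=========           | ETA:  0:00:03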
    # Iterate over documents
    i = 1
    for d in docs:
        if 'rel' not in d['metadata']:
            # Still advance the progress bar for docs with no relations
            pbar.update(i)
            i += 1
            continue

        docid = d['metadata']['resourceName']

        # Output relations into the .csv file
        rels = d['metadata']['rel']
        ners = d['metadata']['ner']
        skip_inds = []
        for r in rels:
            # Special merging step for adjacent Target tokens:
            # if this target followed by the next relation's target
            # matches a single multi-word NER span, expand the target
            # name and skip the next relation (see example below).
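            # Worked example with hypothetical values: given relations
            # whose targets are 'Yellowknife' (t_10_11) and 'Bay'
            # (t_12_13), plus an NER entry
            #   {'text': 'Yellowknife Bay', 'begin': 10, 'end': 13},
            # one row is written with target 'Yellowknife Bay', and
            # the 'Bay' relation is skipped via skip_inds.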
            start_target = int(r['target_ids'][0].split('_')[1])
            end_target = int(r['target_ids'][0].split('_')[2])
            targ_name = r['target_names'][0]
            if start_target in skip_inds:
                continue
            # Find the relation whose target starts soonest after this one
            next_rels = [r2 for r2 in rels if
                         int(r2['target_ids'][0].split('_')[1]) > end_target]
            if len(next_rels) > 0:
                next_rels.sort(key=lambda x:
                               int(x['target_ids'][0].split('_')[1]))
                next_rel = next_rels[0]
                start_next_target = int(next_rel['target_ids'][0].split('_')[1])
                end_next_target = int(next_rel['target_ids'][0].split('_')[2])
                ner_matches = [n for n in ners if
                               n['text'].startswith(targ_name) and
                               n['begin'] == start_target and
                               n['end'] == end_next_target]
                if len(ner_matches) > 0:
                    print 'Merging %s and %s' % (targ_name,
                                                 next_rel['target_names'][0])
                    targ_name += ' ' + next_rel['target_names'][0]
                    skip_inds.append(start_next_target)

            outf.write(',%s,%s,%s,"%s"\n' %
                       (docid,
                        # These are arrays, but for auto-annotations,
                        # they will only ever have one item
                        targ_name,
                        r['cont_names'][0],
                        # Escape double quotes so the CSV stays valid
                        r['sentence'].replace('"', '""')))
        # Build URL manually? It's in the doc->url field.
        pbar.update(i)
        i += 1

    pbar.finish()
    print
    outf.close()

def main():
    if len(sys.argv) != 2:
        usage()

    if not os.path.exists(sys.argv[1]):
        print 'Error: could not find JSON input file %s.' % sys.argv[1]
        usage()

    convert_json_to_csv(sys.argv[1])


if __name__ == '__main__':
    main()