Commit cbf9a85

Merge commit: 2 parents, 94b3a13 + 4948103
4 files changed, +206 -12 lines changed

src/parserindexer/brat_ann_indexer.py

Lines changed: 26 additions & 9 deletions
@@ -5,7 +5,7 @@
 from argparse import ArgumentParser
 from indexer import parse_lpsc_from_path
 import re
-from utils import canonical_name
+from utils import canonical_name, canonical_target_name

 # Functions to perform reference removal (assumes [n] reference style)
 # Written by Karanjeet Singh
@@ -53,7 +53,8 @@ def parse_ann_line(self, ann_line):
         parts = ann_line.strip().split('\t')
         res = {
             'annotation_id_s': parts[0],
-            'source': 'brat',
+            #'source': 'brat',
+            'source': 'reviewed',
         }
         if parts[0][0] == 'T':  # anchors (for targets, components, events)
             args = parts[1].split()[1:]
@@ -122,10 +123,11 @@ def extract_excerpt(self, content, ann):
         if m:
             sent_start = sent_start + m.start()
         # End: next period followed by {space,newline}, or end of document.
-        sent_end = anchor_end + content[anchor_end:].find('. ')+1
-        if sent_end <= anchor_end:
-            sent_end = anchor_end + content[anchor_end:].find('.\n')+1
-            if sent_end <= anchor_end:
+        # Better: skip "wt.", "ig." (for Figure), "(e" or ".g"
+        m = re.search('(?<!(wt|ig|\(e|\.g))\.[ \n]', content[anchor_end:])
+        if m != None:
+            sent_end = anchor_end + m.start() + 1
+        else:
             sent_end = len(content)
         return content[sent_start:sent_end]

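Note on the new sentence-boundary regex: a minimal sketch (not part of this
commit; the sample text is made up) of how the lookbehind rejects the dots in
"wt.", "Fig.", and "e.g." so excerpts are no longer truncated early:

    import re

    # Hypothetical excerpt text following an anchor annotation.
    text = 'contains ~5 wt.% olivine (e.g. in Fig. 3). Next sentence.'
    m = re.search(r'(?<!(wt|ig|\(e|\.g))\.[ \n]', text)
    # The dots in "e.g." and "Fig." fail the lookbehind; the first
    # accepted match is the sentence-final period.
    print(text[:m.start() + 1])
    # -> contains ~5 wt.% olivine (e.g. in Fig. 3).
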
@@ -162,13 +164,18 @@ def read_records(self, in_file):
             targets_anns = ch.get('targets_ss', [])
             cont_anns = ch.get('cont_ss', [])
             ch['target_ids_ss'] = list(map(lambda t: index[t]['id'], targets_anns))
+            ch['target_ann_ids_ss'] = list(map(lambda t: index[t]['annotation_id_s'], targets_anns))
             ch['target_names_ss'] = list(map(lambda t: index[t]['name'], targets_anns))
             ch['cont_ids_ss'] = list(map(lambda c: index[c]['id'], cont_anns))
             ch['cont_names_ss'] = list(map(lambda c: index[c]['name'], cont_anns))
             # extract excerpt from anchor annotation
             anc_doc = index[ch['anchor_s']]
             ch['excerpt_t'] = self.extract_excerpt(txt, anc_doc)

+        # Track aliases
+        targets = [a for a in children if a.get('type') == 'target']
+        aliases = [a for a in children if a.get('type') == 'alias']
+
         # Extract references
         references = extract_references(txt)

@@ -187,11 +194,21 @@ def read_records(self, in_file):
         }
         for child in children:
             if 'name' in child:
-                child['can_name'] = canonical_name(child['name'])
+                if child['type'] == 'target':
+                    child['can_name'] = \
+                        canonical_target_name(child['name'],
+                                              child['annotation_id_s'],
+                                              targets, aliases)
+                else:
+                    child['can_name'] = canonical_name(child['name'])
             if 'target_names_ss' in child:
-                child['target_names_ss'] = map(canonical_name, child['target_names_ss'])
+                child['target_names_ss'] = \
+                    [canonical_target_name(t, i, targets, aliases) \
+                     for (t,i) in zip(child['target_names_ss'],
+                                      child['target_ann_ids_ss'])]
             if 'cont_names_ss' in child:
-                child['cont_names_ss'] = map(canonical_name, child['cont_names_ss'])
+                child['cont_names_ss'] = \
+                    [canonical_name(c) for c in child['cont_names_ss']]
             yield child

     def index(self, solr_url, in_file):
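Note on the zip above: carrying target_ann_ids_ss alongside target_names_ss
means each surface name is canonicalized with its own annotation id, so
aliases resolve per mention rather than by text alone. A hypothetical sketch
(field values are made up, not from the commit):

    # Hypothetical child record from read_records.
    child = {'target_names_ss': ['BS', 'Big Sky'],
             'target_ann_ids_ss': ['T2', 'T1']}
    # zip pairs each name with its own annotation id:
    pairs = zip(child['target_names_ss'], child['target_ann_ids_ss'])
    # -> [('BS', 'T2'), ('Big Sky', 'T1')]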
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+#
+# Read in MTE extractions (.jsonl) and align with expert-vetting (.csv)
+# to filter; write out only those marked 'Y' by expert to new .jsonl.
+#
+# Author: Kiri Wagstaff
+# June 10, 2018
+# Copyright notice at bottom of file.
+
+import sys, os
+from ioutils import read_jsonlines, dump_jsonlines
+import codecs, csv
+
+def read_extractions(extractions):
+    # Get the number of lines (docs) to process
+    # Do this before re-opening the file because read_jsonlines()
+    # returns a generator.
+    with open(extractions) as f:
+        l = f.readlines()
+        ndocs = len(l)
+        f.close()
+
+    # Read in the JSON file (contains, among other things, extractions)
+    docs = read_jsonlines(extractions)
+
+    return docs, ndocs
+
+
+# Read in the expert annotations (.csv)
+def read_expert(expert):
+    judgments = []
+    #nrows = 0
+    with codecs.open(expert, 'r', 'UTF-8') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            judgments.append(row)
+            #nrows += 1
+            #if row['Judgment'] == 'Y':
+            #    approved.append(row)
+    #print len(approved), 'of', nrows, 'relations approved.'
+    print 'Read %d judgments.' % len(judgments)
+
+    return judgments
+
+
+def query_relation(target, cont, sentence):
+    print('<%s> contains <%s>? [y/n]' % (target, cont))
+    print('Sentence: <%s>' % sentence)
+
+    return raw_input()
+
+
+def main(extractions, expert, outfile):
+
+    # Check arguments
+    if not os.path.exists(extractions):
+        print('Could not find extractions file %s.' % extractions)
+        sys.exit(1)
+
+    if not os.path.exists(expert):
+        print('Could not find expert file %s.' % expert)
+        sys.exit(1)
+
+    # Read in the JSON file (contains, among other things, extractions)
+    docs, ndocs = read_extractions(extractions)
+    filtered_docs = []
+
+    # Read in the expert annotations (.csv)
+    judgments = read_expert(expert)
+
+    # Align them. Iterate over the documents.
+    n_rels_keep = 0
+    n_rels_total = 0
+    for (i,d) in enumerate(docs):
+        # If there are no relations, omit this document
+        if 'rel' not in d['metadata']:
+            continue
+
+        docid = d['metadata']['resourceName']
+        rels = d['metadata']['rel']
+        n_rels_total += len(rels)
+
+        doc_judgments = [j for j in judgments if j[' Docid'] == docid]
+
+        # Relations to keep
+        filtered_rels = []
+
+        if len(doc_judgments) == len(rels):
+            # Same number in each set, so we can zip them up
+            for (r, j) in zip(rels, doc_judgments):
+                # Can't do exact string match on target_name because
+                # some are partials.
+                # Can't do exact string match on cont_name because
+                # I helpfully expanded element names in the expert file.
+                # Can do match on sentence at least!
+                if (r['target_names'][0] == j[' Target'] and
+                    #r['cont_names'][0] == j[' Component'] and
+                    r['sentence'] == j[' Sentence']):
+                    # Only keep items judged 'Y'
+                    if j['Judgment'] == 'Y':
+                        filtered_rels.append(r)
+                else:
+                    # Mismatch, so drop into manual review mode
+                    res = query_relation(r['target_names'][0],
+                                         r['cont_names'][0],
+                                         r['sentence'])
+                    if res == 'y' or res == 'Y':
+                        filtered_rels.append(r)
+        else:
+            # Different number of relations in expert vs. system output
+            # so time for manual review
+            print('%d/%d: ****** MANUAL REVIEW MODE (%s) ******' % \
+                  (i, ndocs, docid))
+            for r in rels:
+                res = query_relation(r['target_names'][0],
+                                     r['cont_names'][0],
+                                     r['sentence'])
+                if res == 'y' or res == 'Y':
+                    filtered_rels.append(r)
+
+        print('%s (%d/%d): Kept %d/%d relations.' % \
+              (docid, i, ndocs, len(filtered_rels), len(rels)))
+
+        # Only save this document if it has relations remaining
+        if len(filtered_rels) > 0:
+            n_rels_keep += len(filtered_rels)
+            d['metadata']['rel'] = filtered_rels
+            filtered_docs.append(d)
+
+    # Save filtered JSON content to outfile
+    dump_jsonlines(filtered_docs, outfile)
+    print
+    print('Kept %d/%d relations in %d/%d documents.' % \
+          (n_rels_keep, n_rels_total,
+           len(filtered_docs), ndocs))
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
+
+    parser.add_argument('extractions', help='.jsonl file containing all NER and RE extractions')
+    parser.add_argument('expert', help='.csv file containing expert judgment of all relations')
+    parser.add_argument('outfile', help='.jsonl file to store filtered extractions')
+
+    args = parser.parse_args()
+
+    main(**vars(args))
+
+
+# Copyright 2018, by the California Institute of Technology. ALL
+# RIGHTS RESERVED. United States Government Sponsorship
+# acknowledged. Any commercial use must be negotiated with the Office
+# of Technology Transfer at the California Institute of Technology.
+#
+# This software may be subject to U.S. export control laws and
+# regulations. By accepting this document, the user agrees to comply
+# with all applicable U.S. export laws and regulations. User has the
+# responsibility to obtain export licenses, or other export authority
+# as may be required before exporting such information to foreign
+# countries or providing access to foreign persons.
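
Note on the judgment keys used above: csv.DictReader takes field names
verbatim from the .csv header line, and every column except 'Judgment'
carries a leading space. A hypothetical row as the script sees it (all
values are made up):

    row = {'Judgment': 'Y',
           ' Docid': '1234.pdf',   # matched against metadata['resourceName']
           ' Target': 'Windjana',
           ' Component': 'K',
           ' Sentence': 'Windjana is enriched in K.'}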

src/parserindexer/json2brat.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 # Kiri Wagstaff
 # July 31, 2017

-import sys, os, shutil
+import sys, os, shutil, io
 import json
 from ioutils import read_jsonlines

@@ -31,7 +31,7 @@ def convert_json_to_brat(jsonfile, outdir):
         ners = d['metadata']['ner']
         outfn = os.path.join(outdir,
                              d['metadata']['resourceName'][:-4] + '.ann')
-        outf = open(outfn, 'w')
+        outf = io.open(outfn, 'w', encoding='utf8')
         print 'Writing to', outfn
         for (t, n) in enumerate(ners):
             outf.write('T%d\t%s %s %s\t%s\n' % \
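
Note on the io.open change: under Python 2 (which this module targets, given
its print statements), the builtin open() returns a byte stream, so writing a
unicode annotation containing non-ASCII text raises UnicodeEncodeError;
io.open encodes transparently. A minimal sketch, not from the commit, with a
made-up annotation line:

    import io

    ann = u'T1\tTarget 0 11\tGale\u2013Crater\n'  # hypothetical .ann line
    with io.open('example.ann', 'w', encoding='utf8') as outf:
        outf.write(ann)  # plain open() would raise on the en dash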

src/parserindexer/utils.py

Lines changed: 17 additions & 1 deletion
@@ -135,11 +135,27 @@ def canonical_name(name):
     return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')


-def canonical_target_name(name):
+def canonical_target_name(name, id, targets, aliases):
     """
     Gets canonical target name
     :param name - name whose canonical name is to be looked up
     :return canonical name
     """
     name = name.strip()
+    # Look up 'name' in the aliases; if found, replace with its antecedent
+    # Note: this is super permissive. Exact match on id is safe,
+    # but we're also allowing any exact-text match with any other
+    # known target name.
+    all_targets = [t['annotation_id_s'] for t in targets
+                   if t['name'] == name]
+    name_aliases = [a['arg2_s'] for a in aliases
+                    if ((a['arg1_s'] == id) or
+                        (a['arg1_s'] in all_targets))]
+    if len(name_aliases) > 0:
+        # Ideally there is only one; let's use the first one
+        can_name = [t['name'] for t in targets \
+                    if t['annotation_id_s'] == name_aliases[0]]
+        print('Mapping <%s> to <%s>' % (name, can_name[0]))
+        name = can_name[0]
+
     return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')
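
A hypothetical walk-through of the alias resolution above (the target names
and ids are made up, not from the commit):

    targets = [{'annotation_id_s': 'T1', 'name': 'Big Sky'},
               {'annotation_id_s': 'T2', 'name': 'BS'}]
    # Alias relation as the comprehensions read it:
    # arg1_s is the aliased mention; arg2_s is its antecedent's id.
    aliases = [{'arg1_s': 'T2', 'arg2_s': 'T1'}]

    canonical_target_name('BS', 'T2', targets, aliases)
    # prints:  Mapping <BS> to <Big Sky>
    # returns: 'Big_Sky'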
