When alias relations are present, map target names to known aliases.

wkiri · wkiri · commit f36c49180ab9 · 2019-06-14T12:25:20.000-07:00
diff --git a/src/parserindexer/brat_ann_indexer.py b/src/parserindexer/brat_ann_indexer.py
@@ -53,7 +53,8 @@ def parse_ann_line(self, ann_line):
         parts = ann_line.strip().split('\t')
         res = {
             'annotation_id_s': parts[0],
-            'source': 'brat',
+            #'source': 'brat',
+            'source': 'reviewed',
         }
         if parts[0][0] == 'T': # anchors (for targets, components, events)
             args = parts[1].split()[1:]
@@ -162,13 +163,18 @@ def read_records(self, in_file):
                         targets_anns = ch.get('targets_ss', [])
                         cont_anns = ch.get('cont_ss', [])
                         ch['target_ids_ss'] = list(map(lambda t: index[t]['id'], targets_anns))
+                        ch['target_ann_ids_ss'] = list(map(lambda t: index[t]['annotation_id_s'], targets_anns))
                         ch['target_names_ss'] = list(map(lambda t: index[t]['name'], targets_anns))
                         ch['cont_ids_ss'] = list(map(lambda c: index[c]['id'], cont_anns))
                         ch['cont_names_ss'] = list(map(lambda c: index[c]['name'], cont_anns))
                         # extract excerpt from anchor annotation
                         anc_doc = index[ch['anchor_s']]
                         ch['excerpt_t'] = self.extract_excerpt(txt, anc_doc)
 
+                    # Collect target and alias annotations for alias lookup
+                    targets = [a for a in children if a.get('type') == 'target']
+                    aliases = [a for a in children if a.get('type') == 'alias']
+
                 # Extract references
                 references = extract_references(txt)
 
@@ -188,14 +194,20 @@ def read_records(self, in_file):
                 for child in children:
                     if 'name' in child:
                         if child['type'] == 'target':
-                            child['can_name'] = canonical_target_name(child['name'])
+                            child['can_name'] = \
+                                canonical_target_name(child['name'], 
+                                                      child['annotation_id_s'],
+                                                      targets, aliases)
                         else:
                             child['can_name'] = canonical_name(child['name'])
                     if 'target_names_ss' in child:
-                        child['target_names_ss'] = map(canonical_target_name, 
-                                                       child['target_names_ss'])
+                        child['target_names_ss'] = \
+                            [canonical_target_name(t, i, targets, aliases) \
+                                 for (t,i) in zip(child['target_names_ss'],
+                                                  child['target_ann_ids_ss'])]
                     if 'cont_names_ss' in child:
-                        child['cont_names_ss'] = map(canonical_name, child['cont_names_ss'])
+                        child['cont_names_ss'] = \
+                            [canonical_name(c) for c in child['cont_names_ss']]
                     yield child
 
     def index(self, solr_url, in_file):
diff --git a/src/parserindexer/utils.py b/src/parserindexer/utils.py
@@ -135,11 +135,27 @@ def canonical_name(name):
         return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')
 
 
-def canonical_target_name(name):
+def canonical_target_name(name, id, targets, aliases):
     """
     Gets canonical target name
     :param name - name whose canonical name is to be looked up
     :return canonical name
     """
     name = name.strip()
+    # Look up 'name' in the aliases; if found, replace it with its antecedent.
+    # Note: this is very permissive.  An exact match on the annotation id
+    # is safe, but we also accept any alias whose arg1_s is another known
+    # target with exactly the same name text.
+    all_targets = [t['annotation_id_s'] for t in targets 
+                   if t['name'] == name]
+    name_aliases = [a['arg2_s'] for a in aliases 
+                    if ((a['arg1_s'] == id) or 
+                        (a['arg1_s'] in all_targets))]
+    if len(name_aliases) > 0:
+        # Ideally there is only one; let's use the first one
+        can_name = [t['name'] for t in targets \
+                        if t['annotation_id_s'] == name_aliases[0]]
+        print('Mapping <%s> to <%s>' % (name, can_name[0]))
+        name = can_name[0]
+
     return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')