Skip to content

Commit f36c491

Browse files
committed
When alias relations are present, map target names to known aliases.
1 parent 67b3e09 commit f36c491

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

src/parserindexer/brat_ann_indexer.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def parse_ann_line(self, ann_line):
5353
parts = ann_line.strip().split('\t')
5454
res = {
5555
'annotation_id_s': parts[0],
56-
'source': 'brat',
56+
#'source': 'brat',
57+
'source': 'reviewed',
5758
}
5859
if parts[0][0] == 'T': # anchors (for targets, components, events)
5960
args = parts[1].split()[1:]
@@ -162,13 +163,18 @@ def read_records(self, in_file):
162163
targets_anns = ch.get('targets_ss', [])
163164
cont_anns = ch.get('cont_ss', [])
164165
ch['target_ids_ss'] = list(map(lambda t: index[t]['id'], targets_anns))
166+
ch['target_ann_ids_ss'] = list(map(lambda t: index[t]['annotation_id_s'], targets_anns))
165167
ch['target_names_ss'] = list(map(lambda t: index[t]['name'], targets_anns))
166168
ch['cont_ids_ss'] = list(map(lambda c: index[c]['id'], cont_anns))
167169
ch['cont_names_ss'] = list(map(lambda c: index[c]['name'], cont_anns))
168170
# extract excerpt from anchor annotation
169171
anc_doc = index[ch['anchor_s']]
170172
ch['excerpt_t'] = self.extract_excerpt(txt, anc_doc)
171173

174+
# Track aliases
175+
targets = [a for a in children if a.get('type') == 'target']
176+
aliases = [a for a in children if a.get('type') == 'alias']
177+
172178
# Extract references
173179
references = extract_references(txt)
174180

@@ -188,14 +194,20 @@ def read_records(self, in_file):
188194
for child in children:
189195
if 'name' in child:
190196
if child['type'] == 'target':
191-
child['can_name'] = canonical_target_name(child['name'])
197+
child['can_name'] = \
198+
canonical_target_name(child['name'],
199+
child['annotation_id_s'],
200+
targets, aliases)
192201
else:
193202
child['can_name'] = canonical_name(child['name'])
194203
if 'target_names_ss' in child:
195-
child['target_names_ss'] = map(canonical_target_name,
196-
child['target_names_ss'])
204+
child['target_names_ss'] = \
205+
[canonical_target_name(t, i, targets, aliases) \
206+
for (t,i) in zip(child['target_names_ss'],
207+
child['target_ann_ids_ss'])]
197208
if 'cont_names_ss' in child:
198-
child['cont_names_ss'] = map(canonical_name, child['cont_names_ss'])
209+
child['cont_names_ss'] = \
210+
[canonical_name(c) for c in child['cont_names_ss']]
199211
yield child
200212

201213
def index(self, solr_url, in_file):

src/parserindexer/utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,27 @@ def canonical_name(name):
135135
return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')
136136

137137

138-
def canonical_target_name(name):
138+
def canonical_target_name(name, id=None, targets=None, aliases=None):
    """
    Gets canonical target name, resolving known aliases first.

    If an alias relation applies to this mention, the name of the alias's
    antecedent target is canonicalized instead of 'name'.

    :param name - name whose canonical name is to be looked up
    :param id - annotation id of the mention being named (compared against
                the 'arg1_s' field of each alias); optional
    :param targets - list of target annotation dicts; the
                'annotation_id_s' and 'name' fields are read; optional
    :param aliases - list of alias relation dicts; 'arg1_s' is treated as
                the alias mention and 'arg2_s' as its antecedent; optional
    :return canonical name
    """
    name = name.strip()
    targets = targets or []
    aliases = aliases or []

    # Index targets once: id -> name (first occurrence wins), and the ids
    # of every target whose text exactly matches 'name'.
    name_by_id = {}
    same_name_ids = set()
    for t in targets:
        t_id = t.get('annotation_id_s')
        t_name = t.get('name')
        if t_id is None or t_name is None:
            # Targets without an id or a name cannot take part in lookup
            continue
        name_by_id.setdefault(t_id, t_name)
        if t_name == name:
            same_name_ids.add(t_id)

    # Look up 'name' in the aliases; if found, replace with its antecedent
    # Note: this is super permissive.  Exact match on id is safe,
    # but we're also allowing any exact-text match with any other
    # known target name.
    antecedents = []
    for a in aliases:
        alias_id = a.get('arg1_s')
        if alias_id is None:
            continue
        if alias_id == id or alias_id in same_name_ids:
            antecedents.append(a.get('arg2_s'))

    if antecedents:
        # Ideally there is only one; let's use the first one.
        # If it points at an annotation that is not a known target,
        # keep the original name instead of failing.
        can_name = name_by_id.get(antecedents[0])
        if can_name is not None:
            print('Mapping <%s> to <%s>' % (name, can_name))
            name = can_name

    return re.sub(r"[\s_-]+", " ", name).title().replace(' ', '_')

0 commit comments

Comments
 (0)