@@ -28,13 +28,17 @@ def __init__(self, relations_table, articles_table, standoff_index):
2828 self .standoff_cache = {}
2929
3030 def process_statements (self ):
31+ """Process rows of the EVEX relations table into INDRA Statements."""
3132 for row in tqdm .tqdm (self .relations_table .itertuples (),
3233 total = len (self .relations_table ),
3334 desc = 'Processing Evex relations' ):
3435 self .statements += self .process_row (row )
3536
3637 def process_row (self , row ):
3738 """Process a row in the relations table into INDRA Statements."""
39+
40+ # First, we determine the statement type and create the subject/object
41+ # agents.
3842 pol_idx = 1 if row .refined_polarity == 'Negative' else 0
3943 stmt_types = type_indra_mappings .get (row .refined_type )
4044 if not stmt_types :
@@ -47,19 +51,27 @@ def process_row(self, row):
4751 obj_agent = get_standard_agent ('EGID:%s' % target_id ,
4852 db_refs = {'EGID' : target_id })
4953
54+ # We now figure out what articles provide evidence for this relation
5055 article_keys = self .article_lookup .get (row .general_event_id )
5156 stmts = []
5257 for article_prefix , article_id in article_keys :
58+ # These text refs are known based on info we have independent of
59+ # standoff availability
5360 text_refs = {article_prefix : article_id }
5461 pmid = article_id if article_prefix == 'PMID' else None
5562
63+ # We now find the standoff for the given relation and gather
64+ # evidence info for it if possible.
5665 standoff = self .get_standoff_for_event (article_prefix , article_id )
5766 if not standoff :
5867 evidence_info = [{}]
5968 else :
6069 evidence_info = find_evidence_info (standoff , source_id ,
6170 target_id , row .refined_type ,
6271 row .refined_polarity )
72+ # For each article, it's possible that multiple evidences are
73+ # available for the relation so we create separate Statements
74+ # (each with a single Evidence) here.
6375 for ev_info in evidence_info :
6476 annotations = {
6577 'evex_relation_type' : row .refined_type ,
@@ -76,12 +88,17 @@ def process_row(self, row):
7688 text_refs = text_refs ,
7789 text = ev_info .get ('text' ),
7890 annotations = annotations )
91+
92+ # We can set the raw Agent text which is specific to this
93+ # given evidence.
7994 subj = copy .deepcopy (subj_agent )
8095 obj = copy .deepcopy (obj_agent )
8196 if ev_info .get ('subj_text' ):
8297 subj .db_refs ['TEXT' ] = ev_info .get ('subj_text' )
8398 if ev_info .get ('obj_text' ):
8499 obj .db_refs ['TEXT' ] = ev_info .get ('obj_text' )
100+
101+ # Finally, create the Statement object
85102 if stmt_type == Complex :
86103 stmt = Complex ([subj , obj ], evidence = [ev ])
87104 else :
@@ -91,6 +108,7 @@ def process_row(self, row):
91108 return stmts
92109
93110 def get_standoff_for_event (self , article_prefix , article_id ):
111+ """Based on article info, return a standoff object of annotations."""
94112 key = (
95113 'pmc' if article_prefix == 'PMCID' else 'pubmed' ,
96114 article_id [3 :] if article_prefix == 'PMCID' else article_id
@@ -201,6 +219,7 @@ def find_evidence_info(standoff, source_id, target_id, event_type,
201219
202220
203221def get_regulation_info (standoff , regulation , source_uid , target_uid ):
222+ """Gather specific evidence info from a regulation in a standoff."""
204223 text = standoff .get_sentence_for_offset (regulation .event .start )
205224 subj = standoff .elements [source_uid ]
206225 subj_text = subj .text
@@ -318,19 +337,30 @@ def process_annotations(ann_file):
318337 elements = {}
319338 reader = csv .reader (ann_file , delimiter = '\t ' , quotechar = None )
320339 for row in reader :
340+ # The first element is always the UID
321341 uid = row [0 ]
322342 assert len (row ) == 2 or len (row ) == 3
343+ # If the row has 3 elements, then the last one is a value
323344 value = row [2 ] if len (row ) == 3 else None
345+ # The second element can have multiple space-separated parts
324346 parts = row [1 ].split ()
347+ # If this is an entity of some type
325348 if parts [0 ] in {'GGP' , 'Entity' }:
326349 entity = Entity (uid , parts [0 ], int (parts [1 ]), int (parts [2 ]), value )
327350 elements [uid ] = entity
351+ # These represent entity references like Entrez IDs
328352 elif parts [0 ] == 'Reference' :
329353 ref_ns , ref_id = parts [2 ].split (':' , maxsplit = 1 )
330354 elements [parts [1 ]].references [ref_ns ] = ref_id
355+ # These are various event types, we enumerate them explicitly in
356+ # the standoff_event_types variable to make sure it's not some
357+ # other type of row.
331358 elif parts [0 ] in standoff_event_types :
332359 event = Event (uid , parts [0 ], int (parts [1 ]), int (parts [2 ]), value )
333360 elements [uid ] = event
361+ # These are confidence values associated with regulations but also
362+ # other things like Negation. An additional complication is that it
363+ # can either represent a numerical or a qualitative confidence level.
334364 elif parts [0 ] == 'Confidence' :
335365 # Negation confidence
336366 if isinstance (parts [1 ], Negation ):
@@ -341,12 +371,16 @@ def process_annotations(ann_file):
341371 # Regulation confidence level
342372 else :
343373 elements [parts [1 ]].confidence_level = parts [2 ]
374+ # Represents a negation for a regulation
344375 elif parts [0 ] == 'Negation' :
345376 elements [uid ] = Negation (uid )
346377 elements [parts [1 ]].negation = elements [uid ]
378+ # Represents a speculation for a regulation
347379 elif parts [0 ] == 'Speculation' :
348380 elements [uid ] = Speculation (uid )
349381 elements [parts [1 ]].speculation = elements [uid ]
382+ # The remainder of cases are regulations. These are either basic
383+ # regulations or special cases like subunit-complex relations.
350384 elif len (row ) == 2 :
351385 if ':' in parts [0 ]:
352386 event_type , parent_id = parts [0 ].split (':' )
@@ -360,6 +394,8 @@ def process_annotations(ann_file):
360394 else :
361395 assert False , row
362396
397+ # The row contains a series of arguments for the regulation that
398+ # need to be parsed out in parts
363399 arguments = {}
364400 for element in parts [1 :]:
365401 role , arg_uid = element .split (':' )
@@ -368,6 +404,9 @@ def process_annotations(ann_file):
368404 # placeholder for these elements that can be resolved later
369405 element_obj = elements .get (arg_uid , Unresolved (arg_uid ))
370406
407+ # There are argument types that there are more than one of,
408+ # e.g., Theme for Binding so we need to sometimes turn
409+ # these into lists.
371410 if role in arguments :
372411 if not isinstance (arguments [role ], list ):
373412 arguments [role ] = [arguments [role ]]
@@ -377,7 +416,7 @@ def process_annotations(ann_file):
377416 regulation = Regulation (uid , event , arguments )
378417 elements [uid ] = regulation
379418 else :
380- print ( row )
419+ logger . error ( 'Could not process standoff file row: %s' % row )
381420 break
382421
383422 # We now need to resolve Unresolved regulation references. At this point
@@ -517,6 +556,11 @@ def paths_to_entrez_id(self, entrez_id):
517556 return paths
518557
519558
559+ @dataclass
560+ class Unresolved :
561+ uid : str
562+
563+
520564def add_subgraph (g , obj ):
521565 """Recursively build up a graph of standoff objects."""
522566 label = '{ID | %s} | {event_type | %s}' % (obj .uid , obj .event .get_type ())
@@ -542,11 +586,6 @@ def add_subgraph(g, obj):
542586 g .add_edge (obj .uid , vv .uid , label = k )
543587
544588
545- @dataclass
546- class Unresolved :
547- uid : str
548-
549-
550589# The set of event types used in the standoff format
551590standoff_event_types = {
552591 'Binding' ,
0 commit comments