Skip to content

Commit 157ae6f

Browse files
thomaslim6793 and bgyori
authored and committed
Updated the indra_bert API to use the JSON INDRA statement output directly, instead of the more verbose raw output that required parsing and converting into INDRA statement objects.
1 parent 67cf028 commit 157ae6f

File tree

3 files changed

+48
-90
lines changed

3 files changed

+48
-90
lines changed

indra/resources/default_belief_probs.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
"acsn": 0.01,
3737
"semrep": 0.05,
3838
"wormbase": 0.01,
39-
"indra_bert": 0.05
39+
"indra_bert": 0.05,
40+
"indra_gpt": 0.05
4041
},
4142
"rand": {
4243
"eidos": 0.3,
@@ -75,6 +76,7 @@
7576
"acsn": 0.1,
7677
"semrep": 0.3,
7778
"wormbase": 0.1,
78-
"indra_bert": 0.3
79+
"indra_bert": 0.3,
80+
"indra_gpt": 0.3
7981
}
8082
}

indra/sources/indra_bert/api.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ def create_extractor(
2020
ner_model_path="thomaslim6793/indra_bert_ner_agent_detection",
2121
stmt_model_path="thomaslim6793/indra_bert_indra_stmt_classifier",
2222
role_model_path="thomaslim6793/indra_bert_indra_stmt_agents_role_assigner",
23+
mutations_model_path="thomaslim6793/indra_bert_agent_mutation_detection",
2324
stmt_conf_threshold=0.95
2425
):
2526
try:
2627
ise = IndraStructuredExtractor(
2728
ner_model_path=ner_model_path,
2829
stmt_model_path=stmt_model_path,
29-
role_model_path=role_model_path,
30+
role_model_path=role_model_path,
31+
mutations_model_path=mutations_model_path,
3032
stmt_conf_threshold=stmt_conf_threshold
3133
)
3234
except Exception as e:
@@ -36,33 +38,38 @@ def create_extractor(
3638
ner_model_path="thomaslim6793/indra_bert_ner_agent_detection",
3739
stmt_model_path="thomaslim6793/indra_bert_indra_stmt_classifier",
3840
role_model_path="thomaslim6793/indra_bert_indra_stmt_agents_role_assigner",
41+
mutations_model_path="thomaslim6793/indra_bert_agent_mutation_detection",
3942
stmt_conf_threshold=stmt_conf_threshold
4043
)
4144
logger.info(f"Loaded ner_model from: {ise.ner_model_local_path}")
4245
logger.info(f"Loaded stmt_model from: {ise.stmt_model_local_path}")
4346
logger.info(f"Loaded role_model from: {ise.role_model_local_path}")
47+
logger.info(f"Loaded mutations_model from: {ise.mutations_model_local_path}")
4448
return ise
4549

4650
def process_text(text,
4751
ner_model_path="thomaslim6793/indra_bert_ner_agent_detection",
4852
stmt_model_path="thomaslim6793/indra_bert_indra_stmt_classifier",
4953
role_model_path="thomaslim6793/indra_bert_indra_stmt_agents_role_assigner",
54+
mutations_model_path="thomaslim6793/indra_bert_agent_mutation_detection",
5055
stmt_conf_threshold=0.95,
5156
grounder=None):
5257
ise = create_extractor(
5358
ner_model_path=ner_model_path,
5459
stmt_model_path=stmt_model_path,
55-
role_model_path=role_model_path,
60+
role_model_path=role_model_path,
61+
mutations_model_path=mutations_model_path,
5662
stmt_conf_threshold=stmt_conf_threshold
5763
)
58-
res = ise.extract_structured_statements_batch(text)
64+
res = ise.get_json_indra_stmts(text)
5965
ip = IndraBertProcessor(res, grounder=grounder)
6066
return ip, ise
6167

6268
def process_texts(texts,
6369
ner_model_path="thomaslim6793/indra_bert_ner_agent_detection",
6470
stmt_model_path="thomaslim6793/indra_bert_indra_stmt_classifier",
6571
role_model_path="thomaslim6793/indra_bert_indra_stmt_agents_role_assigner",
72+
mutations_model_path="thomaslim6793/indra_bert_agent_mutation_detection",
6673
stmt_conf_threshold=0.95,
6774
grounder=None):
6875

@@ -72,13 +79,14 @@ def process_texts(texts,
7279
ise = create_extractor(
7380
ner_model_path=ner_model_path,
7481
stmt_model_path=stmt_model_path,
75-
role_model_path=role_model_path,
82+
role_model_path=role_model_path,
83+
mutations_model_path=mutations_model_path,
7684
stmt_conf_threshold=stmt_conf_threshold
7785
)
7886

7987
ips = []
8088
for text in tqdm(texts, desc="Processing texts"):
81-
res = ise.extract_structured_statements_batch(text)
89+
res = ise.get_json_indra_stmts(text)
8290
ip = IndraBertProcessor(res, grounder=grounder)
8391
ips.append(ip)
8492
return ips, ise

indra/sources/indra_bert/processor.py

Lines changed: 31 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from indra.statements import *
2+
from indra.statements.io import stmt_from_json
23
from indra.ontology.standardize import standardize_agent_name
34

45
import re
@@ -14,92 +15,39 @@ def __init__(self, data, grounder=None):
1415
self.grounder = grounder if grounder else default_grounder_wrapper
1516
self.extract_statements()
1617

17-
def get_agent(self, agent_info, context=None):
18-
name = agent_info['text']
19-
db_refs = self.grounder(name, context)
20-
db_refs['TEXT'] = name
21-
agent = Agent(name, db_refs=db_refs)
22-
standardize_agent_name(agent, standardize_refs=True)
23-
return agent
2418

2519
def extract_statement(self, entry):
26-
stmt_type = entry['stmt_pred']['label']
27-
roles = entry['role_pred']['roles']
28-
text = entry['original_text']
29-
30-
agents_by_role = {}
31-
raw_texts = {}
32-
coords = {}
33-
for agent_info in roles:
34-
role = agent_info['role']
35-
agents_by_role[role] = self.get_agent(agent_info, text)
36-
raw_texts[role] = agent_info['text']
37-
coords[role] = ([agent_info['start'], agent_info['end']])
38-
39-
evidence = Evidence(
40-
source_api=self.source_api,
41-
text=text,
42-
)
43-
44-
stmt_class = get_statement_by_name(stmt_type)
45-
if issubclass(stmt_class, Complex):
46-
if len(agents_by_role) < 2:
47-
raise ValueError("Expected at least two roles: 'members'",
48-
f" but got {agents_by_role.keys()}")
49-
for role, _ in agents_by_role.items():
50-
if not re.match(r'members\.\d+', role):
51-
raise ValueError(f"Unexpected role '{role}' for members")
52-
53-
members = [agent for role, agent in agents_by_role.items()]
54-
raw_texts = [raw_text for role, raw_text in raw_texts.items()]
55-
coords = [coord for role, coord in coords.items()]
56-
annotations = {
57-
'agents': {
58-
'raw_text': raw_texts,
59-
'coords': coords
60-
}
61-
}
62-
evidence.annotations = annotations
63-
stmt = Complex(members, evidence=[evidence])
64-
return stmt
65-
elif issubclass(stmt_class, (RegulateAmount, RegulateActivity)):
66-
if agents_by_role.keys() != {'subj', 'obj'} or len(agents_by_role) != 2:
67-
raise ValueError("Expected exactly two roles: 'subj' and 'obj'",
68-
f" but got {agents_by_role.keys()}")
69-
70-
subj = agents_by_role.get('subj')
71-
obj = agents_by_role.get('obj')
72-
raw_texts = [raw_texts.get('subj'), raw_texts.get('obj')]
73-
coords = [coords.get('subj'), coords.get('obj')]
74-
annotations = {
75-
'agents': {
76-
'raw_text': raw_texts,
77-
'coords': coords
78-
}
79-
}
80-
evidence.annotations = annotations
81-
stmt = stmt_class(subj, obj, evidence=[evidence])
82-
return stmt
83-
elif issubclass(stmt_class, Modification):
84-
if agents_by_role.keys() != {'enz', 'sub'} or len(agents_by_role) != 2:
85-
raise ValueError("Expected exactly two roles: 'enz' and 'sub'",
86-
f" but got {agents_by_role.keys()}")
87-
88-
enz = agents_by_role.get('enz')
89-
sub = agents_by_role.get('sub')
90-
raw_texts = [raw_texts.get('enz'), raw_texts.get('sub')]
91-
coords = [coords.get('enz'), coords.get('sub')]
92-
annotations = {
93-
'agents': {
94-
'raw_text': raw_texts,
95-
'coords': coords
96-
}
97-
}
98-
evidence.annotations = annotations
99-
stmt = stmt_class(enz, sub, evidence=[evidence])
20+
"""Extract a statement from JSON using INDRA's built-in functionality."""
21+
try:
22+
# Use INDRA's built-in statement_from_json functionality
23+
stmt = stmt_from_json(entry)
24+
25+
# Apply grounding to agents if grounder is available
26+
if self.grounder:
27+
text = entry['evidence'][0]['text'] if entry.get('evidence') else ""
28+
self._apply_grounding(stmt, text)
29+
10030
return stmt
101-
else:
102-
assert False, "Unsupported statement type: %s" % stmt_class
31+
32+
except Exception as e:
33+
logger.warning(f"Error creating statement from JSON: {e}")
34+
raise
35+
36+
def _apply_grounding(self, stmt, context_text):
37+
"""Apply grounding to all agents in a statement."""
38+
# Get all agents from the statement
39+
agents = stmt.agent_list()
40+
41+
for agent in agents:
42+
if agent and agent.name:
43+
# Apply grounding
44+
grounding_result = self.grounder(agent.name, context_text)
45+
if grounding_result:
46+
# Update db_refs with grounding results
47+
agent.db_refs.update(grounding_result)
48+
49+
# Standardize the agent name
50+
standardize_agent_name(agent, standardize_refs=True)
10351

10452
def extract_statements(self):
10553
self.statements = []

0 commit comments

Comments (0)