Merge pull request #1487 from prasham21/add-new-source

bgyori · web-flow · commit bd246413358c · 2025-12-18T22:54:21.000-05:00
Add new textToKnowledgeGraph reader source
diff --git a/README.md b/README.md
@@ -63,6 +63,7 @@ Reading systems:
 | Geneways   | [`indra.sources.geneways`](https://indra.readthedocs.io/en/latest/modules/sources/geneways/index.html) | https://www.ncbi.nlm.nih.gov/pubmed/15016385    |
 | GNBR       | [`indra.sources.gnbr`](https://indra.readthedocs.io/en/latest/modules/sources/gnbr/index.html)         | https://zenodo.org/record/3459420               |
 | SemRep     | [`indra.sources.semrep`](https://indra.readthedocs.io/en/latest/modules/sources/semrep.html)     | https://github.com/lhncbc/SemRep                |
+| textToKnowledgeGraph     | [`indra.sources.tkg`](https://indra.readthedocs.io/en/latest/modules/sources/tkg/index.html)     | https://github.com/ndexbio/llm-text-to-knowledge-graph |
 
 
 Biological pathway databases:
diff --git a/doc/installation.rst b/doc/installation.rst
@@ -208,6 +208,8 @@ of dependencies.
 +-----------------+------------------------------------------------------+
 |eidos_offline    |Offline reading with local instance of Eidos system   |
 +-----------------+------------------------------------------------------+
+|tkg              |Reading with textToKnowledgeGraph                     |
++-----------------+------------------------------------------------------+
 |geneways         |Genewayas reader input processing                     |
 +-----------------+------------------------------------------------------+
 |sofia            |SOFIA reader input processing                         |
diff --git a/doc/modules/sources/tkg/index.rst b/doc/modules/sources/tkg/index.rst
@@ -0,0 +1,19 @@
+textToKnowledgeGraph (:py:mod:`indra.sources.tkg`)
+==================================================
+
+.. automodule:: indra.sources.tkg
+    :members:
+
+
+textToKnowledgeGraph API (:py:mod:`indra.sources.tkg.api`)
+----------------------------------------------------------
+
+.. automodule:: indra.sources.tkg.api
+    :members:
+
+
+textToKnowledgeGraph Processor (:py:mod:`indra.sources.tkg.processor`)
+----------------------------------------------------------------------
+
+.. automodule:: indra.sources.tkg.processor
+    :members:
diff --git a/indra/resources/default_belief_probs.json b/indra/resources/default_belief_probs.json
@@ -34,7 +34,8 @@
     "creeds": 0.01,
     "ubibrowser": 0.01,
     "acsn": 0.01,
-    "semrep": 0.05
+    "semrep": 0.05,
+    "tkg": 0.05
   },
   "rand": {
     "eidos": 0.3,
@@ -72,6 +73,7 @@
     "ubibrowser": 0.1,
     "acsn": 0.1,
     "semrep": 0.3,
-    "wormbase": 0.1
+    "wormbase": 0.1,
+    "tkg": 0.3
   }
 }
diff --git a/indra/resources/source_info.json b/indra/resources/source_info.json
@@ -338,5 +338,15 @@
             "color": "white",
             "background-color": "#a8d7ff"
         }
+    },
+    "tkg": {
+        "name": "textToKnowledgeGraph",
+        "link": "https://github.com/ndexbio/llm-text-to-knowledge-graph",
+        "type": "reader",
+        "domain": "biology",
+        "default_style": {
+            "color": "white",
+            "background-color": "#ffb3ba"
+        }
     }
 }
diff --git a/indra/sources/bel/processor.py b/indra/sources/bel/processor.py
@@ -527,7 +527,7 @@ def get_db_refs_by_name(ns, name, node_data):
         db_refs = {'GO': go_id}
         name = go_client.get_go_label(go_id)
     elif ns in ('MESHPP', 'MESHD', 'MESH'):
-        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
+        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name, offline=True)
         if not mesh_id:
             logger.info('Could not find MESH ID from %s' % name)
             return name, None
diff --git a/indra/sources/tkg/__init__.py b/indra/sources/tkg/__init__.py
@@ -0,0 +1,20 @@
+"""
+This module implements an input API and processor for the
+textToKnowledgeGraph method which uses LLMs to extract BEL statements
+from publications:
+
+textToKnowledgeGraph: Generation of Molecular Interaction Knowledge Graphs Using
+Large Language Models for Exploration in Cytoscape
+Favour James, Christopher Churas, Dexter Pratt, Augustin Luna
+bioRxiv https://doi.org/10.1101/2025.07.17.664328
+"""
+
+from .api import *
+from .processor import TkgProcessor
+
+__all__ = [
+    "process_json_file",
+    "process_json",
+    "process_pmc",
+    "TkgProcessor",
+]
diff --git a/indra/sources/tkg/api.py b/indra/sources/tkg/api.py
@@ -0,0 +1,127 @@
+"""
+This module implements an API for the textToKnowledgeGraph
+method which extracts BEL statements from publications via an LLM.
+
+This module provides two integration modes:
+
+Offline processing
+    In this mode, a JSON file output from textToKnowledgeGraph
+    is used as the starting point from which INDRA Statements are produced.
+
+Live processing
+    If the `texttoknowledgegraph` package is installed, calls
+    the LLM extraction pipeline, processes the returned BEL relations
+    and produces INDRA Statements.
+
+Both modes produce a TkgProcessor instance containing INDRA
+Statements derived from BEL expressions.
+"""
+__all__ = ["process_json_file", "process_json", "process_pmc"]
+
+import os
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Union
+
+from indra import get_config
+from .processor import TkgProcessor
+
+logger = logging.getLogger(__name__)
+
+
+
+def process_json_file(path: Union[str, Path]):
+    """Process a single textToKnowledgeGraph JSON results file.
+
+    Parameters
+    ----------
+    path : str or Path
+        Path to a JSON results file; its content is passed to process_json.
+
+    Returns
+    -------
+    TkgProcessor
+        Processor containing the converted INDRA Statements.
+    """
+    path = Path(path)
+    logger.debug("Processing LLM-BEL results file: %s", path)
+
+    # JSON is UTF-8 by specification, so do not rely on the locale default
+    with open(path, "r", encoding="utf-8") as fh:
+        data = json.load(fh)
+    return process_json(data)
+
+
+def process_json(data: Dict):
+    """Process BEL relations returned directly from the LLM engine.
+
+    Parameters
+    ----------
+    data : dict
+        textToKnowledgeGraph output with an ``"LLM_extractions"`` list.
+
+    Returns
+    -------
+    TkgProcessor
+        Processor with INDRA Statements derived from BEL.
+    """
+    processor = TkgProcessor(data)
+    processor.extract_statements()
+    return processor
+
+
+def process_pmc(pmc_id: str, output_base_path, **kwargs):
+    """Run live BEL extraction using textToKnowledgeGraph, if installed.
+
+    Parameters
+    ----------
+    pmc_id : str
+        PMCID such as 'PMC3898398'.
+    output_base_path : str
+        Base folder passed to textToKnowledgeGraph; results are read back
+        from ``<output_base_path>/results/<pmc_id>/llm_results.json``.
+    kwargs :
+        Additional keyword arguments passed to textToKnowledgeGraph.main().
+
+    Returns
+    -------
+    TkgProcessor or None
+        Processor containing INDRA Statements derived from live BEL output,
+        or None if textToKnowledgeGraph did not report success.
+
+    Raises
+    ------
+    ImportError
+        If textToKnowledgeGraph is not installed.
+    """
+    try:
+        from textToKnowledgeGraph import main as tkg_main
+    except ImportError as err:
+        raise ImportError(
+            "The 'textToKnowledgeGraph' package is not installed. "
+            "Install it or run textToKnowledgeGraph separately to "
+            "produce output files and then use one of the functions like "
+            "process_json_file to process the outputs."
+        ) from err
+
+    api_key = get_config('OPENAI_API_KEY', failure_ok=False)
+    logger.debug("Running live textToKnowledgeGraph extraction for %s", pmc_id)
+
+    success = tkg_main(
+        api_key=api_key,
+        pmc_ids=[pmc_id],
+        upload_to_ndex=False,
+        # Note: this assumes https://github.com/ndexbio/llm-text-to-knowledge-graph/pull/27
+        # will be merged
+        output_base_path=output_base_path,
+        **kwargs,
+    )
+    if not success:
+        return None
+
+    # TKG doesn't explicitly say where the results will be put so we need to
+    # construct this path ourselves
+    output_path = os.path.join(output_base_path, 'results', pmc_id,
+                               'llm_results.json')
+    return process_json_file(output_path)
diff --git a/indra/sources/tkg/processor.py b/indra/sources/tkg/processor.py
@@ -0,0 +1,86 @@
+__all__ = ["TkgProcessor"]
+
+import re
+import logging
+from typing import Dict, List
+
+from indra.sources.bel import process_bel_stmt
+
+logger = logging.getLogger(__name__)
+
+
+class TkgProcessor:
+    """Processor extracting INDRA Statements from textToKnowledgeGraph output.
+
+    BEL is converted via process_bel_stmt; Evidence objects are left as is.
+
+    Parameters
+    ----------
+    results : Dict
+        Output data structure of textToKnowledgeGraph to be processed
+
+    Attributes
+    ----------
+    statements : List[indra.statements.Statement]
+        A list of INDRA Statements extracted from the results.
+    skipped : List[str]
+        Normalized BEL strings that failed or yielded no Statements.
+    """
+
+    def __init__(self, results):
+        self.results = results
+        self.statements = []
+        self.skipped = []
+
+    def extract_statements(self):
+        """Convert every 'bel_statement' entry of the results to Statements."""
+        total = 0
+        for extraction in self.results.get('LLM_extractions', []):
+            for entry in extraction.get('Results', []):
+                total += 1
+                bel_stmt = normalize_bel(entry['bel_statement'])
+                try:
+                    pp = process_bel_stmt(bel_stmt)
+                except Exception as e:
+                    logger.debug("Skipping BEL %r: %s", bel_stmt, e)
+                    self.skipped.append(bel_stmt)
+                    continue
+                if pp and pp.statements:
+                    self.statements += pp.statements
+                else:
+                    self.skipped.append(bel_stmt)
+
+        logger.debug(
+            "textToKnowledgeGraph processor finished: extracted=%d "
+            "skipped=%d total=%d", len(self.statements), len(self.skipped),
+            total
+        )
+
+
+# Pattern for unquoted GO names after the "GO:" prefix; the name may consist
+# of letters, digits, hyphens and whitespace
+GO_BP_PATTERN = re.compile(r'GO:([A-Za-z0-9\-\s]+)')
+
+
+def normalize_go_terms(bel: str) -> str:
+    """Quote GO names containing spaces so that PyBEL can parse them.
+
+    For instance ``GO:DNA-templated transcription`` is rewritten as
+    ``GO:"DNA-templated transcription"``; names without a space are kept.
+    """
+    def _requote(hit):
+        name = hit.group(1)
+        # Only a name with a space and without any quote gets wrapped
+        if ' ' in name and '"' not in name and "'" not in name:
+            return f'GO:"{name}"'
+        return f'GO:{name}'
+
+    return GO_BP_PATTERN.sub(_requote, bel)
+
+
+def normalize_bel(bel: str) -> str:
+    """Return the BEL string after all normalization steps are applied."""
+    # Quoting GO terms is the only step so far, since that appears to be an
+    # existing issue. Further processing steps can be chained here later.
+    normalized = normalize_go_terms(bel)
+    return normalized
diff --git a/indra/tests/test_tkg.py b/indra/tests/test_tkg.py
@@ -0,0 +1,53 @@
+import indra.statements as ist
+from indra.sources import tkg
+
+
+def assert_grounding_value_or_none(stmt):
+    """Check that every falsy grounding value of the agents is None."""
+    for agent in stmt.real_agent_list():
+        for ns, db_id in agent.db_refs.items():
+            assert db_id or db_id is None, \
+                f"Invalid grounding value {ns}={db_id}"
+
+
+TEST_JSON = {  # minimal stand-in for textToKnowledgeGraph output
+    "LLM_extractions": [
+        {
+            "Results": [
+                {
+                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1))",
+                    "evidence": "SIRT1 activates PARP1",
+                },
+                {
+                    "bel_statement": "p(HGNC:SIRT1) decreases p(HGNC:MYC)",
+                    "evidence": "SIRT1 represses MYC",
+                },
+                {  # closing parenthesis is missing: expected to be skipped
+                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1)",
+                    "evidence": "Malformed BEL",
+                },
+                {
+                    "bel_statement": "p(FPLX:ERK) directlyIncreases act(p(HGNC:PARP1))",
+                    "evidence": "ERK activates PARP1",
+                },
+            ]
+        }
+    ]
+}
+
+
+def test_tkg_processing(tmp_path):
+    """Process TEST_JSON end to end and check the resulting Statements."""
+    tp = tkg.process_json(TEST_JSON)
+    assert tp is not None
+    assert hasattr(tp, "statements")
+
+    # Three of the four BEL strings are valid; the malformed one is skipped
+    stmts = tp.statements
+    assert len(stmts) == 3
+    for stmt in stmts:
+        assert stmt.evidence, "Evidence must exist"
+        assert_grounding_value_or_none(stmt)
+
+    for stmt_type in (ist.Activation, ist.DecreaseAmount):
+        assert any(isinstance(stmt, stmt_type) for stmt in stmts)
diff --git a/indra/util/statement_presentation.py b/indra/util/statement_presentation.py
@@ -124,7 +124,7 @@ class to define a `StmtStat`.
               'ubibrowser', 'acsn', 'wormbase']
 """Database source names as they appear in the DB"""
 
-reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'isi', 'trips',
+reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'isi', 'tkg', 'trips',
                   'rlimsp', 'medscan', 'eidos', 'sparser', 'reach']
 """Reader source names as they appear in the DB"""
 
diff --git a/setup.py b/setup.py

Original file line number	Diff line number	Diff line change
`@@ -338,5 +338,15 @@`
`338`	`338`	`"color": "white",`
`339`	`339`	`"background-color": "#a8d7ff"`
`340`	`340`	`}`
	`341`	`+ },`
	`342`	`+ "tkg": {`
	`343`	`+ "name": "textToKnowledgeGraph",`
	`344`	`+ "link": "https://github.com/ndexbio/llm-text-to-knowledge-graph",`
	`345`	`+ "type": "reader",`
	`346`	`+ "domain": "biology",`
	`347`	`+ "default_style": {`
	`348`	`+ "color": "white",`
	`349`	`+ "background-color": "#ffb3ba"`
	`350`	`+ }`
`341`	`351`	`}`
`342`	`352`	`}`