Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f22d309
Add LlmBelProcessor to convert LLM BEL records into INDRA Statements
Dec 9, 2025
efeca30
Add V1 offline BEL ingestion test validating malformed BEL skipping a…
Dec 11, 2025
7380db4
Remove deprecated process_text import and align __all__ with actual p…
Dec 11, 2025
7996d7c
Fix BEL ingestion: correct prepare_bel_for_parsing unpacking and hand…
Dec 11, 2025
09dd248
Update LlmBelProcessor to support multi-statement BEL parsing and con…
Dec 11, 2025
cb8df5f
Add unified offline (V1) and mock live (V2) tests for LLM-BEL ingesti…
Dec 12, 2025
ebd8984
Add Sphinx documentation for new LLM-BEL source, API, processor, and …
Dec 12, 2025
c8610c7
Add prior belief probabilities for new LLM-BEL source
Dec 12, 2025
0ccba06
Register LLM-BEL as a new reader source with styling metadata
Dec 12, 2025
ff59ead
Register llm_bel in statement presentation ordering for HTML output
Dec 12, 2025
314218e
Include llm_bel package to ensure installation of new LLM-BEL reader …
Dec 12, 2025
e4ea8ca
Begin reimplementing processor and API
bgyori Dec 16, 2025
a2a2c7e
Remove more unnecessary code
bgyori Dec 16, 2025
50c7d9a
Rename module to TKG to fit actual tool name
bgyori Dec 17, 2025
3734aa4
Rename processor to match module name
bgyori Dec 17, 2025
341d484
Fix offline test
bgyori Dec 17, 2025
76f8a00
Fix import and get API key from config/env
bgyori Dec 17, 2025
84c32a3
Fix TKG integration
bgyori Dec 17, 2025
7e8b49e
Avoid MeSH web service call
bgyori Dec 18, 2025
b1ca3c0
Simplify processor and normalization
bgyori Dec 18, 2025
a6b9306
Clean up useless tests
bgyori Dec 18, 2025
497dfe2
Add to README
bgyori Dec 18, 2025
27a9267
Add extra dependencies for tkg
bgyori Dec 18, 2025
2046065
Remove ndex2 version constraint
bgyori Dec 18, 2025
b204daa
Only install texttoknowledgegraph on 3.9+
bgyori Dec 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Reading systems:
| Geneways | [`indra.sources.geneways`](https://indra.readthedocs.io/en/latest/modules/sources/geneways/index.html) | https://www.ncbi.nlm.nih.gov/pubmed/15016385 |
| GNBR | [`indra.sources.gnbr`](https://indra.readthedocs.io/en/latest/modules/sources/gnbr/index.html) | https://zenodo.org/record/3459420 |
| SemRep | [`indra.sources.semrep`](https://indra.readthedocs.io/en/latest/modules/sources/semrep.html) | https://github.com/lhncbc/SemRep |
| textToKnowledgeGraph | [`indra.sources.tkg`](https://indra.readthedocs.io/en/latest/modules/sources/tkg.html) | https://github.com/ndexbio/llm-text-to-knowledge-graph |


Biological pathway databases:
Expand Down
2 changes: 2 additions & 0 deletions doc/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ of dependencies.
+-----------------+------------------------------------------------------+
|eidos_offline |Offline reading with local instance of Eidos system |
+-----------------+------------------------------------------------------+
|tkg |Reading with textToKnowledgeGraph |
+-----------------+------------------------------------------------------+
|geneways |Geneways reader input processing |
+-----------------+------------------------------------------------------+
|sofia |SOFIA reader input processing |
Expand Down
19 changes: 19 additions & 0 deletions doc/modules/sources/tkg/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
textToKnowledgeGraph (:py:mod:`indra.sources.tkg`)
==================================================

.. automodule:: indra.sources.tkg
:members:


textToKnowledgeGraph API (:py:mod:`indra.sources.tkg.api`)
----------------------------------------------------------

.. automodule:: indra.sources.tkg.api
:members:


textToKnowledgeGraph Processor (:py:mod:`indra.sources.tkg.processor`)
----------------------------------------------------------------------

.. automodule:: indra.sources.tkg.processor
:members:
6 changes: 4 additions & 2 deletions indra/resources/default_belief_probs.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
"creeds": 0.01,
"ubibrowser": 0.01,
"acsn": 0.01,
"semrep": 0.05
"semrep": 0.05,
"tkg": 0.05
},
"rand": {
"eidos": 0.3,
Expand Down Expand Up @@ -72,6 +73,7 @@
"ubibrowser": 0.1,
"acsn": 0.1,
"semrep": 0.3,
"wormbase": 0.1
"wormbase": 0.1,
"tkg": 0.3
}
}
10 changes: 10 additions & 0 deletions indra/resources/source_info.json
Original file line number Diff line number Diff line change
Expand Up @@ -338,5 +338,15 @@
"color": "white",
"background-color": "#a8d7ff"
}
},
"tkg": {
"name": "textToKnowledgeGraph",
"link": "https://github.com/ndexbio/llm-text-to-knowledge-graph",
"type": "reader",
"domain": "biology",
"default_style": {
"color": "white",
"background-color": "#ffb3ba"
}
}
}
2 changes: 1 addition & 1 deletion indra/sources/bel/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def get_db_refs_by_name(ns, name, node_data):
db_refs = {'GO': go_id}
name = go_client.get_go_label(go_id)
elif ns in ('MESHPP', 'MESHD', 'MESH'):
mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
mesh_id, mesh_name = mesh_client.get_mesh_id_name(name, offline=True)
if not mesh_id:
logger.info('Could not find MESH ID from %s' % name)
return name, None
Expand Down
20 changes: 20 additions & 0 deletions indra/sources/tkg/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""
This module implements an input API and processor for the
textToKnowledgeGraph method which uses LLMs to extract BEL statements
from publications:

textToKnowledgeGraph: Generation of Molecular Interaction Knowledge Graphs Using
Large Language Models for Exploration in Cytoscape
Favour James, Christopher Churas, Dexter Pratt, Augustin Luna
bioRxiv https://doi.org/10.1101/2025.07.17.664328
"""

from .api import *
from .processor import TkgProcessor

__all__ = [
"process_json_file",
"process_json",
"process_pmc",
"TkgProcessor",
]
127 changes: 127 additions & 0 deletions indra/sources/tkg/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""
This module implements an API for the textToKnowledgeGraph
method which extracts BEL statements from publications via an LLM.

This module provides two integration modes:

Offline processing
    In this mode, a JSON file output from textToKnowledgeGraph
    is used as the starting point from which INDRA Statements are produced.

Live processing
    If the `texttoknowledgegraph` package is installed, calls
    the LLM extraction pipeline, processes the returned BEL relations
    and produces INDRA Statements.

Both modes produce a TkgProcessor instance containing INDRA
Statements derived from BEL expressions.
"""
# NOTE: the docstring has to be the first statement of the module to become
# the module's __doc__ (and thus be picked up by Sphinx automodule), so
# __all__ is declared after it.
__all__ = ["process_json_file", "process_json", "process_pmc"]

import os
import json
import logging
from pathlib import Path
from typing import Dict, Union

from indra import get_config
from .processor import TkgProcessor

logger = logging.getLogger(__name__)



def process_json_file(path: Union[str, Path]) -> "TkgProcessor":
    """Process a single textToKnowledgeGraph JSON results file.

    Parameters
    ----------
    path : str or Path
        Path to a JSON file containing BEL relations.

    Returns
    -------
    TkgProcessor
        Processor containing the converted INDRA Statements.
    """
    path = Path(path)
    logger.debug("Processing textToKnowledgeGraph results file: %s", path)

    # JSON is UTF-8 by specification, so decode explicitly instead of relying
    # on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    return process_json(data)


def process_json(data: Dict):
    """Turn in-memory textToKnowledgeGraph output into INDRA Statements.

    Parameters
    ----------
    data : dict
        Output data structure of textToKnowledgeGraph. The processor looks
        for BEL statements under its ``"LLM_extractions"`` key.

    Returns
    -------
    TkgProcessor
        Processor on which statement extraction has already been run.
    """
    tp = TkgProcessor(data)
    tp.extract_statements()
    return tp


def process_pmc(pmc_id: str, output_base_path, **kwargs):
    """Run live BEL extraction using textToKnowledgeGraph, if installed.

    The OpenAI API key handed to textToKnowledgeGraph is looked up via
    ``get_config('OPENAI_API_KEY', failure_ok=False)``.

    Parameters
    ----------
    pmc_id : str
        PMCID such as 'PMC3898398'.
    output_base_path : str or path-like
        Base folder passed to textToKnowledgeGraph for its outputs. On
        success the results are read back from
        ``<output_base_path>/results/<pmc_id>/llm_results.json``.
    kwargs :
        Additional keyword arguments passed to textToKnowledgeGraph.main().

    Returns
    -------
    TkgProcessor or None
        Processor containing INDRA Statements derived from live BEL output,
        or None if textToKnowledgeGraph did not report success.

    Raises
    ------
    ImportError
        If textToKnowledgeGraph is not installed.
    """
    try:
        from textToKnowledgeGraph import main as tkg_main
    except ImportError as err:
        # Chain the original error so the real import failure stays visible
        raise ImportError(
            "The 'textToKnowledgeGraph' package is not installed. "
            "Install it or run textToKnowledgeGraph separately to "
            "produce output files and then use one of the functions like "
            "process_json_file to process the outputs."
        ) from err

    api_key = get_config('OPENAI_API_KEY', failure_ok=False)

    logger.debug("Running live textToKnowledgeGraph extraction for %s", pmc_id)

    success = tkg_main(
        api_key=api_key,
        pmc_ids=[pmc_id],
        upload_to_ndex=False,
        # Note: this assumes https://github.com/ndexbio/llm-text-to-knowledge-graph/pull/27
        # will be merged
        output_base_path=output_base_path,
        **kwargs,
    )

    if not success:
        # Keep returning None for backward compatibility, but don't fail
        # silently.
        logger.warning("textToKnowledgeGraph extraction did not succeed "
                       "for %s", pmc_id)
        return None

    # TKG doesn't explicitly say where the results will be put so we need to
    # construct this path ourselves
    output_path = os.path.join(output_base_path, 'results', pmc_id,
                               'llm_results.json')

    return process_json_file(output_path)
86 changes: 86 additions & 0 deletions indra/sources/tkg/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
__all__ = ["TkgProcessor"]

import re
import logging
from typing import Dict, List

from indra.sources.bel import process_bel_stmt

logger = logging.getLogger(__name__)


class TkgProcessor:
    """Processor extracting INDRA Statements from textToKnowledgeGraph output.

    Each BEL statement found in the results is normalized and handed to the
    INDRA BEL source (``process_bel_stmt``) to obtain INDRA Statements. BEL
    statements that cannot be processed are collected in ``skipped`` instead
    of aborting the run.

    Note that the free-text ``evidence`` field of the result entries is not
    currently attached to the produced Statements by this processor.

    Parameters
    ----------
    results : Dict
        Output data structure of textToKnowledgeGraph to be processed

    Attributes
    ----------
    results : Dict
        The input data structure as passed to the constructor.
    statements : List[indra.statements.Statement]
        A list of INDRA Statements extracted from the results.
    skipped : List
        The (normalized) BEL statements that raised an error or yielded no
        INDRA Statements. Entries without a BEL statement are recorded with
        the missing/empty value they had.
    """

    def __init__(self, results):
        self.results = results
        self.statements = []
        self.skipped = []

    def extract_statements(self):
        """Run BEL to INDRA pipeline for all entries in the results.

        Iterates over ``results['LLM_extractions'][*]['Results'][*]``,
        appends the Statements obtained for each ``bel_statement`` to
        ``self.statements`` and records unprocessable ones in
        ``self.skipped``.
        """
        total = 0
        extractions = self.results.get('LLM_extractions') or []
        for extraction in extractions:
            for entry in extraction.get('Results') or []:
                total += 1
                raw_bel_stmt = entry.get('bel_statement')
                if not raw_bel_stmt:
                    # LLM output is not guaranteed to be complete; skip the
                    # entry rather than failing the whole extraction.
                    logger.debug("Skipping entry without BEL statement: %s",
                                 entry)
                    self.skipped.append(raw_bel_stmt)
                    continue
                bel_stmt = normalize_bel(raw_bel_stmt)
                try:
                    pp = process_bel_stmt(bel_stmt)
                # Deliberately broad: malformed BEL must never abort the
                # run, and the exceptions raised by the BEL pipeline are
                # not enumerated here.
                except Exception as e:
                    logger.debug("Skipping BEL statement %r: %s",
                                 bel_stmt, e)
                    self.skipped.append(bel_stmt)
                    continue
                if pp and pp.statements:
                    self.statements += pp.statements
                else:
                    self.skipped.append(bel_stmt)

        # total counts the entries seen in this call, not the number of
        # top-level keys of the results dictionary.
        logger.debug(
            "textToKnowledgeGraph processor finished: extracted=%d "
            "skipped=%d total=%d", len(self.statements), len(self.skipped),
            total
        )


# Fix GO Biological Process names that contain spaces.
# The character class has no quote characters, so an already quoted term
# (GO:"...") is never matched and is left untouched.
GO_BP_PATTERN = re.compile(r'GO:([A-Za-z0-9\-\s]+)')


def normalize_go_terms(bel: str) -> str:
    """Quote GO terms that contain spaces.

    Normalize GO terms like:
        GO:DNA-templated transcription
    into:
        GO:"DNA-templated transcription"
    so PyBEL can parse them.

    Parameters
    ----------
    bel : str
        A BEL statement string.

    Returns
    -------
    str
        The BEL statement with multi-word GO terms wrapped in double quotes.
        Terms without spaces and already quoted terms are returned as is.
    """
    def replacer(match):
        content = match.group(1)
        # The pattern greedily swallows whitespace that follows the term;
        # keep that whitespace outside of the quotes.
        term = content.rstrip()
        trailing = content[len(term):]
        if ' ' not in term:
            # Single-word term (or only whitespace): nothing to quote
            return match.group(0)
        return f'GO:"{term}"{trailing}'

    return GO_BP_PATTERN.sub(replacer, bel)


def normalize_bel(bel: str) -> str:
    """Return *bel* with every normalization step applied.

    Currently the only step is GO term normalization, which addresses an
    issue seen in existing output; further steps can be chained here later.
    """
    return normalize_go_terms(bel)
53 changes: 53 additions & 0 deletions indra/tests/test_tkg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import indra.statements as ist
from indra.sources import tkg


def assert_grounding_value_or_none(stmt):
    """Assert that every falsy grounding value of the real agents is None.

    Empty values such as '' or [] in an agent's db_refs fail the assertion.
    """
    falsy_refs = (
        (k, v)
        for agent in stmt.real_agent_list()
        for k, v in agent.db_refs.items()
        if not v
    )
    for k, v in falsy_refs:
        assert v is None, f"Invalid grounding value {k}={v}"


# Minimal textToKnowledgeGraph-style payload used by the test below:
# three well-formed BEL statements and one deliberately malformed one.
TEST_JSON = {
    "LLM_extractions": [
        {
            "Results": [
                {
                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1))",
                    "evidence": "SIRT1 activates PARP1",
                },
                {
                    "bel_statement": "p(HGNC:SIRT1) decreases p(HGNC:MYC)",
                    "evidence": "SIRT1 represses MYC",
                },
                # Malformed on purpose: the closing parenthesis of act(...)
                # is missing, so this entry is expected to be skipped.
                {
                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1)",
                    "evidence": "Malformed BEL",
                },
                {
                    "bel_statement": "p(FPLX:ERK) directlyIncreases act(p(HGNC:PARP1))",
                    "evidence": "ERK activates PARP1",
                },
            ]
        }
    ]
}


def test_tkg_processing(tmp_path):
    """Process TEST_JSON and check the resulting Statements.

    The three well-formed BEL entries are expected to yield three
    Statements, each with evidence and clean groundings, including an
    Activation and a DecreaseAmount.
    """
    proc = tkg.process_json(TEST_JSON)

    assert proc is not None
    assert hasattr(proc, "statements")

    stmts = proc.statements

    # Expect 3 valid BELs (invalid BEL should be skipped)
    assert len(stmts) == 3

    for stmt in stmts:
        assert stmt.evidence, "Evidence must exist"
        assert_grounding_value_or_none(stmt)

    # Both statement types must be represented among the results
    for stmt_type in (ist.Activation, ist.DecreaseAmount):
        assert any(isinstance(s, stmt_type) for s in stmts)
2 changes: 1 addition & 1 deletion indra/util/statement_presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class to define a `StmtStat`.
'ubibrowser', 'acsn', 'wormbase']
"""Database source names as they appear in the DB"""

reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'isi', 'trips',
reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'isi', 'tkg', 'trips',
'rlimsp', 'medscan', 'eidos', 'sparser', 'reach']
"""Reader source names as they appear in the DB"""

Expand Down
Loading