Skip to content

Commit bd24641

Browse files
authored
Merge pull request #1487 from prasham21/add-new-source
Add new textToKnowledgeGraph reader source
2 parents 1d86f16 + b204daa commit bd24641

File tree

12 files changed

+328
-6
lines changed

12 files changed

+328
-6
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Reading systems:
6363
| Geneways | [`indra.sources.geneways`](https://indra.readthedocs.io/en/latest/modules/sources/geneways/index.html) | https://www.ncbi.nlm.nih.gov/pubmed/15016385 |
6464
| GNBR | [`indra.sources.gnbr`](https://indra.readthedocs.io/en/latest/modules/sources/gnbr/index.html) | https://zenodo.org/record/3459420 |
6565
| SemRep | [`indra.sources.semrep`](https://indra.readthedocs.io/en/latest/modules/sources/semrep.html) | https://github.com/lhncbc/SemRep |
66+
| textToKnowledgeGraph | [`indra.sources.tkg`](https://indra.readthedocs.io/en/latest/modules/sources/tkg/index.html) | https://github.com/ndexbio/llm-text-to-knowledge-graph |
6667

6768

6869
Biological pathway databases:

doc/installation.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ of dependencies.
208208
+-----------------+------------------------------------------------------+
209209
|eidos_offline |Offline reading with local instance of Eidos system |
210210
+-----------------+------------------------------------------------------+
211+
|tkg |Reading with textToKnowledgeGraph |
212+
+-----------------+------------------------------------------------------+
211213
|geneways |Geneways reader input processing |
212214
+-----------------+------------------------------------------------------+
213215
|sofia |SOFIA reader input processing |

doc/modules/sources/tkg/index.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
textToKnowledgeGraph (:py:mod:`indra.sources.tkg`)
2+
==================================================
3+
4+
.. automodule:: indra.sources.tkg
5+
:members:
6+
7+
8+
textToKnowledgeGraph API (:py:mod:`indra.sources.tkg.api`)
9+
----------------------------------------------------------
10+
11+
.. automodule:: indra.sources.tkg.api
12+
:members:
13+
14+
15+
textToKnowledgeGraph Processor (:py:mod:`indra.sources.tkg.processor`)
16+
----------------------------------------------------------------------
17+
18+
.. automodule:: indra.sources.tkg.processor
19+
:members:

indra/resources/default_belief_probs.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434
"creeds": 0.01,
3535
"ubibrowser": 0.01,
3636
"acsn": 0.01,
37-
"semrep": 0.05
37+
"semrep": 0.05,
38+
"tkg": 0.05
3839
},
3940
"rand": {
4041
"eidos": 0.3,
@@ -72,6 +73,7 @@
7273
"ubibrowser": 0.1,
7374
"acsn": 0.1,
7475
"semrep": 0.3,
75-
"wormbase": 0.1
76+
"wormbase": 0.1,
77+
"tkg": 0.3
7678
}
7779
}

indra/resources/source_info.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,5 +338,15 @@
338338
"color": "white",
339339
"background-color": "#a8d7ff"
340340
}
341+
},
342+
"tkg": {
343+
"name": "textToKnowledgeGraph",
344+
"link": "https://github.com/ndexbio/llm-text-to-knowledge-graph",
345+
"type": "reader",
346+
"domain": "biology",
347+
"default_style": {
348+
"color": "white",
349+
"background-color": "#ffb3ba"
350+
}
341351
}
342352
}

indra/sources/bel/processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ def get_db_refs_by_name(ns, name, node_data):
527527
db_refs = {'GO': go_id}
528528
name = go_client.get_go_label(go_id)
529529
elif ns in ('MESHPP', 'MESHD', 'MESH'):
530-
mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
530+
mesh_id, mesh_name = mesh_client.get_mesh_id_name(name, offline=True)
531531
if not mesh_id:
532532
logger.info('Could not find MESH ID from %s' % name)
533533
return name, None

indra/sources/tkg/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
This module implements an input API and processor for the
3+
textToKnowledgeGraph method which uses LLMs to extract BEL statements
4+
from publications:
5+
6+
textToKnowledgeGraph: Generation of Molecular Interaction Knowledge Graphs Using
7+
Large Language Models for Exploration in Cytoscape
8+
Favour James, Christopher Churas, Dexter Pratt, Augustin Luna
9+
bioRxiv https://doi.org/10.1101/2025.07.17.664328
10+
"""
11+
12+
from .api import *
13+
from .processor import TkgProcessor
14+
15+
__all__ = [
16+
"process_json_file",
17+
"process_json",
18+
"process_pmc",
19+
"TkgProcessor",
20+
]

indra/sources/tkg/api.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""
This module implements an API for the textToKnowledgeGraph
method which extracts BEL statements from publications via an LLM.

This module provides two integration modes:

Offline processing
    In this mode, a JSON file output from textToKnowledgeGraph
    is used as the starting point from which INDRA Statements are produced.

Live processing
    If the `texttoknowledgegraph` package is installed, calls
    the LLM extraction pipeline, processes the returned BEL relations
    and produces INDRA Statements.

Both modes produce a TkgProcessor instance containing INDRA
Statements derived from BEL expressions.
"""
# NOTE: the docstring above has to stay the first statement of the module.
# A string literal placed after ``__all__`` is a plain expression statement,
# so ``__doc__`` would be None and the generated API docs would be empty.
__all__ = ["process_json_file", "process_json", "process_pmc"]
20+
21+
import os
22+
import json
23+
import logging
24+
from pathlib import Path
25+
from typing import Dict, Union
26+
27+
from indra import get_config
28+
from .processor import TkgProcessor
29+
30+
logger = logging.getLogger(__name__)
31+
32+
33+
34+
def process_json_file(path: Union[str, Path]):
    """Process a single textToKnowledgeGraph JSON results file.

    Parameters
    ----------
    path : str or Path
        Path to a JSON file produced by textToKnowledgeGraph. The file is
        read as UTF-8 text and its parsed content is handed to
        ``process_json``.

    Returns
    -------
    TkgProcessor
        Processor containing the converted INDRA Statements.
    """
    path = Path(path)
    logger.debug("Processing LLM-BEL results file: %s", path)

    # JSON text is UTF-8 by specification. Without an explicit encoding the
    # platform default is used, which mis-decodes non-ASCII evidence text on
    # systems whose locale encoding is not UTF-8.
    with path.open("r", encoding="utf-8") as fh:
        data = json.load(fh)

    return process_json(data)
54+
55+
56+
def process_json(data: Dict):
    """Process textToKnowledgeGraph output that is already loaded in memory.

    Parameters
    ----------
    data : dict
        Parsed textToKnowledgeGraph output. BEL statements are read from
        the ``"bel_statement"`` field of the entries listed under
        ``"Results"`` in each element of ``"LLM_extractions"``.

    Returns
    -------
    TkgProcessor
        Processor with INDRA Statements derived from BEL. Statement
        extraction has already been run on the returned processor.
    """
    processor = TkgProcessor(data)
    processor.extract_statements()
    return processor
72+
73+
74+
def process_pmc(pmc_id: str, output_base_path, **kwargs):
    """Run live BEL extraction using textToKnowledgeGraph, if installed.

    Parameters
    ----------
    pmc_id : str
        PMCID such as 'PMC3898398'.
    output_base_path : str or path-like
        Base folder passed on to textToKnowledgeGraph for its outputs.
        After a successful run the results are read back from
        ``<output_base_path>/results/<pmc_id>/llm_results.json``.
    kwargs :
        Additional keyword arguments passed to textToKnowledgeGraph.main().

    Returns
    -------
    TkgProcessor or None
        Processor containing INDRA Statements derived from live BEL output,
        or None if textToKnowledgeGraph did not report success.

    Raises
    ------
    ImportError
        If textToKnowledgeGraph is not installed.
    ValueError
        If the results file written by textToKnowledgeGraph is not valid
        JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    try:
        from textToKnowledgeGraph import main as tkg_main
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible.
        raise ImportError(
            "The 'textToKnowledgeGraph' package is not installed. "
            "Install it or run textToKnowledgeGraph separately to "
            "produce output files and then use one of the functions like "
            "process_json_file to process the outputs."
        ) from err

    # NOTE(review): failure_ok=False presumably makes get_config raise when
    # the key is not configured -- confirm against indra.get_config.
    api_key = get_config('OPENAI_API_KEY', failure_ok=False)

    logger.debug("Running live textToKnowledgeGraph extraction for %s", pmc_id)

    success = tkg_main(
        api_key=api_key,
        pmc_ids=[pmc_id],
        upload_to_ndex=False,
        # Note: this assumes https://github.com/ndexbio/llm-text-to-knowledge-graph/pull/27
        # will be merged
        output_base_path=output_base_path,
        **kwargs,
    )

    if not success:
        # Keep the None return for callers, but do not fail silently.
        logger.warning(
            "textToKnowledgeGraph extraction did not succeed for %s", pmc_id
        )
        return None

    # TKG doesn't explicitly say where the results will be put so we need to
    # construct this path ourselves
    output_path = os.path.join(output_base_path, 'results', pmc_id,
                               'llm_results.json')
    return process_json_file(output_path)

indra/sources/tkg/processor.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
__all__ = ["TkgProcessor"]
2+
3+
import re
4+
import logging
5+
from typing import Dict, List
6+
7+
from indra.sources.bel import process_bel_stmt
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
class TkgProcessor:
    """Processor extracting INDRA Statements from textToKnowledgeGraph output.

    Each BEL statement string found in the results is normalized and then
    converted into INDRA Statements with ``process_bel_stmt`` from the INDRA
    BEL input API. Only the ``bel_statement`` field of each entry is read
    here; other entry fields are not attached to the resulting Statements
    by this class.

    Parameters
    ----------
    results : Dict
        Output data structure of textToKnowledgeGraph to be processed

    Attributes
    ----------
    results : Dict
        The textToKnowledgeGraph output passed in at construction.
    statements : List[indra.statements.Statement]
        A list of INDRA Statements extracted from the results.
    skipped : List[str]
        Normalized BEL statement strings that either raised during
        processing or produced no INDRA Statements.
    """

    def __init__(self, results):
        self.results = results
        self.statements = []
        self.skipped = []

    def extract_statements(self):
        """Run BEL to INDRA pipeline for all entries in llm_results.

        Walks every entry under ``Results`` of every element of
        ``LLM_extractions``, appends the Statements obtained from each
        entry's ``bel_statement`` to ``self.statements`` and records the
        statements that could not be converted in ``self.skipped``.
        """
        total = 0
        for extraction in self.results.get('LLM_extractions', []):
            for entry in extraction.get('Results', []):
                total += 1
                bel_stmt = normalize_bel(entry['bel_statement'])
                try:
                    pp = process_bel_stmt(bel_stmt)
                except Exception as e:
                    # Best effort: a single statement that cannot be
                    # processed must not abort the whole batch, but leave
                    # a trace of why it was dropped.
                    logger.debug("Skipping BEL statement %r: %s",
                                 bel_stmt, e)
                    self.skipped.append(bel_stmt)
                    continue
                if pp and pp.statements:
                    self.statements += pp.statements
                else:
                    self.skipped.append(bel_stmt)

        # total counts the BEL entries seen; len(self.results) would only
        # count the top-level keys of the results dict.
        logger.debug(
            "textToKnowledgeGraph processor finished: extracted=%d "
            "skipped=%d total=%d", len(self.statements), len(self.skipped),
            total
        )
58+
59+
60+
# Fix GO Biological Process names that contain spaces
GO_BP_PATTERN = re.compile(r'GO:([A-Za-z0-9\-\s]+)')


def normalize_go_terms(bel: str) -> str:
    """Quote unquoted, multi-word GO names so PyBEL can parse them.

    For example::

        GO:DNA-templated transcription

    becomes::

        GO:"DNA-templated transcription"

    Names that are a single word, and names that are already quoted (the
    pattern does not match a quote right after ``GO:``), are left untouched.

    Parameters
    ----------
    bel : str
        A BEL statement string.

    Returns
    -------
    str
        The BEL statement with multi-word GO names wrapped in double quotes.
    """
    def replacer(match):
        raw = match.group(1)
        # The pattern allows whitespace, so the capture can carry blanks
        # that surround the name (e.g. "apoptosis " in "GO:apoptosis )").
        # Those must not decide whether to quote, nor end up in the quotes.
        term = raw.strip()
        if len(term.split()) < 2:
            # Empty or single-word name: nothing to quote.
            return match.group(0)
        trailing = raw[len(raw.rstrip()):]
        return f'GO:"{term}"{trailing}'

    return GO_BP_PATTERN.sub(replacer, bel)
79+
80+
81+
def normalize_bel(bel: str) -> str:
    """Return the BEL string after running all normalization steps on it."""
    # Quoting of GO terms is the only step so far, since that appears to be
    # an existing issue; further steps can be chained here later.
    return normalize_go_terms(bel)

indra/tests/test_tkg.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import indra.statements as ist
2+
from indra.sources import tkg
3+
4+
5+
def assert_grounding_value_or_none(stmt):
    """Check that any falsy grounding value (e.g. '' or []) is exactly None."""
    for agent in stmt.real_agent_list():
        for db_ns, db_id in agent.db_refs.items():
            if db_id:
                # Non-empty grounding values are always acceptable.
                continue
            assert db_id is None, f"Invalid grounding value {db_ns}={db_id}"
11+
12+
13+
# Minimal stand-in for textToKnowledgeGraph output: one extraction whose
# "Results" list holds four BEL statements with their evidence text.
TEST_JSON = {
    "LLM_extractions": [
        {
            "Results": [
                {
                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1))",
                    "evidence": "SIRT1 activates PARP1",
                },
                {
                    "bel_statement": "p(HGNC:SIRT1) decreases p(HGNC:MYC)",
                    "evidence": "SIRT1 represses MYC",
                },
                {
                    # Deliberately malformed: the closing parenthesis of
                    # act(...) is missing.
                    "bel_statement": "p(HGNC:SIRT1) increases act(p(HGNC:PARP1)",
                    "evidence": "Malformed BEL",
                },
                {
                    "bel_statement": "p(FPLX:ERK) directlyIncreases act(p(HGNC:PARP1))",
                    "evidence": "ERK activates PARP1",
                },
            ]
        }
    ]
}
37+
38+
39+
def test_tkg_processing(tmp_path):
    """Process the in-memory fixture and check the extracted Statements."""
    processor = tkg.process_json(TEST_JSON)

    assert processor is not None
    assert hasattr(processor, "statements")

    extracted = processor.statements

    # Three of the four fixture entries are valid BEL; the malformed one
    # is expected to be skipped.
    assert len(extracted) == 3

    for stmt in extracted:
        assert stmt.evidence, "Evidence must exist"
        assert_grounding_value_or_none(stmt)

    assert any(isinstance(stmt, ist.Activation) for stmt in extracted)
    assert any(isinstance(stmt, ist.DecreaseAmount) for stmt in extracted)

0 commit comments

Comments
 (0)