Skip to content

Commit b5b5450

Browse files
feat: add local blast
1 parent ed2f0a0 commit b5b5450

File tree

1 file changed

+89
-27
lines changed

1 file changed

+89
-27
lines changed

graphgen/models/searcher/db/uniprot_searcher.py

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
import os
12
import re
3+
import subprocess
4+
import tempfile
25
from io import StringIO
36
from typing import Dict, Optional
47

@@ -24,6 +27,14 @@ class UniProtSearch(BaseSearcher):
2427
3) Search with FASTA sequence (BLAST searcher).
2528
"""
2629

30+
def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"):
31+
super().__init__()
32+
self.use_local_blast = use_local_blast
33+
self.local_blast_db = local_blast_db
34+
if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"):
35+
logger.error("Local BLAST database files not found. Please check the path.")
36+
self.use_local_blast = False
37+
2738
def get_by_accession(self, accession: str) -> Optional[dict]:
2839
try:
2940
handle = ExPASy.get_sprot_raw(accession)
@@ -101,38 +112,86 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
101112
logger.error("Empty FASTA sequence provided.")
102113
return None
103114

104-
# UniProtKB/Swiss-Prot BLAST API
105-
try:
106-
logger.debug("Performing BLAST searcher for the given sequence: %s", seq)
107-
result_handle = NCBIWWW.qblast(
108-
program="blastp",
109-
database="swissprot",
110-
sequence=seq,
111-
hitlist_size=1,
112-
expect=threshold,
113-
)
114-
blast_record = NCBIXML.read(result_handle)
115-
except RequestException:
116-
raise
117-
except Exception as e: # pylint: disable=broad-except
118-
logger.error("BLAST searcher failed: %s", e)
119-
return None
115+
accession = None
116+
if self.use_local_blast:
117+
accession = self._local_blast(seq, threshold)
118+
if accession:
119+
logger.debug("Local BLAST found accession: %s", accession)
120+
121+
if not accession:
122+
logger.debug("Falling back to NCBIWWW.qblast.")
123+
124+
# UniProtKB/Swiss-Prot BLAST API
125+
try:
126+
logger.debug(
127+
"Performing BLAST searcher for the given sequence: %s", seq
128+
)
129+
result_handle = NCBIWWW.qblast(
130+
program="blastp",
131+
database="swissprot",
132+
sequence=seq,
133+
hitlist_size=1,
134+
expect=threshold,
135+
)
136+
blast_record = NCBIXML.read(result_handle)
137+
except RequestException:
138+
raise
139+
except Exception as e: # pylint: disable=broad-except
140+
logger.error("BLAST searcher failed: %s", e)
141+
return None
120142

121-
if not blast_record.alignments:
122-
logger.info("No BLAST hits found for the given sequence.")
123-
return None
143+
if not blast_record.alignments:
144+
logger.info("No BLAST hits found for the given sequence.")
145+
return None
124146

125-
best_alignment = blast_record.alignments[0]
126-
best_hsp = best_alignment.hsps[0]
127-
if best_hsp.expect > threshold:
128-
logger.info("No BLAST hits below the threshold E-value.")
129-
return None
130-
hit_id = best_alignment.hit_id
147+
best_alignment = blast_record.alignments[0]
148+
best_hsp = best_alignment.hsps[0]
149+
if best_hsp.expect > threshold:
150+
logger.info("No BLAST hits below the threshold E-value.")
151+
return None
152+
hit_id = best_alignment.hit_id
131153

132-
# like sp|P01308.1|INS_HUMAN
133-
accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
154+
# like sp|P01308.1|INS_HUMAN
155+
accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
134156
return self.get_by_accession(accession)
135157

158+
def _local_blast(self, seq: str, threshold: float) -> Optional[str]:
159+
"""
160+
Perform local BLAST search using local BLAST database.
161+
:param seq: The protein sequence.
162+
:param threshold: E-value threshold for BLAST searcher.
163+
:return: The accession number of the best hit or None if not found.
164+
"""
165+
try:
166+
with tempfile.NamedTemporaryFile(
167+
mode="w+", suffix=".fa", delete=False
168+
) as tmp:
169+
tmp.write(f">query\n{seq}\n")
170+
tmp_name = tmp.name
171+
172+
cmd = [
173+
"blastp",
174+
"-db",
175+
self.local_blast_db,
176+
"-query",
177+
tmp_name,
178+
"-evalue",
179+
str(threshold),
180+
"-max_target_seqs",
181+
"1",
182+
"-outfmt",
183+
"6 sacc", # only return accession
184+
]
185+
logger.debug("Running local blastp: %s", " ".join(cmd))
186+
out = subprocess.check_output(cmd, text=True).strip()
187+
os.remove(tmp_name)
188+
if out:
189+
return out.split("\n", maxsplit=1)[0]
190+
return None
191+
except Exception as exc: # pylint: disable=broad-except
192+
logger.error("Local blastp failed: %s", exc)
193+
return None
194+
136195
@retry(
137196
stop=stop_after_attempt(5),
138197
wait=wait_exponential(multiplier=1, min=4, max=10),
@@ -173,3 +232,6 @@ async def search(
173232
if result:
174233
result["_search_query"] = query
175234
return result
235+
236+
237+
# TODO: use local UniProt database for large-scale searchs

0 commit comments

Comments
 (0)