Skip to content

Commit ac22fdc

Browse files
authored
Merge pull request #105 from CHERRY-ui8/feat/add-dna-rna-search
feat: Add DNA and RNA search functionality
1 parent 0bcdd3d commit ac22fdc

19 files changed

+1215
-21
lines changed

graphgen/configs/search_config.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pipeline:
2+
- name: read_step
3+
op_key: read
4+
params:
5+
input_file: resources/input_examples/search_dna_demo.jsonl # input file path; supports json, jsonl, txt, pdf. See resources/input_examples for examples
6+
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
10+
params:
11+
data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12+
ncbi_params:
13+
email: your_email@example.com # NCBI requires a contact email address; replace with your own
14+
tool: GraphGen # tool name for NCBI API
15+
use_local_blast: true # whether to use local blast for DNA search
16+
local_blast_db: /your_path/refseq_241 # path to local BLAST database (without .nhr extension)
17+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
pipeline:
2+
- name: read_step
3+
op_key: read
4+
params:
5+
input_file: resources/input_examples/search_protein_demo.jsonl # input file path; supports json, jsonl, txt, pdf. See resources/input_examples for examples
6+
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
10+
params:
11+
data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12+
uniprot_params:
13+
use_local_blast: true # whether to use local blast for uniprot search
14+
local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot
15+
# options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
pipeline:
2+
- name: read_step
3+
op_key: read
4+
params:
5+
input_file: resources/input_examples/search_rna_demo.jsonl # input file path; supports json, jsonl, txt, pdf. See resources/input_examples for examples
6+
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
10+
params:
11+
data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral
12+
rnacentral_params:
13+
use_local_blast: true # whether to use local blast for RNA search
14+
local_blast_db: /your_path/refseq_rna_241 # format: /path/to/refseq_rna_${RELEASE}
15+
# can also use DNA database with RNA sequences (if already built)
16+

graphgen/graphgen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(
4545

4646
# llm
4747
self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
48-
model_name=os.getenv("TOKENIZER_MODEL")
48+
model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
4949
)
5050

5151
self.synthesizer_llm_client: BaseLLMWrapper = (

graphgen/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
RDFReader,
2727
TXTReader,
2828
)
29+
from .searcher.db.ncbi_searcher import NCBISearch
30+
from .searcher.db.rnacentral_searcher import RNACentralSearch
2931
from .searcher.db.uniprot_searcher import UniProtSearch
3032
from .searcher.kg.wiki_search import WikiSearch
3133
from .searcher.web.bing_search import BingSearch

0 commit comments

Comments
 (0)