Skip to content

Commit 66ba20a

Browse files
authored
Merge pull request #134 from GenomicMedLab/staging
Staging
2 parents 4456bfd + b375a20 commit 66ba20a

File tree

16 files changed

+589
-446
lines changed

16 files changed

+589
-446
lines changed

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ If you do not wish to use the default, you must set the environment variable `UT
7171
#### SeqRepo
7272
`cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
7373
74+
Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
75+
7476
From the _root_ directory:
7577
```
7678
pip install seqrepo
@@ -96,13 +98,15 @@ exit
9698

9799
![image](biomart.png)
98100

101+
Use the `TRANSCRIPT_MAPPINGS_PATH` environment variable to set the path of an existing `transcript_mapping.tsv` file. The default is `cool_seq_tool/data/transcript_mapping.tsv`.
102+
99103
#### LRG_RefSeqGene
100104

101-
`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
105+
`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene` if the environment variable `LRG_REFSEQGENE_PATH` is not set. When `LRG_REFSEQGENE_PATH` is set, `cool-seq-tool` will expect to find the LRG_RefSeqGene file at that path. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
102106

103107
#### MANE Summary Data
104108

105-
`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz`. This file is found can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
109+
`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz` if the environment variable `MANE_SUMMARY_PATH` is not set. When `MANE_SUMMARY_PATH` is set, `cool-seq-tool` will expect to find the MANE Summary Data file at that path. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
106110

107111
## Starting the UTA Tools Service Locally
108112

cool_seq_tool/__init__.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,27 @@
1414

1515
LOG_FN = "cool_seq_tool.log"
1616

17-
if "UTA_DB_URL" in environ:
18-
UTA_DB_URL = environ["UTA_DB_URL"]
19-
else:
20-
UTA_DB_URL = "postgresql://uta_admin@localhost:5433/uta/uta_20210129"
21-
22-
if "SEQREPO_DATA_PATH" in environ:
23-
SEQREPO_DATA_PATH = environ["SEQREPO_DATA_PATH"]
24-
else:
25-
SEQREPO_DATA_PATH = "/usr/local/share/seqrepo/latest"
26-
27-
TRANSCRIPT_MAPPINGS_PATH = f"{APP_ROOT}/data/transcript_mapping.tsv"
28-
29-
from cool_seq_tool.data import DataDownload # noqa: E402, I202
30-
d = DataDownload()
31-
MANE_SUMMARY_PATH = d._mane_summary_path
32-
LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
17+
UTA_DB_URL = environ.get("UTA_DB_URL",
18+
"postgresql://uta_admin@localhost:5433/uta/uta_20210129")
19+
SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
20+
"/usr/local/share/seqrepo/latest"))
21+
TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
22+
f"{APP_ROOT}/data/transcript_mapping.tsv"))
23+
24+
25+
MANE_SUMMARY_PATH = environ.get("MANE_SUMMARY_PATH")
26+
LRG_REFSEQGENE_PATH = environ.get("LRG_REFSEQGENE_PATH")
27+
if not all((MANE_SUMMARY_PATH, LRG_REFSEQGENE_PATH)):
28+
from cool_seq_tool.data import DataDownload # noqa: E402, I202
29+
d = DataDownload()
30+
31+
if not MANE_SUMMARY_PATH:
32+
MANE_SUMMARY_PATH = d._mane_summary_path
33+
34+
if not LRG_REFSEQGENE_PATH:
35+
LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
36+
MANE_SUMMARY_PATH = Path(MANE_SUMMARY_PATH)
37+
LRG_REFSEQGENE_PATH = Path(LRG_REFSEQGENE_PATH)
38+
3339

3440
from cool_seq_tool.cool_seq_tool import CoolSeqTool # noqa: E402, F401, I202

cool_seq_tool/cool_seq_tool.py

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from typing import Optional, Union, List, Tuple, Dict
44
from pathlib import Path
55

6+
from gene.query import QueryHandler as GeneQueryHandler
7+
68
from cool_seq_tool import logger
79
from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
810
from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
@@ -18,24 +20,31 @@
1820
class CoolSeqTool:
1921
"""Class to initialize data sources."""
2022

21-
def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
22-
transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
23-
lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH,
24-
mane_data_path: str = MANE_SUMMARY_PATH,
25-
db_url: str = UTA_DB_URL, db_pwd: str = "",
26-
gene_db_url: str = "", gene_db_region: str = "us-east-2"
27-
) -> None:
23+
def __init__(
24+
self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
25+
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
26+
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
27+
mane_data_path: Path = MANE_SUMMARY_PATH,
28+
db_url: str = UTA_DB_URL, db_pwd: str = "",
29+
gene_query_handler: GeneQueryHandler = None,
30+
gene_db_url: str = "", gene_db_region: str = "us-east-2"
31+
) -> None:
2832
"""Initialize CoolSeqTool class
2933
30-
:param str seqrepo_data_path: The path to the seqrepo directory.
31-
:param str transcript_file_path: The path to transcript_mappings.tsv
32-
:param str lrg_refseqgene_path: The path to LRG_RefSeqGene
33-
:param str mane_data_path: Path to RefSeq MANE summary data
34+
:param Path seqrepo_data_path: The path to the seqrepo directory.
35+
:param Path transcript_file_path: The path to transcript_mappings.tsv
36+
:param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
37+
:param Path mane_data_path: Path to RefSeq MANE summary data
3438
:param str db_url: PostgreSQL connection URL
3539
Format: `driver://user:pass@host/database/schema`
3640
:param str db_pwd: User's password for uta database
37-
:param str gene_db_url: URL to gene normalizer dynamodb
38-
:param str gene_db_region: AWS region for gene normalizer db
41+
:param GeneQueryHandler gene_query_handler: Gene normalizer query handler
42+
instance. If this is provided, will use a current instance. If this is not
43+
provided, will create a new instance.
44+
:param str gene_db_url: URL to gene normalizer dynamodb. Only used when
45+
`gene_query_handler` is `None`.
46+
:param str gene_db_region: AWS region for gene normalizer db. Only used when
47+
`gene_query_handler` is `None`.
3948
"""
4049
self.seqrepo_access = SeqRepoAccess(
4150
seqrepo_data_path=seqrepo_data_path)
@@ -45,7 +54,8 @@ def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH,
4554
self.mane_transcript_mappings = MANETranscriptMappings(
4655
mane_data_path=mane_data_path)
4756
self.uta_db = UTADatabase(db_url=db_url, db_pwd=db_pwd)
48-
gene_normalizer = GeneNormalizer(gene_db_url, gene_db_region)
57+
gene_normalizer = GeneNormalizer(gene_query_handler, gene_db_url,
58+
gene_db_region)
4959
self.alignment_mapper = AlignmentMapper(
5060
self.seqrepo_access, self.transcript_mappings, self.uta_db)
5161
self.mane_transcript = MANETranscript(
@@ -471,7 +481,7 @@ async def _set_genomic_data(self, params: Dict, strand: int,
471481
if not grch38_ac:
472482
return f"Invalid genomic accession: {params['chr']}"
473483

474-
grch38_ac = grch38_ac[0][0]
484+
grch38_ac = grch38_ac[0]
475485
if grch38_ac != params["chr"]: # params["chr"] is genomic accession
476486
# Liftover to 38
477487
descr = await self.uta_db.get_chr_assembly(params["chr"])

cool_seq_tool/data_sources/gene_normalizer.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,24 @@
1010
class GeneNormalizer:
1111
"""Gene Normalizer class for getting gene data"""
1212

13-
def __init__(self, db_url: str = "", db_region: str = "us-east-2") -> None:
13+
def __init__(
14+
self, query_handler: QueryHandler = None, db_url: str = "",
15+
db_region: str = "us-east-2"
16+
) -> None:
1417
"""Initialize gene normalizer class
1518
16-
:param str db_url: URL to gene normalizer dynamodb
17-
:param str db_region: AWS region for gene normalizer db
19+
:param QueryHandler query_handler: Gene normalizer query handler instance.
20+
If this is provided, will use a current instance. If this is not provided,
21+
will create a new instance.
22+
:param str db_url: URL to gene normalizer dynamodb. Only used when
23+
`query_handler` is `None`.
24+
:param str db_region: AWS region for gene normalizer db. Only used when
25+
`query_handler` is `None`.
1826
"""
19-
self.query_handler = QueryHandler(db_url, db_region)
27+
if query_handler:
28+
self.query_handler = query_handler
29+
else:
30+
self.query_handler = QueryHandler(db_url, db_region)
2031

2132
def get_hgnc_data(self, gene: str) -> Dict:
2233
"""Return HGNC data for a given gene

cool_seq_tool/data_sources/mane_transcript.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,7 @@ async def g_to_grch38(self, ac: str, start_pos: int,
736736

737737
newest_ac = await self.uta_db.get_newest_assembly_ac(ac)
738738
if newest_ac:
739-
ac = newest_ac[0][0]
739+
ac = newest_ac[0]
740740
if self._validate_index(ac, (start_pos, end_pos), 0):
741741
return dict(
742742
ac=ac,

cool_seq_tool/data_sources/mane_transcript_mappings.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""The module for loading MANE Transcript mappings to genes."""
2+
from pathlib import Path
23
from typing import Dict, Optional, List
34

45
import pandas as pd
@@ -9,9 +10,9 @@
910
class MANETranscriptMappings:
1011
"""The MANE Transcript mappings class."""
1112

12-
def __init__(self, mane_data_path: str = MANE_SUMMARY_PATH) -> None:
13+
def __init__(self, mane_data_path: Path = MANE_SUMMARY_PATH) -> None:
1314
"""Initialize the MANE Transcript mappings class.
14-
:param str mane_data_path: Path to RefSeq MANE summary data
15+
:param Path mane_data_path: Path to RefSeq MANE summary data
1516
"""
1617
self.mane_data_path = mane_data_path
1718
self.df = self._load_mane_transcript_data()

cool_seq_tool/data_sources/seqrepo_access.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""A module for accessing SeqRepo."""
22
from typing import Optional, List, Tuple, Union
33
from os import environ
4+
from pathlib import Path
45

56
from biocommons.seqrepo import SeqRepo
67

@@ -12,9 +13,9 @@
1213
class SeqRepoAccess:
1314
"""The SeqRepoAccess class."""
1415

15-
def __init__(self, seqrepo_data_path: str = SEQREPO_DATA_PATH) -> None:
16+
def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
1617
"""Initialize the SeqRepoAccess class.
17-
:param str seqrepo_data_path: The path to the seqrepo directory.
18+
:param Path seqrepo_data_path: The path to the seqrepo directory.
1819
"""
1920
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
2021
self.seqrepo_client = SeqRepo(seqrepo_data_path)

cool_seq_tool/data_sources/transcript_mappings.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""The module for Transcript Mappings."""
22
import csv
3+
from pathlib import Path
34
from typing import Dict, List, Optional
45

56
from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH
@@ -8,12 +9,12 @@
89
class TranscriptMappings:
910
"""The transcript mappings class."""
1011

11-
def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
12-
lrg_refseqgene_path: str = LRG_REFSEQGENE_PATH) -> None:
12+
def __init__(self, transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
13+
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH) -> None:
1314
"""Initialize the transcript mappings class.
1415
15-
:param str transcript_file_path: Path to transcript mappings file
16-
:param str lrg_refseqgene_path: Path to LRG RefSeqGene file
16+
:param Path transcript_file_path: Path to transcript mappings file
17+
:param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
1718
"""
1819
# ENSP <-> Gene Symbol
1920
self.ensembl_protein_version_for_gene_symbol: Dict[str, List[str]] = {}
@@ -51,11 +52,10 @@ def __init__(self, transcript_file_path: str = TRANSCRIPT_MAPPINGS_PATH,
5152
self._load_transcript_mappings_data(transcript_file_path)
5253
self._load_refseq_gene_symbol_data(lrg_refseqgene_path)
5354

54-
def _load_transcript_mappings_data(self,
55-
transcript_file_path: str) -> None:
55+
def _load_transcript_mappings_data(self, transcript_file_path: Path) -> None:
5656
"""Load transcript mappings file to dictionaries.
5757
58-
:param str transcript_file_path: Path to transcript mappings file
58+
:param Path transcript_file_path: Path to transcript mappings file
5959
"""
6060
with open(transcript_file_path) as file:
6161
reader = csv.DictReader(file, delimiter="\t")
@@ -96,10 +96,10 @@ def _load_transcript_mappings_data(self,
9696
self.ensp_to_enst[versioned_protein_transcript] = \
9797
versioned_transcript
9898

99-
def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: str) -> None:
99+
def _load_refseq_gene_symbol_data(self, lrg_refseqgene_path: Path) -> None:
100100
"""Load data from RefSeq Gene Symbol file to dictionaries.
101101
102-
:param str lrg_refseqgene_path: Path to LRG RefSeqGene file
102+
:param Path lrg_refseqgene_path: Path to LRG RefSeqGene file
103103
"""
104104
with open(lrg_refseqgene_path) as file:
105105
reader = csv.DictReader(file, delimiter="\t")

0 commit comments

Comments
 (0)