Skip to content

Commit c8126d9

Browse files
authored
Merge pull request #142 from GenomicMedLab/staging
Staging
2 parents 294a2e6 + 437a869 commit c8126d9

File tree

15 files changed

+208
-253
lines changed

15 files changed

+208
-253
lines changed

Pipfile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ pydantic = "*"
1515
fastapi = "*"
1616
uvicorn = "*"
1717
gene-normalizer = "*"
18+
"ga4gh.vrs" = "*"
1819

1920
[dev-packages]
2021
cool_seq_tool = {editable = true, path = "."}
@@ -25,11 +26,9 @@ flake8-docstrings = "*"
2526
flake8-annotations = "*"
2627
flake8-quotes = "*"
2728
flake8-import-order = "*"
28-
coverage = "*"
2929
pytest-cov = "*"
30-
coveralls = "*"
31-
jupyterlab = "*"
3230
pytest-asyncio = "==0.18.3"
3331
ipython = "*"
32+
ipykernel = "*"
3433
psycopg2-binary = "*"
3534
mock = "*"

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The **cool-seq-tool** provides:
1212
### pip
1313

1414
```commandline
15-
pip install cool-seq-tool
15+
pip install "cool-seq-tool[dev,tests]"
1616
```
1717

1818
### Development
@@ -30,7 +30,7 @@ Install backend dependencies and enter Pipenv environment:
3030

3131
```commandline
3232
pipenv shell
33-
pipenv lock && pipenv sync
33+
pipenv update
3434
pipenv install --dev
3535
```
3636

@@ -71,7 +71,7 @@ If you do not wish to use the default, you must set the environment variable `UT
7171
#### SeqRepo
7272
`cool-seq-tool` relies on [seqrepo](https://github.com/biocommons/biocommons.seqrepo), which you must download yourself.
7373
74-
Use the `SEQREPO_DATA_PATH` environment variable to set the path of an already existing SeqRepo directory. The default is `/usr/local/share/seqrepo/latest`.
74+
Set the `SEQREPO_ROOT_DIR` environment variable to the path of an existing SeqRepo directory. If it is not set, the default is `/usr/local/share/seqrepo/latest`.
7575
7676
From the _root_ directory:
7777
```

cool_seq_tool/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616

1717
UTA_DB_URL = environ.get("UTA_DB_URL",
1818
"postgresql://uta_admin@localhost:5433/uta/uta_20210129")
19-
SEQREPO_DATA_PATH = Path(environ.get("SEQREPO_DATA_PATH",
20-
"/usr/local/share/seqrepo/latest"))
19+
SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
2120
TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
2221
f"{APP_ROOT}/data/transcript_mapping.tsv"))
2322

cool_seq_tool/cool_seq_tool.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,51 +3,54 @@
33
from typing import Optional, Union, List, Tuple, Dict
44
from pathlib import Path
55

6+
from biocommons.seqrepo import SeqRepo
67
from gene.query import QueryHandler as GeneQueryHandler
78

8-
from cool_seq_tool import logger
9+
from cool_seq_tool import logger, SEQREPO_ROOT_DIR
910
from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper
1011
from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \
1112
ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse
1213
from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings,\
1314
SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer
14-
from cool_seq_tool import SEQREPO_DATA_PATH, \
15-
TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \
16-
UTA_DB_URL
15+
from cool_seq_tool import TRANSCRIPT_MAPPINGS_PATH, LRG_REFSEQGENE_PATH, \
16+
MANE_SUMMARY_PATH, UTA_DB_URL
1717
from cool_seq_tool.version import __version__
1818

1919

2020
class CoolSeqTool:
2121
"""Class to initialize data sources."""
2222

2323
def __init__(
24-
self, seqrepo_data_path: Path = SEQREPO_DATA_PATH,
24+
self,
2525
transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH,
2626
lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH,
2727
mane_data_path: Path = MANE_SUMMARY_PATH,
2828
db_url: str = UTA_DB_URL, db_pwd: str = "",
29-
gene_query_handler: GeneQueryHandler = None,
30-
gene_db_url: str = "", gene_db_region: str = "us-east-2"
29+
gene_query_handler: Optional[GeneQueryHandler] = None,
30+
gene_db_url: str = "", gene_db_region: str = "us-east-2",
31+
sr: Optional[SeqRepo] = None
3132
) -> None:
3233
"""Initialize CoolSeqTool class
3334
34-
:param Path seqrepo_data_path: The path to the seqrepo directory.
3535
:param Path transcript_file_path: The path to transcript_mappings.tsv
3636
:param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
3737
:param Path mane_data_path: Path to RefSeq MANE summary data
3838
:param str db_url: PostgreSQL connection URL
3939
Format: `driver://user:pass@host/database/schema`
4040
:param str db_pwd: User's password for uta database
41-
:param GeneQueryHandler gene_query_handler: Gene normalizer query handler
42-
instance. If this is provided, will use a current instance. If this is not
43-
provided, will create a new instance.
41+
:param Optional[GeneQueryHandler] gene_query_handler: Gene normalizer query
42+
handler instance. If this is provided, will use a current instance. If this
43+
is not provided, will create a new instance.
4444
:param str gene_db_url: URL to gene normalizer dynamodb. Only used when
4545
`gene_query_handler` is `None`.
4646
:param str gene_db_region: AWS region for gene normalizer db. Only used when
4747
`gene_query_handler` is `None`.
48+
:param Optional[SeqRepo] sr: SeqRepo instance. If this is not provided, will
49+
create a new instance.
4850
"""
49-
self.seqrepo_access = SeqRepoAccess(
50-
seqrepo_data_path=seqrepo_data_path)
51+
if not sr:
52+
sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR)
53+
self.seqrepo_access = SeqRepoAccess(sr)
5154
self.transcript_mappings = TranscriptMappings(
5255
transcript_file_path=transcript_file_path,
5356
lrg_refseqgene_path=lrg_refseqgene_path)

cool_seq_tool/data_sources/mane_transcript.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import math
1111
from typing import Optional, Set, Tuple, Dict, List, Union
1212

13-
import hgvs.parser
1413
import pandas as pd
1514

1615
from cool_seq_tool.schemas import AnnotationLayer, Assembly, MappedManeData, \
@@ -47,7 +46,6 @@ def __init__(self, seqrepo_access: SeqRepoAccess,
4746
:param GeneNormalizer gene_normalizer: Access to Gene Normalizer
4847
"""
4948
self.seqrepo_access = seqrepo_access
50-
self.hgvs_parser = hgvs.parser.Parser()
5149
self.transcript_mappings = transcript_mappings
5250
self.mane_transcript_mappings = mane_transcript_mappings
5351
self.uta_db = uta_db

cool_seq_tool/data_sources/seqrepo_access.py

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,22 @@
11
"""A module for accessing SeqRepo."""
22
from typing import Optional, List, Tuple, Union
33
from os import environ
4-
from pathlib import Path
54

6-
from biocommons.seqrepo import SeqRepo
5+
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
76

87
from cool_seq_tool.schemas import ResidueMode
9-
from cool_seq_tool import SEQREPO_DATA_PATH, logger
8+
from cool_seq_tool import logger
109
from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos
1110

1211

13-
class SeqRepoAccess:
12+
class SeqRepoAccess(SeqRepoDataProxy):
1413
"""The SeqRepoAccess class."""
1514

16-
def __init__(self, seqrepo_data_path: Path = SEQREPO_DATA_PATH) -> None:
17-
"""Initialize the SeqRepoAccess class.
18-
:param Path seqrepo_data_path: The path to the seqrepo directory.
19-
"""
20-
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
21-
self.seqrepo_client = SeqRepo(seqrepo_data_path)
15+
environ["SEQREPO_LRU_CACHE_MAXSIZE"] = "none"
2216

2317
def get_reference_sequence(
24-
self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
25-
residue_mode: str = ResidueMode.RESIDUE
18+
self, ac: str, start: Optional[int] = None, end: Optional[int] = None,
19+
residue_mode: str = ResidueMode.RESIDUE
2620
) -> Tuple[str, Optional[str]]:
2721
"""Get reference sequence for an accession given a start and end position.
2822
If `start` and `end` are not given, it will return the entire reference sequence
@@ -45,7 +39,7 @@ def get_reference_sequence(
4539
if start == end:
4640
end += 1
4741
try:
48-
sequence = self.seqrepo_client.fetch(ac, start=start, end=end)
42+
sequence = self.sr.fetch(ac, start=start, end=end)
4943
except KeyError:
5044
msg = f"Accession, {ac}, not found in SeqRepo"
5145
logger.warning(msg)
@@ -77,7 +71,7 @@ def get_reference_sequence(
7771
return sequence, None
7872

7973
def translate_identifier(
80-
self, ac: str, target_namespace: Optional[Union[str, List[str]]] = None
74+
self, ac: str, target_namespaces: Optional[Union[str, List[str]]] = None
8175
) -> Tuple[List[str], Optional[str]]:
8276
"""Return list of identifiers for accession.
8377
@@ -86,31 +80,32 @@ def translate_identifier(
8680
:return: List of identifiers, warning
8781
"""
8882
try:
89-
ga4gh_identifiers = self.seqrepo_client.translate_identifier(
90-
ac, target_namespaces=target_namespace)
83+
ga4gh_identifiers = self.sr.translate_identifier(
84+
ac, target_namespaces=target_namespaces)
9185
except KeyError:
9286
msg = f"SeqRepo unable to get translated identifiers for {ac}"
9387
logger.warning(msg)
9488
return [], msg
9589
else:
9690
return ga4gh_identifiers, None
9791

98-
def aliases(self,
99-
input_str: str) -> Tuple[List[Optional[str]], Optional[str]]:
92+
def translate_alias(
93+
self, input_str: str
94+
) -> Tuple[List[Optional[str]], Optional[str]]:
10095
"""Get aliases for a given input.
10196
10297
:param str input_str: Input to get aliases for
10398
:return: List of aliases, warning
10499
"""
105100
try:
106-
return self.seqrepo_client.translate_alias(input_str), None
101+
return self.sr.translate_alias(input_str), None
107102
except KeyError:
108103
msg = f"SeqRepo could not translate alias {input_str}"
109104
logger.warning(msg)
110105
return [], msg
111106

112107
def chromosome_to_acs(
113-
self, chromosome: str
108+
self, chromosome: str
114109
) -> Tuple[Optional[List[str]], Optional[str]]:
115110
"""Get accessions for a chromosome
116111
@@ -119,8 +114,8 @@ def chromosome_to_acs(
119114
"""
120115
acs = []
121116
for assembly in ["GRCh38", "GRCh37"]:
122-
tmp_acs = self.translate_identifier(f"{assembly}:chr{chromosome}",
123-
target_namespace="refseq")[0]
117+
tmp_acs, _ = self.translate_identifier(f"{assembly}:chr{chromosome}",
118+
target_namespaces="refseq")
124119
for ac in tmp_acs:
125120
acs.append(ac.split("refseq:")[-1])
126121
if acs:
@@ -134,7 +129,7 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
134129
:param str ac: Accession
135130
:return: Chromosome, warning
136131
"""
137-
aliases, warning = self.aliases(ac)
132+
aliases, _ = self.translate_alias(ac)
138133
aliases = ([a.split(":")[-1] for a in aliases
139134
if a.startswith("GRCh") and "." not in a and "chr" not in a] or [None])[0] # noqa: E501
140135
if aliases is None:

cool_seq_tool/data_sources/uta_database.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
# Environment variables for paths to chain files for pyliftover
2525
LIFTOVER_CHAIN_37_TO_38 = environ.get("LIFTOVER_CHAIN_37_TO_38")
26-
LIFTOVER_CHAIN_38_TO_37 = environ.get("LIFTOVER_CHAIN_38_TO_37")
2726

2827

2928
class UTADatabase:
@@ -33,8 +32,7 @@ def __init__(
3332
self,
3433
db_url: str = UTA_DB_URL,
3534
db_pwd: str = "",
36-
chain_file_37_to_38: Optional[str] = None,
37-
chain_file_38_to_37: Optional[str] = None
35+
chain_file_37_to_38: Optional[str] = None
3836
) -> None:
3937
"""Initialize DB class. Downstream libraries should use the create()
4038
method to construct a new instance: await UTADatabase.create()
@@ -46,10 +44,6 @@ def __init__(
4644
This is used for pyliftover. If this is not provided, will check to see if
4745
LIFTOVER_CHAIN_37_TO_38 env var is set. If neither is provided, will allow
4846
pyliftover to download a chain file from UCSC
49-
:param chain_file_38_to_37: Optional path to chain file for 38 to 37 assembly.
50-
This is used for pyliftover. If this is not provided, will check to see if
51-
LIFTOVER_CHAIN_38_TO_37 env var is set. If neither is provided, will allow
52-
pyliftover to download a chain file from UCSC
5347
"""
5448
self.schema = None
5549
self.db_url = db_url
@@ -63,12 +57,6 @@ def __init__(
6357
else:
6458
self.liftover_37_to_38 = LiftOver("hg19", "hg38")
6559

66-
chain_file_38_to_37 = chain_file_38_to_37 or LIFTOVER_CHAIN_38_TO_37
67-
if chain_file_38_to_37:
68-
self.liftover_38_to_37 = LiftOver(chain_file_38_to_37)
69-
else:
70-
self.liftover_38_to_37 = LiftOver("hg38", "hg19")
71-
7260
@staticmethod
7361
def _update_db_url(db_pwd: str, db_url: str) -> str:
7462
"""Return new db_url containing password.
@@ -1022,8 +1010,6 @@ def get_liftover(self, chromosome: str, pos: int,
10221010

10231011
if liftover_to_assembly == Assembly.GRCH38:
10241012
liftover = self.liftover_37_to_38.convert_coordinate(chromosome, pos)
1025-
elif liftover_to_assembly == Assembly.GRCH37:
1026-
liftover = self.liftover_38_to_37.convert_coordinate(chromosome, pos)
10271013
else:
10281014
logger.warning(f"{liftover_to_assembly} assembly not supported")
10291015
liftover = None

cool_seq_tool/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.1.9"
1+
__version__ = "0.1.10"

0 commit comments

Comments
 (0)