Skip to content

Commit 6a55280

Browse files
authored
feat: clean up file download behavior and descriptions (#163)
1 parent 699fa09 commit 6a55280

File tree

5 files changed

+94
-77
lines changed

5 files changed

+94
-77
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,5 @@ pyproject.toml
135135
cool_seq_tool/data/seqrepo/
136136
cool_seq_tool/data/*.txt
137137
cool_seq_tool/data/LRG_RefSeqGene*
138+
cool_seq_tool/data/MANE*
138139
cool_seq_tool/data/notebooks/

README.md

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,6 @@ sudo mv /usr/local/share/seqrepo/2021-01-29._fkuefgd /usr/local/share/seqrepo/20
9393
exit
9494
```
9595

96-
#### transcript_mappings.tsv
97-
`cool-seq-tool` uses [Ensembl BioMart](http://www.ensembl.org/biomart/martview) to retrieve `cool_seq_tool/data/transcript_mappings.tsv`. We currently use `Human Genes (GRCh38.p13)` for the dataset and the following attributes we use are: Gene stable ID, Gene stable ID version, Transcript stable ID, Transcript stable ID version, Protein stable ID, Protein stable ID version, RefSeq match transcript (MANE Select), Gene name.
98-
99-
![image](biomart.png)
100-
101-
Use the `TRANSCRIPT_MAPPINGS_PATH` environment variable to set the path of an already existing `transcript_mappings.tsv`. The default is `cool_seq_tool/data/transcript_mapping.tsv`.
102-
10396
#### LRG_RefSeqGene
10497

10598
`cool-seq-tool` fetches the latest version of `LRG_RefSeqGene` if the environment variable `LRG_REFSEQGENE_PATH` is not set. When `LRG_REFSEQGENE_PATH` is set, `cool-seq-tool` will look at this path and expect the LRG_RefSeqGene file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene).
@@ -108,6 +101,22 @@ Use the `TRANSCRIPT_MAPPINGS_PATH` environment variable to set the path of an al
108101

109102
`cool-seq-tool` fetches the latest version of `MANE.GRCh38.*.summary.txt.gz` if the environment variable `MANE_SUMMARY_PATH` is not set. When `MANE_SUMMARY_PATH` is set, `cool-seq-tool` will look at this path and expect the MANE Summary Data file. This file can be found [here](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/).
110103

104+
#### transcript_mapping.tsv
105+
`cool-seq-tool` is packaged with transcript mapping data acquired from [Ensembl BioMart](http://www.ensembl.org/biomart/martview). If the environment variable `TRANSCRIPT_MAPPINGS_PATH` is not set, `cool-seq-tool` will use the built-in file. When `TRANSCRIPT_MAPPINGS_PATH` is set, `cool-seq-tool` will look at this path and expect to find the transcript mapping TSV file.
106+
107+
To acquire this data manually from the [BioMart](https://www.ensembl.org/biomart/martview), select the `Human Genes (GRCh38.p13)` dataset and choose the following attributes:
108+
109+
* Gene stable ID
110+
* Gene stable ID version
111+
* Transcript stable ID
112+
* Transcript stable ID version
113+
* Protein stable ID
114+
* Protein stable ID version
115+
* RefSeq match transcript (MANE Select)
116+
* Gene name
117+
118+
![image](biomart.png)
119+
111120
## Starting the UTA Tools Service Locally
112121

113122
To start the service, run the following:

cool_seq_tool/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(
3636
) -> None:
3737
"""Initialize CoolSeqTool class
3838
39-
:param Path transcript_file_path: The path to transcript_mappings.tsv
39+
:param Path transcript_file_path: The path to transcript_mapping.tsv
4040
:param Path lrg_refseqgene_path: The path to LRG_RefSeqGene
4141
:param Path mane_data_path: Path to RefSeq MANE summary data
4242
:param str db_url: PostgreSQL connection URL
Lines changed: 62 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,83 @@
1-
"""Module for downloading data files."""
1+
"""Module for handling downloadable data files."""
22
from ftplib import FTP
3+
import logging
34
from os import remove
45
import gzip
6+
from pathlib import Path
57
import shutil
68
import datetime
79

810
from dateutil import parser
911

1012
from cool_seq_tool import APP_ROOT
1113

14+
logger = logging.getLogger("cool_seq_tool")
15+
1216

1317
class DataDownload:
14-
"""Class for downloading data files."""
18+
"""Class for managing downloadable data files. Responsible for checking if files
19+
are available under default locations, and fetching them if not.
20+
"""
1521

1622
def __init__(self) -> None:
17-
"""Initialize DataDownload."""
18-
self._make_data_dir()
19-
self._download_data()
20-
21-
def _make_data_dir(self) -> None:
22-
"""Make data directory"""
23+
"""Initialize downloadable data locations."""
2324
self._data_dir = APP_ROOT / "data"
24-
self._data_dir.mkdir(exist_ok=True, parents=True)
25-
26-
def _download_data(self) -> None:
27-
"""Download data files needed for cool_seq_tool."""
28-
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
29-
ftp.login()
30-
self._download_mane_summary(ftp)
31-
self._download_lrg_refseq_gene_data(ftp)
3225

33-
def _download_mane_summary(self, ftp: FTP) -> None:
34-
"""Download latest MANE summary data and set path
26+
def get_mane_summary(self) -> Path:
27+
"""Identify latest MANE summary data. If unavailable locally, download from
28+
source.
3529
36-
:param FTP ftp: FTP connection
30+
:return: path to MANE summary file
3731
"""
38-
ftp.cwd("/refseq/MANE/MANE_human/current")
39-
files = ftp.nlst()
40-
mane_summary_file = \
41-
[f for f in files if f.endswith(".summary.txt.gz")]
42-
if not mane_summary_file:
43-
raise Exception("Unable to download MANE summary data")
44-
mane_summary_file = mane_summary_file[0]
45-
self._mane_summary_path = \
46-
self._data_dir / mane_summary_file[:-3]
47-
mane_data_path = self._data_dir / mane_summary_file
48-
if not self._mane_summary_path.exists():
49-
with open(mane_data_path, "wb") as fp:
50-
ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
51-
with gzip.open(mane_data_path, "rb") as f_in:
52-
with open(self._mane_summary_path, "wb") as f_out:
53-
shutil.copyfileobj(f_in, f_out)
54-
remove(mane_data_path)
32+
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
33+
ftp.login()
34+
ftp.cwd("/refseq/MANE/MANE_human/current")
35+
files = ftp.nlst()
36+
mane_summary_file = \
37+
[f for f in files if f.endswith(".summary.txt.gz")]
38+
if not mane_summary_file:
39+
raise Exception("Unable to download MANE summary data")
40+
mane_summary_file = mane_summary_file[0]
41+
self._mane_summary_path = \
42+
self._data_dir / mane_summary_file[:-3]
43+
mane_data_path = self._data_dir / mane_summary_file
44+
if not self._mane_summary_path.exists():
45+
logger.info("Downloading MANE summary file from NCBI.")
46+
with open(mane_data_path, "wb") as fp:
47+
ftp.retrbinary(f"RETR {mane_summary_file}", fp.write)
48+
with gzip.open(mane_data_path, "rb") as f_in:
49+
with open(self._mane_summary_path, "wb") as f_out:
50+
shutil.copyfileobj(f_in, f_out)
51+
remove(mane_data_path)
52+
logger.info("MANE summary file download complete.")
53+
return self._mane_summary_path
5554

56-
def _download_lrg_refseq_gene_data(self, ftp: FTP) -> None:
57-
"""Download latest LRG_RefSeqGene and set path
55+
def get_lrg_refseq_gene_data(self) -> Path:
56+
"""Identify latest LRG RefSeq Gene file. If unavailable locally, download from
57+
source.
5858
59-
:param FTP ftp: FTP connection
59+
:return: path to acquired LRG RefSeq Gene data file
6060
"""
61-
lrg_refseqgene_file = "LRG_RefSeqGene"
62-
ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
63-
ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
64-
timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
65-
date = str(parser.parse(timestamp)).split()[0]
66-
version = datetime.datetime.strptime(date,
67-
"%Y-%m-%d").strftime("%Y%m%d")
68-
69-
fn_versioned = f"{lrg_refseqgene_file}_{version}"
70-
lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
71-
self._lrg_refseqgene_path = self._data_dir / fn_versioned
72-
if not self._lrg_refseqgene_path.exists():
73-
ftp.cwd(ftp_dir_path)
74-
with open(lrg_refseqgene_path, "wb") as fp:
75-
ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
76-
with open(lrg_refseqgene_path, "rb") as f_in:
77-
with open(self._lrg_refseqgene_path, "wb") as f_out:
78-
shutil.copyfileobj(f_in, f_out)
79-
remove(lrg_refseqgene_path)
61+
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
62+
ftp.login()
63+
lrg_refseqgene_file = "LRG_RefSeqGene"
64+
ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
65+
ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
66+
timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
67+
date = str(parser.parse(timestamp)).split()[0]
68+
version = datetime.datetime.strptime(date,
69+
"%Y-%m-%d").strftime("%Y%m%d")
70+
fn_versioned = f"{lrg_refseqgene_file}_{version}"
71+
lrg_refseqgene_path = self._data_dir / lrg_refseqgene_file
72+
self._lrg_refseqgene_path = self._data_dir / fn_versioned
73+
if not self._lrg_refseqgene_path.exists():
74+
logger.info("Downloading LRG RefSeq data from NCBI.")
75+
ftp.cwd(ftp_dir_path)
76+
with open(lrg_refseqgene_path, "wb") as fp:
77+
ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
78+
with open(lrg_refseqgene_path, "rb") as f_in:
79+
with open(self._lrg_refseqgene_path, "wb") as f_out:
80+
shutil.copyfileobj(f_in, f_out)
81+
remove(lrg_refseqgene_path)
82+
logger.info("LRG RefSeq data download complete.")
83+
return self._lrg_refseqgene_path

cool_seq_tool/paths.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,27 @@
22
from os import environ
33
from pathlib import Path
44

5+
from cool_seq_tool.data.data_downloads import DataDownload
6+
57

68
APP_ROOT = Path(__file__).resolve().parents[0]
79

810
TRANSCRIPT_MAPPINGS_PATH = Path(environ.get("TRANSCRIPT_MAPPINGS_PATH",
911
f"{APP_ROOT}/data/transcript_mapping.tsv"))
1012

11-
MANE_SUMMARY_PATH = environ.get("MANE_SUMMARY_PATH", "")
12-
LRG_REFSEQGENE_PATH = environ.get("LRG_REFSEQGENE_PATH", "")
13-
if not all((MANE_SUMMARY_PATH, LRG_REFSEQGENE_PATH)):
14-
from cool_seq_tool.data import DataDownload # noqa: E402, I202
15-
d = DataDownload()
13+
d = DataDownload()
14+
15+
provided_mane_summary_path = environ.get("MANE_SUMMARY_PATH", "")
16+
if provided_mane_summary_path:
17+
MANE_SUMMARY_PATH = Path(provided_mane_summary_path)
18+
else:
19+
MANE_SUMMARY_PATH = d.get_mane_summary()
1620

17-
if not MANE_SUMMARY_PATH:
18-
MANE_SUMMARY_PATH = d._mane_summary_path
21+
provided_lrg_refseq_path = environ.get("LRG_REFSEQGENE_PATH", "")
22+
if provided_lrg_refseq_path:
23+
LRG_REFSEQGENE_PATH = Path(provided_lrg_refseq_path)
24+
else:
25+
LRG_REFSEQGENE_PATH = d.get_lrg_refseq_gene_data()
1926

20-
if not LRG_REFSEQGENE_PATH:
21-
LRG_REFSEQGENE_PATH = d._lrg_refseqgene_path
22-
MANE_SUMMARY_PATH = Path(MANE_SUMMARY_PATH)
23-
LRG_REFSEQGENE_PATH = Path(LRG_REFSEQGENE_PATH)
2427

2528
SEQREPO_ROOT_DIR = environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")

0 commit comments

Comments
 (0)