|
1 | | -"""Module for downloading data files.""" |
| 1 | +"""Module for handling downloadable data files.""" |
2 | 2 | from ftplib import FTP |
| 3 | +import logging |
3 | 4 | from os import remove |
4 | 5 | import gzip |
| 6 | +from pathlib import Path |
5 | 7 | import shutil |
6 | 8 | import datetime |
7 | 9 |
|
8 | 10 | from dateutil import parser |
9 | 11 |
|
10 | 12 | from cool_seq_tool import APP_ROOT |
11 | 13 |
|
| 14 | +logger = logging.getLogger("cool_seq_tool") |
| 15 | + |
12 | 16 |
|
class DataDownload:
    """Locate the data files cool_seq_tool depends on, fetching them on demand.

    Each ``get_*`` method connects to the NCBI FTP server to find out which
    file is current, then downloads it into the data directory only if that
    file is not already present there.
    """

    # Host serving both the MANE summary and the LRG RefSeqGene file.
    _FTP_HOST = "ftp.ncbi.nlm.nih.gov"

    def __init__(self) -> None:
        """Initialize downloadable data locations."""
        self._data_dir = APP_ROOT / "data"

    def get_mane_summary(self) -> Path:
        """Identify latest MANE summary data. If unavailable locally, download from
        source.

        The server directory is always listed, even when the file is already
        cached, because the listing is what determines the expected file name.

        :return: path to MANE summary file
        :raise FileNotFoundError: if the server directory lists no file ending in
            ``.summary.txt.gz``
        """
        with FTP(self._FTP_HOST) as ftp:
            ftp.login()
            ftp.cwd("/refseq/MANE/MANE_human/current")
            archives = [f for f in ftp.nlst() if f.endswith(".summary.txt.gz")]
            if not archives:
                # Still an ``Exception`` subclass, so broad handlers keep working.
                raise FileNotFoundError("Unable to download MANE summary data")
            archive_name = archives[0]
            # The local copy is stored decompressed: strip the trailing ".gz".
            self._mane_summary_path = self._data_dir / archive_name[:-3]
            if not self._mane_summary_path.exists():
                logger.info("Downloading MANE summary file from NCBI.")
                # Nothing else creates the data directory before writing to it.
                self._data_dir.mkdir(exist_ok=True, parents=True)
                archive_path = self._data_dir / archive_name
                try:
                    with open(archive_path, "wb") as fp:
                        ftp.retrbinary(f"RETR {archive_name}", fp.write)
                    self._decompress_gzip(archive_path, self._mane_summary_path)
                finally:
                    # Drop the compressed download whether or not it was usable.
                    if archive_path.exists():
                        remove(archive_path)
                logger.info("MANE summary file download complete.")
        return self._mane_summary_path

    def get_lrg_refseq_gene_data(self) -> Path:
        """Identify latest LRG RefSeq Gene file. If unavailable locally, download from
        source.

        The local file name carries the server-side modification date
        (``LRG_RefSeqGene_YYYYMMDD``), so a newer upstream file triggers a new
        download on the next call.

        :return: path to acquired LRG RefSeq Gene data file
        """
        with FTP(self._FTP_HOST) as ftp:
            ftp.login()
            lrg_refseqgene_file = "LRG_RefSeqGene"
            ftp_dir_path = "/refseq/H_sapiens/RefSeqGene/"
            ftp_file_path = f"{ftp_dir_path}{lrg_refseqgene_file}"
            # Skip the first four characters of the MDTM reply (presumably the
            # "213 " status prefix -- confirm against RFC 3659) to get the time.
            timestamp = ftp.voidcmd(f"MDTM {ftp_file_path}")[4:].strip()
            version = parser.parse(timestamp).strftime("%Y%m%d")
            self._lrg_refseqgene_path = (
                self._data_dir / f"{lrg_refseqgene_file}_{version}"
            )
            if not self._lrg_refseqgene_path.exists():
                logger.info("Downloading LRG RefSeq data from NCBI.")
                # Nothing else creates the data directory before writing to it.
                self._data_dir.mkdir(exist_ok=True, parents=True)
                download_path = self._data_dir / lrg_refseqgene_file
                ftp.cwd(ftp_dir_path)
                try:
                    with open(download_path, "wb") as fp:
                        ftp.retrbinary(f"RETR {lrg_refseqgene_file}", fp.write)
                    # Rename rather than copy: the versioned name only ever
                    # refers to a fully downloaded file.
                    download_path.replace(self._lrg_refseqgene_path)
                finally:
                    # Only present here if the download or rename failed.
                    if download_path.exists():
                        remove(download_path)
                logger.info("LRG RefSeq data download complete.")
        return self._lrg_refseqgene_path

    @staticmethod
    def _decompress_gzip(src: Path, dest: Path) -> None:
        """Gunzip ``src`` into ``dest`` without ever exposing a partial ``dest``.

        Data is written to a sibling ``<dest name>.part`` file that is renamed
        onto ``dest`` only after decompression succeeds. This matters because
        callers treat the existence of ``dest`` as proof of a complete file.

        :param src: path to the gzip-compressed input file
        :param dest: path the decompressed content should end up at
        """
        partial = dest.with_name(f"{dest.name}.part")
        try:
            with gzip.open(src, "rb") as f_in, open(partial, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
            partial.replace(dest)
        finally:
            # Only still present if decompression or the rename failed.
            if partial.exists():
                remove(partial)
0 commit comments