diff --git a/CHANGELOG.md b/CHANGELOG.md index be2dd35..5e32837 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog -## Version 0.0.1 - 0.0.3 +## Version 0.0.1 - 0.0.4 - Initial release of the package with class structure and basic functionality. - Fixed minor bugs handling existing resources with cache. +- Download and cache Annotation Hub's sqlite metadata file for querying available TxDB objects. diff --git a/src/txdb/_ahub.py b/src/txdb/_ahub.py index 54b1256..73ecaab 100644 --- a/src/txdb/_ahub.py +++ b/src/txdb/_ahub.py @@ -28,185 +28,5 @@ __copyright__ = "Jayaram Kancherla" __license__ = "MIT" -TXDB_CONFIG = { - "TxDb.Athaliana.BioMart.plantsmart22": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Athaliana.BioMart.plantsmart22.sqlite", - }, - "TxDb.Athaliana.BioMart.plantsmart25": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Athaliana.BioMart.plantsmart25.sqlite", - }, - "TxDb.Athaliana.BioMart.plantsmart28": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Athaliana.BioMart.plantsmart28.sqlite", - }, - "TxDb.Btaurus.UCSC.bosTau8.refGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Btaurus.UCSC.bosTau8.refGene.sqlite", - }, - "TxDb.Celegans.UCSC.ce11.refGene": { - "release_date": "2019-05-01", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.9/TxDb.Celegans.UCSC.ce11.refGene.sqlite", - }, - "TxDb.Celegans.UCSC.ce6.ensGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Celegans.UCSC.ce6.ensGene.sqlite", - }, - "TxDb.Cfamiliaris.UCSC.canFam3.refGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Cfamiliaris.UCSC.canFam3.refGene.sqlite", - }, - "TxDb.Dmelanogaster.UCSC.dm3.ensGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Dmelanogaster.UCSC.dm3.ensGene.sqlite", - }, - "TxDb.Dmelanogaster.UCSC.dm6.ensGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Dmelanogaster.UCSC.dm6.ensGene.sqlite", - }, - "TxDb.Drerio.UCSC.danRer10.refGene": { - "release_date": "2019-05-01", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.9/TxDb.Drerio.UCSC.danRer10.refGene.sqlite", - }, - "TxDb.Ggallus.UCSC.galGal4.refGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Ggallus.UCSC.galGal4.refGene.sqlite", - }, - "TxDb.Hsapiens.BioMart.igis": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Hsapiens.BioMart.igis.sqlite", - }, - "TxDb.Hsapiens.UCSC.hg18.knownGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Hsapiens.UCSC.hg18.knownGene.sqlite", - }, - "TxDb.Hsapiens.UCSC.hg19.knownGene": { - "release_date": "2025-10-29", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.22/TxDb.Hsapiens.UCSC.hg19.knownGene.sqlite", - }, - "TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Hsapiens.UCSC.hg19.lincRNAsTranscripts.sqlite", - }, - "TxDb.Hsapiens.UCSC.hg38.knownGene": { - "release_date": "2025-10-29", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.22/TxDb.Hsapiens.UCSC.hg38.knownGene.sqlite", - }, - "TxDb.Hsapiens.UCSC.hg38.refGene": { - "release_date": "2024-04-02", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.19/TxDb.Hsapiens.UCSC.hg38.refGene.sqlite", - }, - "TxDb.Mmulatta.UCSC.rheMac3.refGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Mmulatta.UCSC.rheMac3.refGene.sqlite", - }, - "TxDb.Mmulatta.UCSC.rheMac8.refGene": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Mmulatta.UCSC.rheMac8.refGene.sqlite", - }, - "TxDb.Mmulatta.UCSC.rheMac10.refGene": { - "release_date": "2021-10-08", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.14/TxDb.Mmulatta.UCSC.rheMac10.refGene.sqlite", - }, - "TxDb.Mmusculus.UCSC.mm10.ensGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Mmusculus.UCSC.mm10.ensGene.sqlite", - }, - "TxDb.Mmusculus.UCSC.mm10.knownGene": { - "release_date": "2019-05-01", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.9/TxDb.Mmusculus.UCSC.mm10.knownGene.sqlite", - }, - "TxDb.Mmusculus.UCSC.mm39.refGene": { - "release_date": "2024-04-02", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.19/TxDb.Mmusculus.UCSC.mm39.refGene.sqlite", - }, - "TxDb.Mmusculus.UCSC.mm39.knownGene": { - "release_date": "2025-03-11", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.21/TxDb.Mmusculus.UCSC.mm39.knownGene.sqlite", - }, - "TxDb.Mmusculus.UCSC.mm9.knownGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Mmusculus.UCSC.mm9.knownGene.sqlite", - }, - "TxDb.Ptroglodytes.UCSC.panTro4.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Ptroglodytes.UCSC.panTro4.refGene.sqlite", - }, - "TxDb.Ptroglodytes.UCSC.panTro5.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Ptroglodytes.UCSC.panTro5.refGene.sqlite", - }, - "TxDb.Ptroglodytes.UCSC.panTro6.refGene": { - "release_date": "2019-10-29", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.10/TxDb.Ptroglodytes.UCSC.panTro6.refGene.sqlite", - }, - "TxDb.Rnorvegicus.BioMart.igis": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Rnorvegicus.BioMart.igis.sqlite", - }, - "TxDb.Rnorvegicus.UCSC.rn4.ensGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Rnorvegicus.UCSC.rn4.ensGene.sqlite", - }, - "TxDb.Rnorvegicus.UCSC.rn5.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Rnorvegicus.UCSC.rn5.refGene.sqlite", - }, - "TxDb.Rnorvegicus.UCSC.rn6.refGene": { - "release_date": "2019-05-01", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.9/TxDb.Rnorvegicus.UCSC.rn6.refGene.sqlite", - }, - "TxDb.Rnorvegicus.UCSC.rn6.ncbiRefSeq": { - "release_date": "2020-10-20", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.12/TxDb.Rnorvegicus.UCSC.rn6.ncbiRefSeq.sqlite", - }, - "TxDb.Rnorvegicus.UCSC.rn7.refGene": { - "release_date": "2022-04-18", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.15/TxDb.Rnorvegicus.UCSC.rn7.refGene.sqlite", - }, - "TxDb.Scerevisiae.UCSC.sacCer2.sgdGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Scerevisiae.UCSC.sacCer2.sgdGene.sqlite", - }, - "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene": { - "release_date": "2016-12-22", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.4/TxDb.Scerevisiae.UCSC.sacCer3.sgdGene.sqlite", - }, - "TxDb.Sscrofa.UCSC.susScr3.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Sscrofa.UCSC.susScr3.refGene.sqlite", - }, - "TxDb.Sscrofa.UCSC.susScr11.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Sscrofa.UCSC.susScr11.refGene.sqlite", - }, - "TxDb.Ggallus.UCSC.galGal5.refGene": { - "release_date": "2020-04-27", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.11/TxDb.Ggallus.UCSC.galGal5.refGene.sqlite", - }, - "TxDb.Ggallus.UCSC.galGal6.refGene": { - "release_date": "2019-10-29", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.10/TxDb.Ggallus.UCSC.galGal6.refGene.sqlite", - }, - "TxDb.Cfamiliaris.UCSC.canFam4.refGene": { - "release_date": "2021-10-08", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.14/TxDb.Cfamiliaris.UCSC.canFam4.refGene.sqlite", - }, - "TxDb.Cfamiliaris.UCSC.canFam5.refGene": { - "release_date": "2021-10-08", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.14/TxDb.Cfamiliaris.UCSC.canFam5.refGene.sqlite", - }, - "TxDb.Cfamiliaris.UCSC.canFam6.refGene": { - "release_date": "2023-04-06", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.17/TxDb.Cfamiliaris.UCSC.canFam6.refGene.sqlite", - }, - "TxDb.Celegans.UCSC.ce11.ensGene": { - "release_date": "2022-04-18", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.15/TxDb.Celegans.UCSC.ce11.ensGene.sqlite", - }, - "TxDb.Drerio.UCSC.danRer11.refGene": { - "release_date": "2019-05-01", - "url": "https://mghp.osn.xsede.org/bir190004-bucket01/AnnotationHub/ucsc/standard/3.9/TxDb.Drerio.UCSC.danRer11.refGene.sqlite", - }, -} + +AHUB_METADATA_URL = "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3" diff --git a/src/txdb/txdbregistry.py b/src/txdb/txdbregistry.py index c0a3fe3..7cbe3d1 100644 --- a/src/txdb/txdbregistry.py +++ b/src/txdb/txdbregistry.py @@ -1,10 +1,11 @@ import os +import sqlite3 from pathlib import Path from typing import Any, Dict, Optional, Union from pybiocfilecache import BiocFileCache -from ._ahub import TXDB_CONFIG +from ._ahub import AHUB_METADATA_URL from .record import TxDbRecord from .txdb import TxDb @@ -14,32 +15,98 @@ class TxDbRegistry: - """Registry for TxDb resources backed by TXDB_CONFIG and a BiocFileCache.""" + """Registry for TxDb resources, populated from AnnotationHub.""" def __init__( self, - config: Dict[str, Dict[str, Any]] = TXDB_CONFIG, cache_dir: Optional[Union[str, Path]] = None, + force: bool = False, ) -> None: """Initialize the TxDB registry. Args: - config: - TXDB_CONFIG-style mapping: - txdb_id -> {"release_date": "YYYY-MM-DD", "url": "..."} - cache_dir: Directory for the BiocFileCache database and cached files. If None, defaults to "~/.cache/txdb_bfc". + + force: + If True, force re-download of the AnnotationHub metadata database. """ if cache_dir is None: cache_dir = Path.home() / ".cache" / "txdb_bfc" - cache_dir = Path(cache_dir) - cache_dir.mkdir(parents=True, exist_ok=True) + self._cache_dir = Path(cache_dir) + self._cache_dir.mkdir(parents=True, exist_ok=True) + self._bfc = BiocFileCache(self._cache_dir) + + self._registry_map: Dict[str, TxDbRecord] = {} + + self._initialize_registry(force=force) - self._bfc = BiocFileCache(cache_dir) - self._config = config + def _initialize_registry(self, force: bool = False): + """Fetch the AnnotationHub metadata and populate the registry.""" + rname = "annotationhub_metadata" + + existing = None + try: + existing = self._bfc.get(rname) + except Exception: + pass + + if force and existing: + try: + self._bfc.remove(rname) + except Exception: + pass + existing = None + + if existing: + md_resource = existing + else: + md_resource = self._bfc.add(rname, AHUB_METADATA_URL, rtype="web") + + md_path = self._get_filepath(md_resource) + + if not md_path or not os.path.exists(md_path): + if existing and not force: + return self._initialize_registry(force=True) + + raise RuntimeError("Failed to retrieve AnnotationHub metadata database.") + + conn = sqlite3.connect(md_path) + try: + query = """ + SELECT + r.title, + r.rdatadateadded, + lp.location_prefix || rp.rdatapath AS full_rdatapath + FROM resources r + LEFT JOIN location_prefixes lp + ON r.location_prefix_id = lp.id + LEFT JOIN rdatapaths rp + ON rp.resource_id = r.id + WHERE r.title LIKE 'TxDb%.sqlite' + ORDER BY r.rdatadateadded DESC; + """ + cursor = conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + finally: + conn.close() + + for title, date_added, url in rows: + if title.endswith(".sqlite"): + txdb_id = title[:-7] + else: + txdb_id = title + + if txdb_id in self._registry_map: + continue + + entry = {"url": url, "release_date": str(date_added).split(" ")[0] if date_added else None} + + record = TxDbRecord.from_config_entry(txdb_id, entry) + self._registry_map[txdb_id] = record def list_txdb(self) -> list[str]: """List all available TxDb IDs. @@ -47,7 +114,7 @@ def list_txdb(self) -> list[str]: Returns: A list of valid TxDb ID strings. """ - return list(self._config.keys()) + return sorted(list(self._registry_map.keys())) def get_record(self, txdb_id: str) -> TxDbRecord: """Get the metadata record for a given TxDb ID. @@ -62,14 +129,10 @@ def get_record(self, txdb_id: str) -> TxDbRecord: Raises: KeyError: If the ID is not found in the configuration. """ - if txdb_id not in self._config: + if txdb_id not in self._registry_map: raise KeyError(f"TxDb ID '{txdb_id}' not found in registry.") - entry = self._config[txdb_id] - return TxDbRecord.from_config_entry(txdb_id, entry) - - def _get_absolute_path(self, x: str): - return f"{self._bfc.config.cache_dir}/{x}" + return self._registry_map[txdb_id] def download(self, txdb_id: str, force: bool = False) -> str: """Download and cache the TxDb file. @@ -102,24 +165,16 @@ def download(self, txdb_id: str, force: bool = False) -> str: download=True, ) - path = self._resource_path(resource) - if path is None: - raise RuntimeError(f"Could not resolve local path for resource {key!r}") + path = self._get_filepath(resource) - abs_path = self._get_absolute_path(path) - - # Check if file is empty - if not os.path.exists(abs_path) or os.path.getsize(abs_path) == 0: + if not path or not os.path.exists(path) or os.path.getsize(path) == 0: try: self._bfc.remove(key) except Exception: pass - raise RuntimeError( - f"Download failed for {txdb_id}: File at {abs_path} is empty or missing. " - "Please check your internet connection or the resource URL." - ) + raise RuntimeError(f"Download failed for {txdb_id}. File is empty or missing.") - return str(abs_path) + return path def load_db(self, txdb_id: str, force: bool = False) -> TxDb: """Load a TxDb object for the given ID. @@ -137,42 +192,16 @@ def load_db(self, txdb_id: str, force: bool = False) -> TxDb: Returns: An initialized TxDb object connected to the cached database. """ - if not force and self.exists_locally(txdb_id): - path = self.local_path(txdb_id) - if path: - return TxDb(path) - path = self.download(txdb_id, force=force) return TxDb(path) - def exists_locally(self, txdb_id: str) -> bool: - """Check if the file for a given TxDb ID is already present in the cache.""" - try: - resource = self._bfc.get(txdb_id) - except Exception: - return False - - path = self._resource_path(resource) - abs_path = self._get_absolute_path(path) - return bool(abs_path and os.path.exists(abs_path) and os.path.getsize(abs_path) > 0) - - def local_path(self, txdb_id: str) -> Optional[str]: - """Return local path if cached, else None.""" - try: - resource = self._bfc.get(txdb_id) - except Exception: - return None - - path = self._resource_path(resource) - abs_path = self._get_absolute_path(path) - if not abs_path or not os.path.exists(abs_path) or os.path.getsize(abs_path) == 0: - return None - - return str(abs_path) - - def _resource_path(self, resource: Any) -> Optional[str]: - """Helper to extract path from a BiocFileCache resource object.""" + def _get_filepath(self, resource: Any) -> Optional[str]: + """Helper to extract absolute path from a BiocFileCache resource.""" if hasattr(resource, "rpath"): - return str(resource.rpath) + rel_path = str(resource.rpath) + elif hasattr(resource, "get"): + rel_path = str(resource.get("rpath")) + else: + return None - return str(resource.get("rpath")) if hasattr(resource, "get") else None + return str(self._cache_dir / rel_path) diff --git a/tests/test_registry.py b/tests/test_registry.py index cad84d4..26a3f2c 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -1,5 +1,3 @@ -import sqlite3 - import pytest from txdb import TxDbRegistry @@ -9,25 +7,6 @@ __license__ = "MIT" -@pytest.fixture -def mock_db_file(tmp_path): - """Create a temporary SQLite file with minimal schema.""" - db_path = tmp_path / "mock.sqlite" - conn = sqlite3.connect(db_path) - - conn.execute("CREATE TABLE chrominfo (chrom TEXT, length INTEGER, is_circular INTEGER)") - conn.execute("INSERT INTO chrominfo VALUES ('chr1', 1000, 0)") - - conn.execute( - "CREATE TABLE transcript (tx_id INTEGER, tx_name TEXT, tx_chrom TEXT, tx_strand TEXT, tx_start INTEGER, tx_end INTEGER, _tx_id INTEGER)" - ) - conn.execute("INSERT INTO transcript VALUES (1, 't1', 'chr1', '+', 100, 200, 1)") - - conn.commit() - conn.close() - return str(db_path) - - @pytest.fixture def registry(tmp_path): """Initialize registry with a temp cache dir.""" @@ -37,30 +16,3 @@ def registry(tmp_path): def test_registry_init(registry): assert isinstance(registry, TxDbRegistry) assert "TxDb.Mmusculus.UCSC.mm10.knownGene" in registry.list_txdb() - - -# @patch("txdb.txdbregistry.BiocFileCache") -# def test_load_db(mock_bfc_cls, registry, mock_db_file): -# # Setup Mock BiocFileCache instance -# mock_bfc = MagicMock() -# # When .add() is called (simulating download), return a resource with the mock path -# mock_resource = MagicMock() -# mock_resource.rpath = mock_db_file -# mock_resource.get.return_value = mock_db_file -# mock_bfc.add.return_value = mock_resource - -# # Inject mock into registry -# registry._bfc = mock_bfc - -# # Test load_db -# txdb = registry.load_db("TxDb.Mmusculus.UCSC.mm10.knownGene") - -# assert isinstance(txdb, TxDb) -# assert txdb.dbpath == mock_db_file -# print(txdb.seqinfo) -# assert ( -# txdb.seqinfo.__repr__() -# == SeqInfo(seqnames=["chr1"], seqlengths=[1000], is_circular=[False], genome=[None]).__repr__() -# ) - -# txdb.close()