2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -2,7 +2,7 @@

## Helping Out

There are `TODOs` that better enhance the reproducability of datasets or analysis of algorithm outputs, as well as
There are `TODOs` that improve the reproducibility and accuracy of datasets or the analysis of algorithm outputs, as well as
[open resolvable issues](https://github.com/Reed-CompBio/spras-benchmarking/).

## Adding a dataset
1 change: 1 addition & 0 deletions cache/.gitignore
@@ -0,0 +1 @@
artifacts
3 changes: 3 additions & 0 deletions cache/README.md
@@ -0,0 +1,3 @@
# cache

Handles artifact fetching and caching. This folder has a `Snakefile` that contains a single function for producing fetch rules.
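
For example, a dataset's `Snakefile` can include this one and declare its fetches as a dictionary (a sketch mirroring the depmap dataset's `Snakefile` below; the entries are illustrative):

```py
include: "../../cache/Snakefile"

produce_fetch_rules({
    # A plain list[str] directive fetches the artifact as-is.
    "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
    # A FetchConfig can additionally uncompress .gz artifacts.
    "raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
})
```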
34 changes: 34 additions & 0 deletions cache/Snakefile
@@ -0,0 +1,34 @@
from cache import link
from cache.util import uncompress
import urllib.parse
from dataclasses import dataclass
from typing import Union
from pathlib import Path

@dataclass
class FetchConfig:
    directive: list[str]
    uncompress: bool = False

def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
    """
    Produces fetch rules based on a dictionary mapping
    output files to their `directory.py`-based directives.
    """
    # Wrap plain list[str] inputs in a FetchConfig
    input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()}

    directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()]
    assert len(directives) == len(set(directives)), "Directives aren't unique!"

    for output_file, config in input_dict.items():
        # Since placeholders are evaluated when the job is actually run,
        # we pass data using params and output.
        rule:
            name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
            output: file=output_file
            params:
                config=config
            run:
                Path(output.file).parent.mkdir(parents=True, exist_ok=True)
                link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
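
For intuition, each dictionary entry expands to roughly the following hand-written rule (a sketch; the actual rule name is the `quote_plus`-escaped directive and output path):

```py
# Sketch of what produce_fetch_rules generates for the entry
# "raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"]:
rule:
    name: "fetch_DepMap%2FOmicsProfiles.csv_to_raw%2FOmicsProfiles.csv"
    output: file="raw/OmicsProfiles.csv"
    params:
        config=FetchConfig(["DepMap", "OmicsProfiles.csv"])
    run:
        Path(output.file).parent.mkdir(parents=True, exist_ok=True)
        link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
```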
82 changes: 82 additions & 0 deletions cache/__init__.py
@@ -0,0 +1,82 @@
"""
This is how spras-benchmarking handles artifact caching. `cache` should only be used from inside a `Snakefile`.
"""

from cache.util import uncompress as uncompress_file
from cache.directory import get_cache_item
from pathlib import Path
import os
from urllib.parse import quote_plus
import pickle

__all__ = ["link"]

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
artifacts_dir = dir_path / "artifacts"

def get_artifact_name(directive: list[str]) -> str:
    return quote_plus("/".join(directive))

def has_expired(directive: list[str]) -> bool:
    """
    Check if the artifact metadata associated with a directive has expired.
    Avoids re-downloading the artifact if nothing has changed.
    """
    artifact_name = get_artifact_name(directive)
    cache_item = get_cache_item(directive)

    metadata_dir = artifacts_dir / 'metadata'
    metadata_dir.mkdir(exist_ok=True)
    metadata_file = metadata_dir / (artifact_name + '.metadata')

    # metadata never existed: we need to retrieve the new file
    if not metadata_file.exists():
        with open(metadata_file, 'wb') as f:
            pickle.dump(cache_item, f)
        return True

    with open(metadata_file, 'rb') as f:
        old_cache_item = pickle.load(f)

    # metadata expired: re-retrieve the item
    if old_cache_item != cache_item:
        with open(metadata_file, 'wb') as f:
            pickle.dump(cache_item, f)
        return True

    # metadata hasn't changed and already existed: this hasn't expired
    return False
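
In practice, the expiry semantics look like this (a hypothetical session; note that `has_expired` rewrites the on-disk metadata as a side effect):

```py
from cache import has_expired

# First call: no metadata on disk yet, so the artifact must be fetched.
has_expired(["DISEASES", "HumanDO.tsv"])  # -> True
# The CacheItem in cache/directory.py is unchanged: nothing to re-fetch.
has_expired(["DISEASES", "HumanDO.tsv"])  # -> False
# After editing that CacheItem's URLs in cache/directory.py, the pickled
# metadata no longer compares equal, so the next call returns True again.
```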

def link(output: str, directive: list[str], uncompress=False):
    """
    Links output files from cache.directory directives.
    For example,

    ```py
    link("output/ensg-ensp.tsv", ["BioMart", "ensg-ensp.tsv"])
    ```

    would check the cache for BioMart's ENSG-ENSP mapping, downloading it if
    needed, then symlink the desired `output` to the cached artifact (which
    lives somewhere in the cache folder).
    """

    artifacts_dir.mkdir(exist_ok=True)

    artifact_name = get_artifact_name(directive)

    Path(output).unlink(missing_ok=True)

    # Re-download if the directive has expired.
    cache_item = get_cache_item(directive)
    if has_expired(directive):
        (artifacts_dir / artifact_name).unlink(missing_ok=True)
        cache_item.download(artifacts_dir / artifact_name)

    if uncompress:
        uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
        uncompressed_artifact_path.unlink(missing_ok=True)
        uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
        Path(output).symlink_to(uncompressed_artifact_path)
    else:
        Path(output).symlink_to(artifacts_dir / artifact_name)
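
A direct call with uncompression, mirroring what the generated fetch rules do (paths are illustrative; unlike the generated rules, this assumes `raw/` already exists):

```py
from cache import link

# Fetch UniProt's gzipped ID mapping (re-downloading only if the cached
# metadata expired) and symlink an uncompressed copy to the output path.
link(
    "raw/HUMAN_9606_idmapping.tsv",
    ["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"],
    uncompress=True,
)
```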
1 change: 1 addition & 0 deletions cache/biomart/README.md
@@ -1,3 +1,4 @@
# BioMart XML Queries

Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).
See MartView: https://www.ensembl.org/biomart/martview.
61 changes: 46 additions & 15 deletions cache/directory.py
@@ -12,6 +12,7 @@

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))


def fetch_biomart_url(xml: str) -> str:
"""
Access BioMart data through the BioMart REST API:
@@ -20,16 +21,31 @@ def fetch_biomart_url(xml: str) -> str:
ROOT = "http://www.ensembl.org/biomart/martservice?query="
return ROOT + urllib.parse.quote_plus(xml)


@dataclass
class CacheItem:
"""Class for differentriating between offline and online items in a cache."""

name: str
"""The display name of the artifact, used for human-printing."""
cached: str
online: str

@classmethod
def cache_only(cls, name: str, cached: str) -> "CacheItem":
"""Wrapper method to explicitly declare a CacheItem as cached only."""
return cls(name=name, online=cached, cached="")

def download(self, output: str | PathLike):
print(f"Fetching {self.name}...")
print(f"Downloading {self.online}...")

if self.cached == "":
# From CacheItem.cache_only
# (gdown doesn't take in Paths for the output_file, so we must stringify it here)
gdown.download(self.online, str(output))
return

urllib.request.urlretrieve(self.online, output)

with NamedTemporaryFile() as cached_file:
@@ -45,82 +61,97 @@ def download(self, output: str | PathLike):
directory: CacheDirectory = {
"STRING": {
"9606": {
"links": CacheItem(
"9606.protein.links.txt.gz": CacheItem(
name="STRING 9606 protein links",
cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
),
"aliases": CacheItem(
"9606.protein.aliases.txt.gz": CacheItem(
name="STRING 9606 protein aliases",
cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
)
),
}
},
"UniProt": {
# We use FTP when possible, but delegate to the UniProt REST API when doing so saves significant bandwidth.
"9606": {
# We prefer manually curated genes.
"SwissProt_9606.tsv": CacheItem(
name="UniProt 9606 SwissProt genes",
cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"
online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
),
"HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
name="UniProt 9606 ID external database mapping",
cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz",
),
"HUMAN_9606_idmapping.dat.gz": CacheItem(
name="UniProt 9606 internal id mapping",
cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O",
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
)
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz",
),
}
},
"DISEASES": {
# Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
# archived files directory.
"tiga_gene-trait_stats.tsv": CacheItem(
name="TIGA data",
cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
),
"HumanDO.tsv": CacheItem(
name="Disease ontology data",
cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
),
"human_disease_textmining_filtered.tsv": CacheItem(
name="DISEASES textmining channel",
cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
),
"human_disease_knowledge_filtered.tsv": CacheItem(
name="DISEASES knowledge channel",
cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
),
},
"BioMart": {
"ensg-ensp.tsv": CacheItem(
name="BioMart ENSG <-> ENSP mapping",
cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text())
online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()),
)
},
"DepMap": {
"OmicsProfiles.csv": CacheItem(
name="DepMap omics metadata",
cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads",
),
"CRISPRGeneDependency.csv": CacheItem(
name="DepMap gene dependency probability estimates",
cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads",
),
"OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
name="DepMap genotyped matrix",
cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads",
),
"OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
name="DepMap model-level TPMs",
cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads",
),
"OmicsCNGeneWGS.csv": CacheItem(
name="DepMap gene-level copy number data",
cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"
)
}
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
),
},
}


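To make the directive scheme concrete: `get_cache_item` (imported by `cache/__init__.py`) presumably resolves a directive by walking this nested `directory` dict, so a directive is simply the path of keys leading to a `CacheItem`:

```py
from cache.directory import get_cache_item

# The directive ["STRING", "9606", "9606.protein.links.txt.gz"] names
# directory["STRING"]["9606"]["9606.protein.links.txt.gz"] above.
item = get_cache_item(["STRING", "9606", "9606.protein.links.txt.gz"])
print(item.name)  # "STRING 9606 protein links"
```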
1 change: 0 additions & 1 deletion cache/index.py

This file was deleted.

1 change: 1 addition & 0 deletions databases/util.py → cache/util.py
@@ -2,6 +2,7 @@
import gzip
import shutil


def uncompress(source: Path, target: Path):
"""Uncompresses a .gz file"""
# Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775
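Usage is a single call (illustrative paths):

```py
from pathlib import Path
from cache.util import uncompress

# Expand a downloaded .gz artifact into a plain file alongside it.
uncompress(Path("artifacts/data.tsv.gz"), Path("artifacts/data.tsv"))
```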
3 changes: 0 additions & 3 deletions databases/README.md

This file was deleted.

Empty file removed databases/__init__.py
Empty file.
49 changes: 0 additions & 49 deletions databases/stringdb.py

This file was deleted.

24 changes: 12 additions & 12 deletions datasets/depmap/Snakefile
@@ -1,22 +1,22 @@
include: "../../cache/Snakefile"

rule all:
# We currently only care about the FADU cell line.
input:
"processed/FADU_cell_line_prizes_input_nonzero.txt",
"processed/FADU_cell_line_prizes.txt",
"processed/FADU_gold_standard_thresh_0_5.txt"

rule fetch:
output:
"raw/CRISPRGeneDependency.csv",
"raw/OmicsProfiles.csv",
"raw/OmicsSomaticMutationsMatrixDamaging.csv",
"raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv",
"raw/OmicsCNGeneWGS.csv",
"raw/HUMAN_9606_idmapping.tsv",
"raw/HUMAN_9606_idmapping_selected.tsv",
"raw/SwissProt_9606.tsv"
shell:
"uv run scripts/fetch.py"
produce_fetch_rules({
"raw/CRISPRGeneDependency.csv": ["DepMap", "CRISPRGeneDependency.csv"],
"raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"],
"raw/OmicsSomaticMutationsMatrixDamaging.csv": ["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"],
"raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv": ["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"],
"raw/OmicsCNGeneWGS.csv": ["DepMap", "OmicsCNGeneWGS.csv"],
"raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
"raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
"raw/SwissProt_9606.tsv": ["UniProt", "9606", "SwissProt_9606.tsv"],
})

rule mapping:
input:
1 change: 1 addition & 0 deletions datasets/depmap/scripts/cell_line_processing.py
@@ -131,6 +131,7 @@ def process_single_cell_line(
print(f"Processing for cell line '{cell_line_name}' completed successfully.")
return True


def generate_gold_standard(cell_line_name, model_id, CRISPR_dependency, gene_to_uniprot, threshold: float):
"""Generate gold standard file for the cell line based on CRISPR dependency and gene to Uniprot mapping."""
# map Uniprot IDs to gene symbols in the CRISPR dependency data