2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -2,7 +2,7 @@

## Helping Out

There are `TODOs` that better enhance the reproducability of datasets or analysis of algorithm outputs, as well as
There are `TODOs` that improve the reproducibility and accuracy of datasets or the analysis of algorithm outputs, as well as
[open resolvable issues](https://github.com/Reed-CompBio/spras-benchmarking/).

## Adding a dataset
1 change: 1 addition & 0 deletions cache/.gitignore
@@ -0,0 +1 @@
artifacts
3 changes: 3 additions & 0 deletions cache/README.md
@@ -0,0 +1,3 @@
# cache

Handles artifact fetching and caching. This folder has a `Snakefile` that contains a single function for producing fetch rules.
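
For example, a dataset's `Snakefile` can include this one and declare its fetches as a dictionary (a sketch mirroring the depmap dataset's `Snakefile` below; the entries are illustrative):

```py
include: "../../cache/Snakefile"

produce_fetch_rules({
    # A plain list[str] directive fetches the artifact as-is.
    "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
    # A FetchConfig can additionally uncompress .gz artifacts.
    "raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
})
```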
34 changes: 34 additions & 0 deletions cache/Snakefile
@@ -0,0 +1,34 @@
from cache import link
from cache.util import uncompress
import urllib.parse
from dataclasses import dataclass
from typing import Union
from pathlib import Path

@dataclass
class FetchConfig:
    directive: list[str]
    uncompress: bool = False

def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
    """
    Produces fetch rules based on a dictionary mapping
    output files to their `directory.py`-based directives.
    """
    # Wrap plain list[str] inputs in a FetchConfig
    input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()}

    directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()]
    assert len(directives) == len(set(directives)), "Directives aren't unique!"

    for output_file, config in input_dict.items():
        # Since placeholders are evaluated when the job is actually run,
        # we pass data using params and output.
        rule:
            name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
            output: file=output_file
            params:
                config=config
            run:
                Path(output.file).parent.mkdir(parents=True, exist_ok=True)
                link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
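
For intuition, each dictionary entry expands to roughly the following hand-written rule (a sketch; the actual rule name is the `quote_plus`-escaped directive and output path):

```py
# Sketch of what produce_fetch_rules generates for the entry
# "raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"]:
rule:
    name: "fetch_DepMap%2FOmicsProfiles.csv_to_raw%2FOmicsProfiles.csv"
    output: file="raw/OmicsProfiles.csv"
    params:
        config=FetchConfig(["DepMap", "OmicsProfiles.csv"])
    run:
        Path(output.file).parent.mkdir(parents=True, exist_ok=True)
        link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
```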
82 changes: 82 additions & 0 deletions cache/__init__.py
@@ -0,0 +1,82 @@
"""
This is how spras-benchmarking handles artifact caching. `cache` should only be used from inside a `Snakefile`.
"""

from cache.util import uncompress as uncompress_file
from cache.directory import get_cache_item
from pathlib import Path
import os
from urllib.parse import quote_plus
import pickle

__all__ = ["link"]

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
artifacts_dir = dir_path / "artifacts"

def get_artifact_name(directive: list[str]) -> str:
    return quote_plus("/".join(directive))

def has_expired(directive: list[str]) -> bool:
    """
    Check if the artifact metadata associated with a directive has expired.
    Avoids re-downloading the artifact if nothing has changed.
    """
    artifact_name = get_artifact_name(directive)
    cache_item = get_cache_item(directive)

    metadata_dir = artifacts_dir / 'metadata'
    metadata_dir.mkdir(exist_ok=True)
    metadata_file = metadata_dir / (artifact_name + '.metadata')

    # metadata never existed: we need to retrieve the new file
    if not metadata_file.exists():
        with open(metadata_file, 'wb') as f:
            pickle.dump(cache_item, f)
        return True

    with open(metadata_file, 'rb') as f:
        old_cache_item = pickle.load(f)

    # metadata expired: re-retrieve the item
    if old_cache_item != cache_item:
        with open(metadata_file, 'wb') as f:
            pickle.dump(cache_item, f)
        return True

    # metadata hasn't changed and already existed: this hasn't expired
    return False
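
In practice, the expiry semantics look like this (a hypothetical session; note that `has_expired` rewrites the on-disk metadata as a side effect):

```py
from cache import has_expired

# First call: no metadata on disk yet, so the artifact must be fetched.
has_expired(["DISEASES", "HumanDO.tsv"])  # -> True
# The CacheItem in cache/directory.py is unchanged: nothing to re-fetch.
has_expired(["DISEASES", "HumanDO.tsv"])  # -> False
# After editing that CacheItem's URLs in cache/directory.py, the pickled
# metadata no longer compares equal, so the next call returns True again.
```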

def link(output: str, directive: list[str], uncompress=False):
    """
    Links output files from cache.directory directives.
    For example,

    ```py
    link("output/ensg-ensp.tsv", ["BioMart", "ensg-ensp.tsv"])
    ```

    would check the cache for BioMart's ENSG-ENSP mapping, downloading it if
    needed, then symlink the desired `output` to the cached artifact (which
    lives somewhere in the cache folder).
    """

    artifacts_dir.mkdir(exist_ok=True)

    artifact_name = get_artifact_name(directive)

    Path(output).unlink(missing_ok=True)

    # Re-download if the directive has expired.
    cache_item = get_cache_item(directive)
    if has_expired(directive):
        (artifacts_dir / artifact_name).unlink(missing_ok=True)
        cache_item.download(artifacts_dir / artifact_name)

    if uncompress:
        uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
        uncompressed_artifact_path.unlink(missing_ok=True)
        uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
        Path(output).symlink_to(uncompressed_artifact_path)
    else:
        Path(output).symlink_to(artifacts_dir / artifact_name)
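
A direct call with uncompression, mirroring what the generated fetch rules do (paths are illustrative; unlike the generated rules, this assumes `raw/` already exists):

```py
from cache import link

# Fetch UniProt's gzipped ID mapping (re-downloading only if the cached
# metadata expired) and symlink an uncompressed copy to the output path.
link(
    "raw/HUMAN_9606_idmapping.tsv",
    ["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"],
    uncompress=True,
)
```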
1 change: 1 addition & 0 deletions cache/biomart/README.md
@@ -1,3 +1,4 @@
# BioMart XML Queries

Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).
See MartView: https://www.ensembl.org/biomart/martview.
61 changes: 46 additions & 15 deletions cache/directory.py
@@ -12,6 +12,7 @@

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))


def fetch_biomart_url(xml: str) -> str:
"""
Access BioMart data through the BioMart REST API:
@@ -20,16 +21,31 @@ def fetch_biomart_url(xml: str) -> str:
ROOT = "http://www.ensembl.org/biomart/martservice?query="
return ROOT + urllib.parse.quote_plus(xml)


@dataclass
class CacheItem:
"""Class for differentriating between offline and online items in a cache."""

name: str
"""The display name of the artifact, used for human-printing."""
cached: str
online: str

@classmethod
def cache_only(cls, name: str, cached: str) -> "CacheItem":
"""Wrapper method to explicitly declare a CacheItem as cached only."""
return cls(name=name, online=cached, cached="")

def download(self, output: str | PathLike):
print(f"Fetching {self.name}...")
print(f"Downloading {self.online}...")

if self.cached == "":
# From CacheItem.cache_only
# (gdown doesn't take in Paths for the output_file, so we must stringify it here)
gdown.download(self.online, str(output))
return

urllib.request.urlretrieve(self.online, output)

with NamedTemporaryFile() as cached_file:
@@ -45,82 +61,97 @@ def download(self, output: str | PathLike):
directory: CacheDirectory = {
"STRING": {
"9606": {
"links": CacheItem(
"9606.protein.links.txt.gz": CacheItem(
name="STRING 9606 protein links",
cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
),
"aliases": CacheItem(
"9606.protein.aliases.txt.gz": CacheItem(
name="STRING 9606 protein aliases",
cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
)
),
}
},
"UniProt": {
# We use FTP when possible, but delegate to the UniProt REST API when doing so saves significant bandwidth.
"9606": {
# We prefer manually curated genes.
"SwissProt_9606.tsv": CacheItem(
name="UniProt 9606 SwissProt genes",
cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"
online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
),
"HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
name="UniProt 9606 ID external database mapping",
cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz",
),
"HUMAN_9606_idmapping.dat.gz": CacheItem(
name="UniProt 9606 internal id mapping",
cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O",
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
)
online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz",
),
}
},
"DISEASES": {
# Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
# archived files directory.
"tiga_gene-trait_stats.tsv": CacheItem(
name="TIGA data",
cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
),
"HumanDO.tsv": CacheItem(
name="Disease ontology data",
cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
),
"human_disease_textmining_filtered.tsv": CacheItem(
name="DISEASES textmining channel",
cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
),
"human_disease_knowledge_filtered.tsv": CacheItem(
name="DISEASES knowledge channel",
cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
),
},
"BioMart": {
"ensg-ensp.tsv": CacheItem(
name="BioMart ENSG <-> ENSP mapping",
cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text())
online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()),
)
},
"DepMap": {
"OmicsProfiles.csv": CacheItem(
name="DepMap omics metadata",
cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads",
),
"CRISPRGeneDependency.csv": CacheItem(
name="DepMap gene dependency probability estimates",
cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads",
),
"OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
name="DepMap genotyped matrix",
cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads",
),
"OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
name="DepMap model-level TPMs",
cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads",
),
"OmicsCNGeneWGS.csv": CacheItem(
name="DepMap gene-level copy number data",
cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"
)
}
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
),
},
}


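To make the directive scheme concrete: `get_cache_item` (imported by `cache/__init__.py`) presumably resolves a directive by walking this nested `directory` dict, so a directive is simply the path of keys leading to a `CacheItem`:

```py
from cache.directory import get_cache_item

# The directive ["STRING", "9606", "9606.protein.links.txt.gz"] names
# directory["STRING"]["9606"]["9606.protein.links.txt.gz"] above.
item = get_cache_item(["STRING", "9606", "9606.protein.links.txt.gz"])
print(item.name)  # "STRING 9606 protein links"
```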
1 change: 0 additions & 1 deletion cache/index.py

This file was deleted.

1 change: 1 addition & 0 deletions databases/util.py → cache/util.py
@@ -2,6 +2,7 @@
import gzip
import shutil


def uncompress(source: Path, target: Path):
"""Uncompresses a .gz file"""
# Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775
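Usage is a single call (illustrative paths):

```py
from pathlib import Path
from cache.util import uncompress

# Expand a downloaded .gz artifact into a plain file alongside it.
uncompress(Path("artifacts/data.tsv.gz"), Path("artifacts/data.tsv"))
```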
3 changes: 0 additions & 3 deletions databases/README.md

This file was deleted.

Empty file removed databases/__init__.py
Empty file.
49 changes: 0 additions & 49 deletions databases/stringdb.py

This file was deleted.

24 changes: 12 additions & 12 deletions datasets/depmap/Snakefile
@@ -1,22 +1,22 @@
include: "../../cache/Snakefile"

rule all:
# We currently only care about the FADU cell line.
input:
"processed/FADU_cell_line_prizes_input_nonzero.txt",
"processed/FADU_cell_line_prizes.txt",
"processed/FADU_gold_standard_thresh_0_5.txt"

rule fetch:
output:
"raw/CRISPRGeneDependency.csv",
"raw/OmicsProfiles.csv",
"raw/OmicsSomaticMutationsMatrixDamaging.csv",
"raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv",
"raw/OmicsCNGeneWGS.csv",
"raw/HUMAN_9606_idmapping.tsv",
"raw/HUMAN_9606_idmapping_selected.tsv",
"raw/SwissProt_9606.tsv"
shell:
"uv run scripts/fetch.py"
produce_fetch_rules({
"raw/CRISPRGeneDependency.csv": ["DepMap", "CRISPRGeneDependency.csv"],
"raw/OmicsProfiles.csv": ["DepMap", "OmicsProfiles.csv"],
"raw/OmicsSomaticMutationsMatrixDamaging.csv": ["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"],
"raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv": ["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"],
"raw/OmicsCNGeneWGS.csv": ["DepMap", "OmicsCNGeneWGS.csv"],
"raw/HUMAN_9606_idmapping.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"], uncompress=True),
"raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
"raw/SwissProt_9606.tsv": ["UniProt", "9606", "SwissProt_9606.tsv"],
})

rule mapping:
input:
1 change: 1 addition & 0 deletions datasets/depmap/scripts/cell_line_processing.py
@@ -131,6 +131,7 @@ def process_single_cell_line(
print(f"Processing for cell line '{cell_line_name}' completed successfully.")
return True


def generate_gold_standard(cell_line_name, model_id, CRISPR_dependency, gene_to_uniprot, threshold: float):
"""Generate gold standard file for the cell line based on CRISPR dependency and gene to Uniprot mapping."""
# map Uniprot IDs to gene symbols in the CRISPR dependency data