
Commit 6c74c50

refactor: cache
1 parent ecca2b4 commit 6c74c50

File tree

12 files changed (+108, -79 lines)


cache/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+artifacts

cache/README.md

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 # cache
 
-Handles artifact fetching and cache.
+Handles artifact fetching and cache. This folder has a `Snakefile` which only contains a single function used for producing fetching rules.

cache/Snakefile

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+from cache import link
+from cache.util import uncompress
+import urllib.parse
+from dataclasses import dataclass
+from typing import Union
+from pathlib import Path
+
+@dataclass
+class FetchConfig:
+    directive: list[str]
+    uncompress: bool = False
+
+def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
+    """
+    Produces fetch rules based on a dictionary mapping
+    output files to their directory.py-based directive.
+    """
+    # Wrap plain list[str] inputs with FetchConfig
+    input_dict = {k: FetchConfig(v) if isinstance(v, list) else v for k, v in input_dict.items()}
+
+    directives = [urllib.parse.quote_plus("/".join(directive.directive)) for directive in input_dict.values()]
+    assert len(directives) == len(set(directives)), "Directives aren't unique!"
+
+    for output_file, config in input_dict.items():
+        # Since placeholders are evaluated when the job is actually run,
+        # we pass data using params and output.
+        rule:
+            name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
+            output: file=output_file
+            params:
+                config=config
+            run:
+                Path(output.file).parent.mkdir(exist_ok=True)
+                link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)
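
For orientation (not part of the commit): the loop above dynamically declares one Snakemake rule per dictionary entry, so for a single entry it is roughly equivalent to hand-writing the following. The output path and directive are taken from the diseases dataset Snakefile further down in this commit; the rule name is the URL-quoted directive and output path.

rule:
    name: "fetch_DISEASES%2FHumanDO.tsv_to_raw%2FHumanDO.tsv"
    output: file="raw/HumanDO.tsv"
    params:
        # FetchConfig is defined in cache/Snakefile; uncompress defaults to False
        config=FetchConfig(["DISEASES", "HumanDO.tsv"])
    run:
        # create the output directory, then link the cached artifact into place
        Path(output.file).parent.mkdir(exist_ok=True)
        link(Path(output.file), params.config.directive, uncompress=params.config.uncompress)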

cache/__init__.py

Lines changed: 53 additions & 5 deletions
@@ -2,17 +2,53 @@
 This is how spras-benchmarking handles artifact caching. `cache` should be used specifically inside `Snakefile`
 """
 
+from cache.util import uncompress as uncompress_file
 from cache.directory import get_cache_item
 from pathlib import Path
 import os
 from urllib.parse import quote_plus
+import pickle
 
 __all__ = ["link"]
 
 dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
+artifacts_dir = dir_path / "artifacts"
 
+def get_artifact_name(directive: list[str]) -> str:
+    return quote_plus("/".join(directive))
 
-def link(output: str, directive: list[str]):
+def has_expired(directive: list[str]) -> bool:
+    """
+    Check if the artifact metadata associated with a directive has expired.
+    Avoids re-downloading the artifact if nothing has changed.
+    """
+    artifact_name = get_artifact_name(directive)
+    cache_item = get_cache_item(directive)
+
+    metadata_dir = artifacts_dir / 'metadata'
+    metadata_dir.mkdir(exist_ok=True)
+    metadata_file = (artifacts_dir / 'metadata' / artifact_name).with_suffix((artifacts_dir / artifact_name).suffix + '.metadata')
+
+    # metadata never existed: we need to retrieve the new file
+    if not metadata_file.exists():
+        with open(metadata_file, 'wb') as f:
+            pickle.dump(cache_item, f)
+        return True
+
+    old_cache_item = None
+    with open(metadata_file, 'rb') as f:
+        old_cache_item = pickle.load(f)
+
+    # metadata expired: re-retrieve the item
+    if old_cache_item != cache_item:
+        with open(metadata_file, 'wb') as f:
+            pickle.dump(cache_item, f)
+        return True
+
+    # metadata hasn't changed and already existed: this hasn't expired
+    return False
+
+def link(output: str, directive: list[str], uncompress=False):
     """
     Links output files from cache.directory directives.
     For example,
@@ -25,10 +61,22 @@ def link(output: str, directive: list[str]):
     (lying somewhere in the cache folder) with the desired `output`.
     """
 
-    artifacts_dir = dir_path / "artifacts"
     artifacts_dir.mkdir(exist_ok=True)
 
-    artifact_name = quote_plus("/".join(directive))
+    artifact_name = get_artifact_name(directive)
+
+    Path(output).unlink(missing_ok=True)
+
+    # Re-download if the directive has expired.
+    cache_item = get_cache_item(directive)
+    if has_expired(directive):
+        (artifacts_dir / artifact_name).unlink(missing_ok=True)
+        cache_item.download(artifacts_dir / artifact_name)
 
-    get_cache_item(directive).download(artifacts_dir / artifact_name)
-    (artifacts_dir / artifact_name).symlink_to(output)
+    if uncompress:
+        uncompressed_artifact_path = Path(str(artifacts_dir / artifact_name) + '.uncompressed')
+        uncompressed_artifact_path.unlink(missing_ok=True)
+        uncompress_file(artifacts_dir / artifact_name, uncompressed_artifact_path)
+        Path(output).symlink_to(uncompressed_artifact_path)
+    else:
+        Path(output).symlink_to(artifacts_dir / artifact_name)
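
As an illustration of the new flow in link() (a sketch, not code from this commit), fetching a single directive from the repository root would be expected to leave roughly the layout below, assuming the cache package lives at cache/ and the output directory already exists (the generated fetch rules create it).

from cache import link

# Downloads (or reuses) the cached artifact, records its pickled metadata,
# and symlinks it to the requested output path.
link("raw/HumanDO.tsv", ["DISEASES", "HumanDO.tsv"])

# Expected result (illustrative paths):
#   cache/artifacts/DISEASES%2FHumanDO.tsv                     - downloaded artifact
#   cache/artifacts/metadata/DISEASES%2FHumanDO.tsv.metadata   - pickled CacheItem checked by has_expired()
#   raw/HumanDO.tsv -> cache/artifacts/DISEASES%2FHumanDO.tsv  - symlink created at the output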

cache/directory.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def download(self, output: str | PathLike):
         cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
         online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
     ),
-    "9606.protein.alliases.txt.gz": CacheItem(
+    "9606.protein.aliases.txt.gz": CacheItem(
         name="STRING 9606 protein aliases",
         cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
         online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",

databases/README.md

Lines changed: 0 additions & 3 deletions
This file was deleted.

databases/__init__.py

Whitespace-only changes.

datasets/diseases/Snakefile

Lines changed: 14 additions & 21 deletions
@@ -1,4 +1,4 @@
-from cache import link
+include: "../../cache/Snakefile"
 
 rule all:
     input:
@@ -7,29 +7,21 @@ rule all:
         "prize_files/alopecia_areata_prizes.txt",
         "prize_files/diabetes_mellitus_prizes.txt"
 
-rule fetch:
-    output:
-        a="raw/human_disease_knowledge_filtered.tsv",
-        b="raw/human_disease_textmining_filtered.tsv",
-        c="raw/HumanDO.tsv",
-        d="raw/tiga_gene-trait_stats.tsv",
-        e="raw/ensg-ensp.tsv"
-        f="raw/9606.protein.links.v12.0.txt",
-        g="raw/9606.protein.aliases.v12.0.txt"
-    run:
-        link(output.a, ["DISEASES", "human_disease_textmining_filtered.tsv"])
-        link(output.b, ["DISEASES", "human_disease_textmining_filtered.tsv"])
-        link(output.c, ["DISEASES", "HumanDO.tsv"])
-        link(output.d, ["DISEASES", "tiga_gene-tra)it_stats.tsv"])
-        link(output.e, ["BioMart", "ensg-ensp.tsv"])
-        link(output.f, ["STRING", "9606", "9606.protein.links.v12.0.txt"])
-        link(output.g, ["STRING", "9606", "9606.protein.aliases.v12.0.txt"])
+produce_fetch_rules({
+    "raw/human_disease_textmining_filtered.tsv": ["DISEASES", "human_disease_textmining_filtered.tsv"],
+    "raw/human_disease_knowledge_filtered.tsv": ["DISEASES", "human_disease_knowledge_filtered.tsv"],
+    "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
+    "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
+    "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
+    "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True),
+    "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
+})
 
 rule inputs:
     input:
         "raw/HumanDO.tsv",
         "raw/tiga_gene-trait_stats.tsv",
-        "raw/9606.protein.aliases.v12.0.txt"
+        "raw/9606.protein.aliases.txt"
     output:
        "data/inputs.csv"
    shell:
@@ -39,7 +31,8 @@ rule gold_standard:
    input:
        "raw/human_disease_knowledge_filtered.tsv",
        "raw/human_disease_textmining_filtered.tsv",
-        "raw/9606.protein.aliases.v12.0.txt"
+        "raw/9606.protein.aliases.txt",
+        "raw/ensg-ensp.tsv"
    output:
        "data/gold_standard.csv"
    shell:
@@ -49,7 +42,7 @@ rule files:
    input:
        "data/inputs.csv",
        "data/gold_standard.csv",
-        "raw/9606.protein.links.v12.0.txt"
+        "raw/9606.protein.links.txt"
    output:
        # These are the two we use for the SPRAS run for now
        "GS_files/Alopecia_areata_GS.txt",

datasets/diseases/scripts/fetch.py

Lines changed: 0 additions & 44 deletions
This file was deleted.

datasets/diseases/scripts/files.py

Lines changed: 2 additions & 2 deletions
@@ -40,9 +40,9 @@ def main():
         df = df[["str_id"]]
         df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=None)
 
-    # See /databases/stringdb.py for information on how this was grabbed.
+    # See /cache/directory.py for information on how this was grabbed.
     # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
-    string = pd.read_csv(diseases_path / ".." / ".." / "databases" / "string" / "9606.protein.links.v12.0.txt", sep=" ", skiprows=[0], header=None)
+    string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None)
 
     # Threshold anything above a confidence score of 900 to trim down the background interactome
     string = string[string.iloc[:, 2] > 900]
