Commit 8ff381f

Merge branch 'main' into synthetic
2 parents fc12b4e + b1e08b8

67 files changed (+8597 −1435 lines)

.github/workflows/publish.yml

Lines changed: 3 additions & 4 deletions

@@ -57,10 +57,9 @@ jobs:
       - name: Run Snakemake workflow for DMMMs
         shell: bash --login {0}
         run: snakemake --cores 4 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile
-      # TODO: re-enable PRAs once RN/synthetic data PRs are merged.
-      # - name: Run Snakemake workflow for PRAs
-      #   shell: bash --login {0}
-      #   run: snakemake --cores 1 --configfile configs/pra.yaml --show-failed-logs -s spras/Snakefile
+      - name: Run Snakemake workflow for PRAs
+        shell: bash --login {0}
+        run: snakemake --cores 4 --configfile configs/pra.yaml --show-failed-logs -s spras/Snakefile
       - name: Setup PNPM
         uses: pnpm/action-setup@v4
         with:

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -164,3 +164,6 @@ cython_debug/
 
 # pnpm
 .pnpm-store
+
+# mac
+.DS_Store

__init__.py

Whitespace-only changes.

cache/__init__.py

Whitespace-only changes.

cache/biomart/README.md

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+# BioMart XML Queries
+
+Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).

cache/biomart/ensg-ensp.xml

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+
+    <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
+        <Attribute name = "ensembl_peptide_id" />
+        <Attribute name = "ensembl_gene_id" />
+    </Dataset>
+</Query>

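This query asks BioMart for a two-column ENSP-to-ENSG mapping as headerless TSV. A minimal sketch of submitting it through the BioMart REST API that cache/directory.py wraps below; the working directory (repo root) and the five-row preview are assumptions for illustration:

    import urllib.parse
    import urllib.request
    from pathlib import Path

    # Read the XML query added in this commit (assumes the repo root as cwd).
    xml = Path("cache/biomart/ensg-ensp.xml").read_text()

    # The martservice endpoint takes the URL-encoded query and streams back TSV.
    url = "http://www.ensembl.org/biomart/martservice?query=" + urllib.parse.quote_plus(xml)

    with urllib.request.urlopen(url) as response:
        for line in response.read().decode().splitlines()[:5]:
            # Columns follow the <Attribute> order: ensembl_peptide_id, ensembl_gene_id.
            print(line.split("\t"))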
cache/directory.py

Lines changed: 140 additions & 0 deletions

@@ -0,0 +1,140 @@
+from dataclasses import dataclass
+from typing import Union
+from os import PathLike
+from tempfile import NamedTemporaryFile
+import urllib.request
+import filecmp
+import urllib.parse
+import os
+from pathlib import Path
+
+import gdown
+
+# Directory containing this file, used to resolve the bundled BioMart XML queries.
+dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
+
+def fetch_biomart_url(xml: str) -> str:
+    """
+    Access BioMart data through the BioMart REST API:
+    https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml
+    """
+    ROOT = "http://www.ensembl.org/biomart/martservice?query="
+    return ROOT + urllib.parse.quote_plus(xml)
+
+@dataclass
+class CacheItem:
+    """Class for differentiating between offline and online items in a cache."""
+
+    cached: str
+    online: str
+
+    def download(self, output: str | PathLike):
+        print(f"Downloading {self.online}...")
+
+        urllib.request.urlretrieve(self.online, output)
+
+        with NamedTemporaryFile() as cached_file:
+            print(f"Downloading cache {self.cached}...")
+            gdown.download(self.cached, cached_file.name)
+            print("Checking that downloaded artifact matches with cached artifact...")
+            if not filecmp.cmp(output, cached_file.name, shallow=False):
+                raise ValueError(f"Downloaded artifact does not match cached artifact: {self.online}")
+
+
+CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]
+
+# An *unversioned* directory list.
+directory: CacheDirectory = {
+    "STRING": {
+        "9606": {
+            "links": CacheItem(
+                cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
+                online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
+            ),
+            "aliases": CacheItem(
+                cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
+                online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
+            )
+        }
+    },
+    "UniProt": {
+        # We use FTP when possible, but we delegate to the UniProt REST API in cases that would save significant bandwidth.
+        "9606": {
+            # We prefer manually curated genes.
+            "SwissProt_9606.tsv": CacheItem(
+                cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
+                online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"
+            ),
+            "HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"
+            ),
+            "HUMAN_9606_idmapping.dat.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
+            )
+        }
+    },
+    "DISEASES": {
+        # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
+        # archived files directory.
+        "tiga_gene-trait_stats.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
+            online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
+        ),
+        "HumanDO.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
+            online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
+        ),
+        "human_disease_textmining_filtered.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
+            online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
+        ),
+        "human_disease_knowledge_filtered.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
+            online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
+        ),
+    },
+    "BioMart": {
+        "ensg-ensp.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
+            online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text())
+        )
+    },
+    "DepMap": {
+        "OmicsProfiles.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"
+        ),
+        "CRISPRGeneDependency.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsCNGeneWGS.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"
+        )
+    }
+}
+
+
+def get_cache_item(path: list[str]) -> CacheItem:
+    """Takes a path and gets the underlying cache item."""
+    assert len(path) != 0
+
+    current_item = directory
+    for entry in path:
+        if isinstance(current_item, CacheItem):
+            raise ValueError(f"Path {path} leads to a cache item too early!")
+        current_item = current_item[entry]
+
+    if not isinstance(current_item, CacheItem):
+        raise ValueError(f"Path {path} doesn't lead to a cache item")
+
+    return current_item

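As a usage sketch, a caller resolves a nested path in the directory and downloads the artifact; the output filename is arbitrary, and the import assumes the repository root is on sys.path:

    from cache.directory import get_cache_item

    # Resolve a nested entry; raises ValueError if the path is too short or too long.
    links = get_cache_item(["STRING", "9606", "links"])

    # Fetches the online artifact, then downloads the cached Google Drive copy
    # into a temporary file via gdown and verifies the two match byte-for-byte.
    links.download("9606.protein.links.v12.0.txt.gz")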
cache/index.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+# Artifact caching

configs/dmmm.yaml

Lines changed: 42 additions & 13 deletions

@@ -1,22 +1,19 @@
-# Base Settings
 hash_length: 7
-container_framework: docker
-unpack_singularity: false
 
-container_registry:
-  base_url: docker.io
-  owner: reedcompbio
+containers:
+  registry:
+    base_url: docker.io
+    owner: reedcompbio
+  framework: docker
+  unpack_singularity: false
 
 reconstruction_settings:
   locations:
     reconstruction_dir: "output"
-  run: true
 
 analysis:
   summary:
     include: false
-  graphspace:
-    include: false
   cytoscape:
     include: false
   ml:
@@ -45,14 +42,14 @@ algorithms:
 datasets:
   # TODO: use old parameters for datasets
   # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
-  - label: dmmmhiv060
+  - label: dmmmhiv_060
     node_files: ["processed_prize_060.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
-  - label: dmmmhiv05
+  - label: dmmmhiv_05
     node_files: ["processed_prize_05.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
   # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
@@ -61,3 +58,35 @@ datasets:
     edge_files: ["network1.txt"]
     other_files: []
     data_dir: "datasets/yeast-osmotic-stress/processed"
+  - label: dmmmdiseases_alopecia_areata
+    data_dir: datasets/diseases
+    edge_files:
+      - raw/string_interactome.txt
+    node_files:
+      - prize_files/alopecia_areata_prizes.txt
+    other_files: []
+  - label: dmmmdiseases_diabetes_mellitus
+    data_dir: datasets/diseases
+    edge_files:
+      - raw/string_interactome.txt
+    node_files:
+      - prize_files/diabetes_mellitus_prizes.txt
+    other_files: []
+  - label: dmmmdepmap_cellline_fadu
+    data_dir: datasets/depmap
+    edge_files: ["../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
+    node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"]
+    other_files: []
+gold_standards:
+  - label: gs0
+    node_files: ['GS_files/Alopecia_areata_GS.txt']
+    data_dir: "datasets/diseases"
+    dataset_labels: ["dmmmdiseases_alopecia_areata"]
+  - label: gs1
+    node_files: ['GS_files/Diabetes_mellitus_GS.txt']
+    data_dir: "datasets/diseases"
+    dataset_labels: ["dmmmdiseases_diabetes_mellitus"]
+  - label: gs_fadu
+    node_files: ["processed/FADU_gold_standard.txt"]
+    data_dir: datasets/depmap
+    dataset_labels: ["dmmmdepmap_cellline_fadu"]

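Since each gold standard's dataset_labels must name a dataset defined above it, a small sanity check (not part of SPRAS itself, and assuming PyYAML is installed) can catch label typos before a run:

    import yaml  # PyYAML

    with open("configs/dmmm.yaml") as f:
        config = yaml.safe_load(f)

    # Every gold standard must reference dataset labels that actually exist.
    dataset_labels = {d["label"] for d in config["datasets"]}
    for gs in config.get("gold_standards", []):
        missing = set(gs["dataset_labels"]) - dataset_labels
        assert not missing, f"{gs['label']} references unknown datasets: {missing}"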
configs/pra.yaml

Lines changed: 7 additions & 10 deletions

@@ -1,23 +1,20 @@
-# Base Settings
 # TODO: (same for dmmm.yaml): can we deduplicate this using snakemake?
 hash_length: 7
-container_framework: docker
-unpack_singularity: false
 
-container_registry:
-  base_url: docker.io
-  owner: reedcompbio
+containers:
+  registry:
+    base_url: docker.io
+    owner: reedcompbio
+  framework: docker
+  unpack_singularity: false
 
 reconstruction_settings:
   locations:
     reconstruction_dir: "output"
-  run: true
 
 analysis:
   summary:
     include: false
-  graphspace:
-    include: false
   cytoscape:
     include: false
   ml:
@@ -52,7 +49,7 @@ algorithms:
     include: true
 
 datasets:
-  - label: pramuscleskeletal2018
+  - label: prarn_muscleskeletal2018
     node_files: ["sources.txt", "targets.txt"]
     # DataLoader.py can currently only load a single edge file, which is the primary network
     edge_files: ["interactome.tsv"]
