Reed-CompBio
diff --git a/‎.github/workflows/publish.yml‎
Lines changed: 3 additions & 4 deletions b/‎.github/workflows/publish.yml‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎__init__.py‎ b/‎__init__.py‎
diff --git a/‎cache/__init__.py‎ b/‎cache/__init__.py‎
diff --git a/‎cache/biomart/README.md‎
Lines changed: 3 additions & 0 deletions b/‎cache/biomart/README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎cache/biomart/ensg-ensp.xml‎
Lines changed: 9 additions & 0 deletions b/‎cache/biomart/ensg-ensp.xml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎cache/directory.py‎
Lines changed: 140 additions & 0 deletions b/‎cache/directory.py‎
Lines changed: 140 additions & 0 deletions
diff --git a/‎cache/index.py‎
Lines changed: 1 addition & 0 deletions b/‎cache/index.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎configs/dmmm.yaml‎
Lines changed: 42 additions & 13 deletions b/‎configs/dmmm.yaml‎
Lines changed: 42 additions & 13 deletions
diff --git a/‎configs/pra.yaml‎
Lines changed: 7 additions & 10 deletions b/‎configs/pra.yaml‎
Lines changed: 7 additions & 10 deletions
@@ -57,10 +57,9 @@ jobs:
       - name: Run Snakemake workflow for DMMMs
         shell: bash --login {0}
         run: snakemake --cores 4 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile
-      # TODO: re-enable PRAs once RN/synthetic data PRs are merged.
-      # - name: Run Snakemake workflow for PRAs
-      #   shell: bash --login {0}
-      #   run: snakemake --cores 1 --configfile configs/pra.yaml --show-failed-logs -s spras/Snakefile
+      - name: Run Snakemake workflow for PRAs
+        shell: bash --login {0}
+        run: snakemake --cores 4 --configfile configs/pra.yaml --show-failed-logs -s spras/Snakefile
       - name: Setup PNPM
         uses: pnpm/action-setup@v4
         with:
 
@@ -164,3 +164,6 @@ cython_debug/
 
 # pnpm
 .pnpm-store
+
+# mac
+.DS_Store
@@ -0,0 +1,3 @@
+# BioMart XML Queries
+
+Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+			
+	<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
+		<Attribute name = "ensembl_peptide_id" />
+		<Attribute name = "ensembl_gene_id" />
+	</Dataset>
+</Query>
@@ -0,0 +1,140 @@
+from dataclasses import dataclass
+from typing import Union
+from os import PathLike
+from tempfile import NamedTemporaryFile
+import urllib.request
+import filecmp
+import urllib.parse
+import os
+from pathlib import Path
+
+import gdown
+
+dir_path = Path(os.path.dirname(os.path.realpath(__file__)))
+
+def fetch_biomart_url(xml: str) -> str:
+    """
+    Access BioMart data through the BioMart REST API:
+    https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml
+    """
+    ROOT = "http://www.ensembl.org/biomart/martservice?query="
+    return ROOT + urllib.parse.quote_plus(xml)
+
+@dataclass
+class CacheItem:
+    """Class for differentriating between offline and online items in a cache."""
+
+    cached: str
+    online: str
+
+    def download(self, output: str | PathLike):
+        print(f"Downloading {self.online}...")
+
+        urllib.request.urlretrieve(self.online, output)
+
+        with NamedTemporaryFile() as cached_file:
+            print(f"Downloading cache {self.cached}...")
+            gdown.download(self.cached, cached_file)
+            print("Checking that downloaded artifact matches with cached artifact...")
+            filecmp.cmp(output, cached_file.name)
+
+
+CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]
+
+# An *unversioned* directory list.
+directory: CacheDirectory = {
+    "STRING": {
+        "9606": {
+            "links": CacheItem(
+                cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
+                online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
+            ),
+            "aliases": CacheItem(
+                cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
+                online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
+            )
+        }
+    },
+    "UniProt": {
+        # We use FTP when possible, but we delegate to the UniProt REST API in cases that would save significant bandwidth.
+        "9606": {
+            # We prefer manually curated genes.
+            "SwissProt_9606.tsv": CacheItem(
+                cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
+                online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"
+            ),
+            "HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"
+            ),
+            "HUMAN_9606_idmapping.dat.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
+            )
+        }
+    },
+    "DISEASES": {
+        # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
+        # archived files directory instead.
+        "tiga_gene-trait_stats.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
+            online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
+        ),
+        "HumanDO.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
+            online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
+        ),
+        "human_disease_textmining_filtered.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
+            online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
+        ),
+        "human_disease_knowledge_filtered.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
+            online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
+        ),
+    },
+    "BioMart": {
+        "ensg-ensp.tsv": CacheItem(
+            cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
+            online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text())
+        )
+    },
+    "DepMap": {
+        "OmicsProfiles.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"
+        ),
+        "CRISPRGeneDependency.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsCNGeneWGS.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"
+        )
+    }
+}
+
+
+def get_cache_item(path: list[str]) -> CacheItem:
+    """Takes a path and gets the underlying cache item."""
+    assert len(path) != 0
+
+    current_item = directory
+    for entry in path:
+        if isinstance(current_item, CacheItem):
+            raise ValueError(f"Path {path} leads to a cache item too early!")
+        current_item = current_item[entry]
+
+    if not isinstance(current_item, CacheItem):
+        raise ValueError(f"Path {path} doesn't lead to a cache item")
+
+    return current_item
@@ -0,0 +1 @@
+# Artifact caching
@@ -1,22 +1,19 @@
-# Base Settings
 hash_length: 7
-container_framework: docker
-unpack_singularity: false
 
-container_registry:
-  base_url: docker.io
-  owner: reedcompbio
+containers:
+  registry:
+    base_url: docker.io
+    owner: reedcompbio
+  framework: docker
+  unpack_singularity: false
 
 reconstruction_settings:
   locations:
     reconstruction_dir: "output"
-    run: true
 
 analysis:
   summary:
     include: false
-  graphspace:
-    include: false
   cytoscape:
     include: false
   ml:
@@ -45,14 +42,14 @@ algorithms:
 datasets:
   # TODO: use old parameters for datasets
   # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
-  - label: dmmmhiv060
+  - label: dmmmhiv_060
     node_files: ["processed_prize_060.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
-  - label: dmmmhiv05
+  - label: dmmmhiv_05
     node_files: ["processed_prize_05.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
   # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
@@ -61,3 +58,35 @@ datasets:
     edge_files: ["network1.txt"]
     other_files: []
     data_dir: "datasets/yeast-osmotic-stress/processed"
+  - label: dmmmdiseases_alopecia_areata
+    data_dir: datasets/diseases
+    edge_files:
+      - raw/string_interactome.txt
+    node_files:
+      - prize_files/alopecia_areata_prizes.txt
+    other_files: []
+  - label: dmmmdiseases_diabetes_mellitus
+    data_dir: datasets/diseases
+    edge_files:
+      - raw/string_interactome.txt
+    node_files:
+      - prize_files/diabetes_mellitus_prizes.txt
+    other_files: []
+  - label: dmmmdepmap_cellline_fadu
+    data_dir: datasets/depmap
+    edge_files: ["../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
+    node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"]
+    other_files: []
+gold_standards:
+  - label: gs0
+    node_files: ['GS_files/Alopecia_areata_GS.txt']
+    data_dir: "datasets/diseases"
+    dataset_labels: ["dmmmdiseases_alopecia_areata"]
+  - label: gs1
+    node_files: ['GS_files/Diabetes_mellitus_GS.txt']
+    data_dir: "datasets/diseases"
+    dataset_labels: ["dmmmdiseases_diabetes_mellitus"]
+  - label: gs_fadu
+    node_files: ["processed/FADU_gold_standard.txt"]
+    data_dir: datasets/depmap
+    dataset_labels: ["dmmmdepmap_cellline_fadu"]
@@ -1,23 +1,20 @@
-# Base Settings
 # TODO: (same for dmmm.yaml): can we deduplicate this using snakemake?
 hash_length: 7
-container_framework: docker
-unpack_singularity: false
 
-container_registry:
-  base_url: docker.io
-  owner: reedcompbio
+containers:
+  registry:
+    base_url: docker.io
+    owner: reedcompbio
+  framework: docker
+  unpack_singularity: false
 
 reconstruction_settings:
   locations:
     reconstruction_dir: "output"
-    run: true
 
 analysis:
   summary:
     include: false
-  graphspace:
-    include: false
   cytoscape:
     include: false
   ml:
@@ -52,7 +49,7 @@ algorithms:
       include: true
 
 datasets:
-  - label: pramuscleskeletal2018
+  - label: prarn_muscleskeletal2018
     node_files: ["sources.txt", "targets.txt"]
     # DataLoader.py can currently only load a single edge file, which is the primary network
     edge_files: ["interactome.tsv"]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# BioMart XML Queries`
	`2`	`+`
	`3`	`+Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).`