Commit 7b08488

refactor: yeast-osmotic-stress (#54)
* chore: drop raw, use fetching
* fix: correct paths, docs: clarify on unused files
* docs: raw files, extra qs
1 parent 51e3949 commit 7b08488

File tree

14 files changed: +73 -9989 lines changed


cache/Snakefile

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ def produce_fetch_rules(input_dict: dict[str, Union[FetchConfig, list[str]]]):
     # Since placeholders are evaluated when the job is actually ran,
     # we pass data using params and output.
     rule:
-        name: f"fetch_{urllib.parse.quote_plus("/".join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
+        name: f"fetch_{urllib.parse.quote_plus('/'.join(config.directive))}_to_{urllib.parse.quote_plus(output_file)}"
         output: file=output_file
         params:
             config=config
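
For context on this one-line change: before Python 3.12 (PEP 701), an f-string expression cannot reuse the f-string's own quote character, so the inner `"/"` was a SyntaxError on older interpreters; switching to `'/'` keeps the generated rule names portable. A minimal sketch of the effect (the `parts` and `output_file` values here are made up for illustration):

```python
import urllib.parse

# Hypothetical directive and output path, mirroring what produce_fetch_rules receives.
parts = ["OsmoticStress", "prizes.txt"]
output_file = "raw/prizes.txt"

# f"...{"/".join(parts)}..." fails to parse before Python 3.12; single quotes work everywhere.
rule_name = f"fetch_{urllib.parse.quote_plus('/'.join(parts))}_to_{urllib.parse.quote_plus(output_file)}"
print(rule_name)  # fetch_OsmoticStress%2Fprizes.txt_to_raw%2Fprizes.txt
```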

cache/directory.py

Lines changed: 33 additions & 0 deletions

@@ -152,6 +152,39 @@ def download(self, output: str | PathLike):
             online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
         ),
     },
+    "OsmoticStress": {
+        "yeast_pcsf_network.sif": CacheItem.cache_only(
+            # In the paper https://doi.org/10.1016/j.celrep.2018.08.085
+            name="Case Study Edge Results, from Supplementary Data 3",
+            cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h"
+        ),
+        # The following files are from https://github.com/gitter-lab/osmotic-stress
+        "prizes.txt": CacheItem(
+            name="Osmotic Stress Prizes",
+            online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt",
+            cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg"
+        ),
+        "ChasmanNetwork-DirUndir.txt": CacheItem(
+            name="Network Input",
+            online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt",
+            cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH"
+        ),
+        "dummy.txt": CacheItem(
+            name="Dummy Nodes File",
+            online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt",
+            cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU"
+        ),
+        "_edgeFreq.eda": CacheItem(
+            name="Case Study Omics Integrator Edge Frequencies",
+            online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda",
+            cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR"
+        ),
+        "goldStandardUnionDetailed.txt": CacheItem(
+            name="Gold Standard Reference Pathways",
+            online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt",
+            cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T"
+        ),
+    },
 }
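
Each entry above pairs a canonical `online` URL with a Google Drive `cached` mirror, and `CacheItem.cache_only` covers files that only exist as a mirror. The real `CacheItem` class is defined elsewhere in `cache/directory.py`; the following is only a hedged sketch of that shape, and the `resolve_url` helper is an assumption for illustration, not the project's API:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class CacheItemSketch:
    """Illustrative stand-in for the CacheItem entries registered above."""
    name: str
    cached: str                    # Google Drive mirror URL
    online: Optional[str] = None   # canonical upstream URL, if one exists

    @classmethod
    def cache_only(cls, name: str, cached: str) -> "CacheItemSketch":
        # Files such as yeast_pcsf_network.sif have no stable upstream URL.
        return cls(name=name, cached=cached)

    def resolve_url(self, prefer_online: bool = True) -> str:
        # Hypothetical helper: fall back to the mirror when there is no upstream source.
        return self.online if prefer_online and self.online else self.cached
```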

datasets/README.md

Lines changed: 9 additions & 0 deletions

@@ -2,3 +2,12 @@
 
 Datasets contains both the raw data (straight from the study/database), as well as Python scripts and an associated Snakemake file
 which take all of the raw data and produce SPRAS-compatible data.
+
+## Prior work
+
+Many of the datasets here have been stripped of their extra post-analysis. Here, we provide commit links to the original work.
+
+- [`hiv`](https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking)
+- [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases)
+- [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap)
+- [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress)

Lines changed: 2 additions & 1 deletion

@@ -1 +1,2 @@
-processed
+/processed
+/raw

datasets/yeast-osmotic-stress/README.md

Lines changed: 12 additions & 12 deletions

@@ -1,29 +1,29 @@
 # Overview
 
-This project is based on a published case study that studies the osmotic stress response of yeast cells by using proteomic data and Omics Integrator 1 to reconstruct pathways representing the cell response. Our [case study paper](https://doi.org/10.1016/j.celrep.2018.08.085) includes a time-series component and applies Omics Integrator first and then the Temporal Pathway Synthesizer second. Here we only examine the Omics Integrator results of that paper.
+This dataset is based on [_Synthesizing Signaling Pathways from Temporal Phosphoproteomic Data_](https://doi.org/10.1016/j.celrep.2018.08.085), which studies the osmotic stress response of yeast cells by using proteomic data and Omics Integrator 1 to reconstruct pathways representing the cell response.
+
+**NOTE**: The original paper also included a time-series component analyzed with the [Temporal Pathway Synthesizer](https://doi.org/10.1016/j.celrep.2018.08.085). Until SPRAS supports temporal graphs, we only examine the non-temporal parts of the paper; specifically, we aim to reproduce the [OmicsIntegrator1](https://github.com/fraenkel-lab/omicsIntegrator) results.
 
 The set of files here was used to prepare the input Yeast Proteomic Data and conduct analysis on the output run by Omics Integrator. I swapped the files out as I got different output to analyze and compare. This worked because the number of files I worked with was small.
 
 The major assumption here is that a user will copy the SPRAS repo separately and take the input (the prize1_dummies file and ChasmanNetwork-DirUndir.txt file) and config.yaml files here to run them with SPRAS. Then use the output files from SPRAS as the inputs to the notebooks here. I have included my ensemble file and pathway summary files here in order to run my notebooks as I did.
 
-## Environment
-
-All necessary packages are available at the top-level `pyproject.toml`.
-
 ## Scripts
 
-The SPRAS_output folder contains my best SPRAS ensemble output, a single parameter combination output pathway with a Beta parameter of 1.75 exactly, and the pathway summary file for the ensemble file. Copy your files in here to analyze your outputs.
-
-1_Dummy_Node_Add.ipynb - Run 1st: Determines the largest prize value within our input prizes file and adds 3 dummy nodes all assigned with the highest prize to our input file. Outputs a new prizes file with the nodes added. Processes raw prizes file into the prize1_dummies file. Use this prize1_dummies file as your input to SPRAS. Note: I determined that the prizes file already contained 2 of the 5 dummy nodes with prizes, because of this I manually appended the other 3 from the dummy.txt file.
-
-2_Node_Summary_Histo.ipynb - Run 2nd: Takes the pathway-summary file and creates a histogram of the node results that were collected with prizes. Helps begin to understand the outputs.
+There is only one script, `process_prizes.py`, which:
+1. Determines the largest prize value within our input prizes file and adds 3 dummy nodes, all assigned the same prize (the highest prize? TODO: this currently seems to be a magic value).
+1. Outputs a new prizes file with the nodes added.
+1. Processes the raw prizes file into the `prizes1_dummies.txt` file as SPRAS input. Note: I determined that the prizes file already contained 2 of the 5 dummy nodes with prizes; because of this, I manually appended the other 3 from the dummy.txt file.
 
-3_Oi1_Output_Eval.ipynb - Run 3rd: Main analysis file. Takes the best resulting ensemble pathway file, the gold standard nodes, the case study edge results, and the case study edge frequencies as input files. Performs various exploratory data analysis and data prep tasks. Main task is performing set overlap between case study edge results and our results. Creates stats for describing the difference. Includes Venn Diagram visualization code too. One key thing with this file is when trying to analyze the single pathway output file (instead of an ensemble file) you will need to change the path to point inside the folder with the single pathway output file.
+## Raw Files
 
-File_compare.py - Optional: I used this script to compare two network input files I received to confirm they were in fact the same input I needed for my Omics Integrator input. Can be used to compare any two files passed in as paths. This was specific to my case so if you use the input files here you do not need to run this.
+There are other raw files inside the Snakefile, but we don't use them here. We focus on these two raw files instead:
+- `prizes.txt`: From Supplementary Data 3, containing the prize data to be fed into SPRAS for reconstruction.
+- `ChasmanNetwork-DirUndir.txt`: The background interactome provided by [Pathway connectivity and signaling coordination in the yeast stress‐activated signaling network](https://doi.org/10.15252/msb.20145120).
 
 ## Future Work
 
+(_Note: results are from [this `config.yaml`](https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml)_).
 One huge factor in why my results may have been different than the original case study has to do with the lack of a dummy node parameter implemented in the SPRAS version of Omics Integrator 1, which allows a user to pass a file with a list of dummy nodes that the algorithm has to start its reconstructions through. This feature has since been added to SPRAS.
 
 In the case study they ran the tuned parameters with a Beta of 1.75 and r of 0.01 (to add edge noise) and generated 1000 forests. In my case Omics integrator doesn't have a way to run multiple outputs with the same parameter combination in order to ensemble the results and look at edge frequencies. My work around was to use `np.linspace` with a range between 1 and 2 and running 250 - 1000 parameter combinations. The idea being to run parameters as close to 1.75 as possible and compare the outputs.
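
For reference, a Beta sweep of the kind described in that last paragraph could be generated like this (the grid size of 250 is just one of the 250 - 1000 values mentioned, chosen here only for illustration):

```python
import numpy as np

# Evenly spaced Beta values in [1, 2]; with enough points, several fall very
# close to the case study's tuned Beta of 1.75.
betas = np.linspace(1, 2, num=250)

closest = betas[np.abs(betas - 1.75).argmin()]
print(f"{len(betas)} Beta values; closest to 1.75 is {closest:.4f}")
```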

datasets/yeast-osmotic-stress/Snakefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
1+
include: "../../cache/Snakefile"
2+
13
rule all:
24
input:
35
"processed/prizes1_dummies.txt",
46
"processed/network1.txt"
57

8+
# Not all of these files are used. Most of these were used in the original yeast-osmotic-stress data processing (see ../README.md)
9+
produce_fetch_rules({
10+
"raw/yeast_pcsf_network.sif": ["OsmoticStress", "yeast_pcsf_network.sif"],
11+
"raw/prizes.txt": ["OsmoticStress", "prizes.txt"],
12+
"raw/ChasmanNetwork-DirUndir.txt": ["OsmoticStress", "ChasmanNetwork-DirUndir.txt"],
13+
"raw/dummy.txt": ["OsmoticStress", "dummy.txt"],
14+
"raw/_edgeFreq.eda": ["OsmoticStress", "_edgeFreq.eda"],
15+
"raw/goldStandardUnionDetailed.txt": ["OsmoticStress", "goldStandardUnionDetailed.txt"]
16+
})
17+
618
rule process_prizes:
719
input:
820
"raw/prizes.txt"
Lines changed: 4 additions & 5 deletions

@@ -1,19 +1,18 @@
 # This prepares prizes with dummy nodes.
 import pandas as pd
 from pathlib import Path
-import os
 
-current_directory = Path(os.path.dirname(os.path.realpath(__file__)))
+data_directory = Path(__file__).parent.resolve()
 
 if __name__ == "__main__":
     # Get the raw prizes DF
-    prizes = current_directory / "raw" / "prizes.txt"
+    prizes = data_directory / "raw" / "prizes.txt"
     prizes_df = pd.read_csv(prizes, sep="\t", header=None, names=["NODEID", "prize"])
 
     # Use the manually curated prize info
-    # TODO: where did this come from?
+    # TODO: where did this score come from? These must be our three 'dummy' nodes, but this prize isn't even the highest prize.
     prizes_df2 = pd.DataFrame(data={"NODEID": ["YGR014W", "YDR420W", "YER118C"], "prize": 10.051863}, index=[1596, 1597, 1598])
 
-    new_prizes_path = current_directory / "processed" / "prizes1_dummies.txt"
+    new_prizes_path = data_directory / "processed" / "prizes1_dummies.txt"
     new_prizes = pd.concat([prizes_df, prizes_df2])
     new_prizes.to_csv(new_prizes_path, sep="\t", index=False, columns=["NODEID", "prize"], header=["NODEID", "prize"])
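
If the intent flagged by the TODO above is to give the dummy nodes the highest prize in the raw file rather than the hard-coded 10.051863, a variant along these lines would do that (this is an assumption about intent, not the committed behavior):

```python
import pandas as pd

# Derive the dummy-node prize from the data instead of hard-coding it.
prizes_df = pd.read_csv("raw/prizes.txt", sep="\t", header=None, names=["NODEID", "prize"])
highest_prize = prizes_df["prize"].max()

# The three dummy nodes noted in the README as missing from the raw prizes file.
dummy_nodes = pd.DataFrame({"NODEID": ["YGR014W", "YDR420W", "YER118C"], "prize": highest_prize})

new_prizes = pd.concat([prizes_df, dummy_nodes], ignore_index=True)
new_prizes.to_csv("processed/prizes1_dummies.txt", sep="\t", index=False)
```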
