
Commit a7ed939

feat: unify workflow

1 parent b4e37dd commit a7ed939


10 files changed (+56, -87 lines)


datasets/synthetic_data/.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
 /intermediate
 /processed
 /raw
+/thresholded
```

datasets/synthetic_data/README.md

Lines changed: 5 additions & 29 deletions

````diff
@@ -7,17 +7,20 @@ This entire workflow can also be done with `uv run snakemake --cores 1` inside t
 
 ## Workflow
 
+The workflow follows these steps in order:
+
 ## PANTHER Pathway Fetching
 
 PANTHER pathways are fetched from a singular OWL file containing a bundled collection of all pathways. Since the OWL file that
-PathwayCommons provides is over 10gb, we have a separate Snakemake workflow, located nuder `./panther_pathways`, that trims down the OWL file
+PathwayCommons provides is over 10gb, we have a separate Snakemake workflow, located under `./panther_pathways`, that trims down the OWL file
 to only contain pathways from PANTHER.
 
 Inside `scripts/fetch_pathway.py`, we use this intermediately-generated (and cached!) OWL file to individually generate associated OWL and
 SIF files for each pathway.
 
 We have a `./util/parse_pc_pathways.py`, which takes a `pathways.txt` provided by PathwayCommons, and allows us to map the
-human-readable pathway names in `pathways.jsonc` into [identifiers.org](https://identifiers.org/) identifiers.
+human-readable pathway names into [identifiers.org](https://identifiers.org/) identifiers, which we later trim down
+with our provided list of pathway names in `pathways.jsonc` using `list_curated_pathways.py`.
 
 ## Sources and Targets
 
@@ -26,30 +29,3 @@ are silico human surfaceomes receptors.
 
 [Targets]( https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv`, (see [original paper](https://doi.org/10.1093/nar/gkac907))
 are human transcription factors.
-
-### 1. Process PANTHER Pathways
-
-1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
-2. Run the command:
-```sh
-uv run scripts/process_panther_pathway.py <pathway>
-```
-3. This will create five new files in the respective `pathway` subfolder of the `pathway-data/` directory:
-- `edges.txt`
-- `nodes.txt`
-- `prizes-100.txt`
-- `sources.txt`
-- `targets.txt`
-
-### 2. Convert Pathways to SPRAS-Compatible Format
-1. In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
-2. From the synthetic_data/ directory, run the command:
-```
-python scripts/panther_spras_formatting.py
-```
-3. This will create a new folder named `spras-compatible-pathway-data`, containing subfolders for each PANTHER pathway in SPRAS-compatible format.
-Each subfolder will include the following three files:
-- `<pathway_name>_gs_edges.txt`
-- `<pathway_name>_gs_nodes.txt`
-- `<pathway_name>_node_prizes.txt`
-
````

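The name-to-identifier mapping the README describes (map curated human-readable names to identifiers.org URIs, failing if a name is not uniquely resolvable) can be sketched as follows. This is a minimal, self-contained illustration: the row data, the `NAME` column, and the URIs are hypothetical stand-ins for PathwayCommons' `pathways.txt` (only the `PATHWAY_URI` column appears in this commit's code).

```python
import json

# Hypothetical rows standing in for PathwayCommons' pathways.txt;
# the real file is tab-separated and its name column may be labelled differently.
pc_rows = [
    {"NAME": "Wnt signaling pathway", "PATHWAY_URI": "https://identifiers.org/panther.pathway:P00057"},
    {"NAME": "FGF signaling pathway", "PATHWAY_URI": "https://identifiers.org/panther.pathway:P00021"},
    {"NAME": "Some uncurated pathway", "PATHWAY_URI": "https://identifiers.org/panther.pathway:P99999"},
]
curated = ["Wnt signaling pathway", "FGF signaling pathway"]  # stands in for pathways.jsonc

# Keep only curated names, failing loudly if a name is missing or ambiguous.
mapping = {}
for name in curated:
    matches = [row["PATHWAY_URI"] for row in pc_rows if row["NAME"] == name]
    if len(matches) != 1:
        raise RuntimeError(f"{name} references {len(matches)} pathways, when we need to uniquely get one!")
    mapping[name] = matches[0]

print(json.dumps(mapping, indent=4))
```

The uncurated row is silently dropped, which is the "trim down" step the README mentions.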
datasets/synthetic_data/Snakefile

Lines changed: 20 additions & 19 deletions

```diff
@@ -16,6 +16,7 @@ rule all:
 
 produce_fetch_rules({
     "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
+    "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
     "raw/human-interactome/table_S3_surfaceome.xlsx": ["Surfaceome", "table_S3_surfaceome.xlsx"],
     "raw/human-interactome/Homo_sapiens_TF.tsv": ["TranscriptionFactors", "Homo_sapiens_TF.tsv"],
     "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
@@ -26,12 +27,10 @@ produce_fetch_rules({
 
 rule interactome:
     input:
+        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv",
         "raw/9606.protein.links.full.v12.0.txt",
         "raw/9606.protein.aliases.txt"
-    output:
-        "processed/proteins_missing_aliases.csv",
-        "processed/removed_edges.txt",
-        "processed/interactome.tsv"
+    output: "processed/interactome.tsv"
     shell:
         "uv run scripts/interactome.py"
 
@@ -46,7 +45,7 @@ rule process_tfs:
 
 rule process_panther_pathway:
     input:
-        "intermediate/pathway-data/{pathway}.txt",
+        "intermediate/pathway-pc-data/{pathway}.sif",
         "raw/human-interactome/table_S3_surfaceome.xlsx",
         "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
     output:
@@ -56,7 +55,7 @@ rule process_panther_pathway:
         "intermediate/{pathway}/targets.txt",
         "intermediate/{pathway}/prizes.txt"
     shell:
-        "uv run scripts/process_panther_pathway.py {wildcards.pathway}"
+        'uv run scripts/process_panther_pathway.py "{wildcards.pathway}"'
 
 rule make_spras_compatible:
     input:
@@ -70,7 +69,7 @@ rule make_spras_compatible:
        "processed/{pathway}/{pathway}_gs_edges.txt",
        "processed/{pathway}/{pathway}_gs_nodes.txt"
     shell:
-        "uv run scripts/panther_spras_formatting.py {wildcards.pathway}"
+        'uv run scripts/panther_spras_formatting.py "{wildcards.pathway}"'
 
 rule threshold:
     input:
@@ -80,23 +79,25 @@ rule threshold:
        expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
        expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
     shell:
-        "uv run scripts/sampling.py {wildcards.pathway}"
+        'uv run scripts/sampling.py "{wildcards.pathway}"'
 
 rule make_pathway_map:
     input:
        "raw/pathways.txt"
     output:
-        "processed/pathway_id_mapping.tsv"
+        "intermediate/curated_pathways_id_mapping.json"
     shell:
        "uv run scripts/list_curated_pathways.py"
 
-for pathway in pathways:
-    rule:
-        input:
-            "processed/pathway_id_mapping.tsv",
-            "raw/pc-panther-biopax.owl"
-        output:
-            "intermediate/pathway-data/{pathway}.owl",
-            "intermediate/pathway-data/{pathway}.sif"
-        shell:
-            f"uv run scripts/fetch_pathway.py {pathway}"
+rule process_pathways:
+    input:
+        "intermediate/curated_pathways_id_mapping.json",
+        "raw/pc-panther-biopax.owl"
+    params:
+        # A little trick from https://stackoverflow.com/a/71327709/7589775
+        pathway=lambda wildcards: wildcards.get("pathway")
+    output:
+        "intermediate/pathway-pc-data/{pathway}.owl",
+        "intermediate/pathway-pc-data/{pathway}.sif"
+    shell:
+        'uv run scripts/fetch_pathway.py "{params.pathway}"'
```

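The repeated quoting change in this Snakefile (`{wildcards.pathway}` becoming `"{wildcards.pathway}"`) matters because PANTHER pathway names contain spaces. A minimal Python sketch of the failure mode, using the stdlib `shlex` module to emulate shell word splitting:

```python
import shlex

pathway = "Wnt signaling pathway"  # PANTHER pathway names contain spaces

unquoted = f"uv run scripts/fetch_pathway.py {pathway}"
quoted = f'uv run scripts/fetch_pathway.py "{pathway}"'

# Without quotes, the shell splits the name into three separate arguments:
assert shlex.split(unquoted)[3:] == ["Wnt", "signaling", "pathway"]
# With quotes, it survives as a single argv entry:
assert shlex.split(quoted)[3] == "Wnt signaling pathway"
```

The same reasoning explains the `params` indirection in `rule process_pathways`: the wildcard is exposed as a param so the shell command can wrap it in double quotes.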
datasets/synthetic_data/pathways.jsonc

Lines changed: 1 addition & 2 deletions

```diff
@@ -17,9 +17,8 @@
     "Hedgehog signaling pathway",
     "FGF signaling pathway",
     "FAS signaling pathway",
-    // This is actually the Endothelin signaling pathway.
     // TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00019.
-    "untitled",
+    // We want to add the Endothelin signaling pathway, but it is currently labelled under "untitled."
     "EGF receptor signaling pathway",
     "Cadherin signaling pathway",
     "Apoptosis signaling pathway",
```

datasets/synthetic_data/scripts/fetch_pathway.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -1,7 +1,7 @@
 import argparse
+import json
 from pathlib import Path
 
-import pandas
 from paxtools.fetch import fetch
 from paxtools.sif import toSIF
 
@@ -18,10 +18,10 @@ def parser():
 
 def main():
     args = parser().parse_args()
-    curated_pathways_df = pandas.read_csv(synthetic_directory / "intermediate" / "curated_pathways.tsv", sep="\t")
-    associated_id = curated_pathways_df.loc[curated_pathways_df["Name"] == args.pathway_name].reset_index(drop=True).loc[0]["ID"]
+    curated_pathways_df = json.loads((synthetic_directory / "intermediate" / "curated_pathways_id_mapping.json").read_text())
+    associated_id = curated_pathways_df[args.pathway_name]
 
-    pathway_data_dir = synthetic_directory / "intermediate" / "pathway-data"
+    pathway_data_dir = synthetic_directory / "intermediate" / "pathway-pc-data"
     pathway_data_dir.mkdir(exist_ok=True, parents=True)
 
     fetch(
```

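The change above swaps a pandas `DataFrame.loc` filter (plus a `reset_index` round trip) for a plain dict lookup on the JSON mapping. A small sketch of the new access pattern; the URI here is a hypothetical stand-in for an entry in `curated_pathways_id_mapping.json`:

```python
import json

# A minimal stand-in for intermediate/curated_pathways_id_mapping.json
serialized = json.dumps({
    "FGF signaling pathway": "https://identifiers.org/panther.pathway:P00021",
}, indent=4)

# The lookup is now a single dict indexing operation; an unknown pathway
# name raises KeyError immediately instead of failing inside pandas.
mapping = json.loads(serialized)
associated_id = mapping["FGF signaling pathway"]
print(associated_id)
```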
datasets/synthetic_data/scripts/list_curated_pathways.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -1,3 +1,4 @@
+import json
 from pathlib import Path
 from jsonc_parser.parser import JsoncParser
 import pandas
@@ -21,10 +22,7 @@ def main():
         if selected_pathways_count != 1:
             raise RuntimeError(f"{pathway} references {selected_pathways_count} pathways, when we need to uniquely get one!")
         pathway_mapping[pathway] = selected_pathways["PATHWAY_URI"].loc[0]
-    curated_pathway_df = pandas.DataFrame(pathway_mapping.items())
-    curated_pathway_df.columns = ["Name", "ID"]
-    (synthetic_directory / "intermediate").mkdir(exist_ok=True)
-    curated_pathway_df.to_csv(synthetic_directory / "intermediate" / "curated_pathways.tsv", index=False, sep="\t")
+    (synthetic_directory / "intermediate" / "curated_pathways_id_mapping.json").write_text(json.dumps(pathway_mapping, indent=4))
 
 
 if __name__ == "__main__":
```

datasets/synthetic_data/scripts/panther_spras_formatting.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -1,11 +1,11 @@
 import pandas as pd
 from pathlib import Path
-from .util.parser import parser
+from datasets.synthetic_data.scripts.util.parser import parser
 
-current_directory = Path(__file__).parent.resolve()
+synthetic_directory = Path(__file__).parent.parent.resolve()
 
-spras_compatible_dir = Path(current_directory, "..", "processed")
-directory = Path(current_directory, "..", "intermediate")
+spras_compatible_dir = synthetic_directory / "processed"
+directory = synthetic_directory / "intermediate"
 
 directed = [
     "controls-state-change-of",
```

datasets/synthetic_data/scripts/process_panther_pathway.py

Lines changed: 7 additions & 15 deletions

```diff
@@ -1,12 +1,13 @@
-import argparse
 import io
 import pandas as pd
 from pathlib import Path
 
-current_directory = Path(__file__).parent.resolve()
+from datasets.synthetic_data.scripts.util.parser import parser
 
-data_directory = current_directory / ".." / "raw" / "pathway-data"
-interactome_folder = current_directory / ".." / "raw" / "human-interactome"
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+data_directory = synthetic_directory / "intermediate" / "pathway-pc-data"
+interactome_folder = synthetic_directory / "raw" / "human-interactome"
 
 
 def process_pathway(file: Path, folder: Path):
@@ -65,18 +66,9 @@ def process_pathway(file: Path, folder: Path):
     scores["active"] = "true"
     scores.to_csv(folder / "prizes.txt", sep="\t", index=False)
 
-
-def parser():
-    parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
-
-    parser.add_argument("pathway", choices=[file.stem for file in data_directory.iterdir()])
-
-    return parser
-
-
 if __name__ == "__main__":
     pathway = parser().parse_args().pathway
-    pathway_file = data_directory / Path(pathway).with_suffix(".txt")
-    intermediate_folder = current_directory / ".." / "intermediate" / pathway
+    pathway_file = data_directory / Path(pathway).with_suffix(".sif")
+    intermediate_folder = synthetic_directory / "intermediate" / pathway
     intermediate_folder.mkdir(parents=True, exist_ok=True)
     process_pathway(pathway_file, intermediate_folder)
```

datasets/synthetic_data/scripts/sampling.py

Lines changed: 8 additions & 8 deletions

```diff
@@ -4,9 +4,9 @@
 from typing import OrderedDict, NamedTuple
 from tools.sample import attempt_sample
 from tools.trim import trim_data_file
-from .util.parser import parser
+from datasets.synthetic_data.scripts.util.parser import parser
 
-current_directory = Path(__file__).parent.resolve()
+synthetic_directory = Path(__file__).parent.parent.resolve()
 
 
 # From SPRAS. TODO: import once SPRAS uses pixi
@@ -22,7 +22,7 @@ def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
 
 def count_weights() -> OrderedDict[int, int]:
     """Returns an ordered map (lowest to highest weight) from the weight to the number of elements the weight has"""
-    weight_counts = pandas.read_csv(current_directory / ".." / "processed" / "weight-counts.tsv", sep="\t")
+    weight_counts = pandas.read_csv(synthetic_directory / "processed" / "weight-counts.tsv", sep="\t")
     return collections.OrderedDict(sorted({int(k * 1000): int(v) for k, v in dict(weight_counts.values).items()}.items()))
 
 
@@ -32,7 +32,7 @@ def read_pathway(pathway_name: str) -> pandas.DataFrame:
     with columns Interactor1 -> Interactor2.
     """
     pathway_df = pandas.read_csv(
-        current_directory / ".." / "processed" / pathway_name / f"{pathway_name}_gs_edges.txt",
+        synthetic_directory / "processed" / pathway_name / f"{pathway_name}_gs_edges.txt",
         sep="\t",
         names=["Interactor1", "Interactor2", "Weight", "Direction"],
     )
@@ -48,7 +48,7 @@ class SourcesTargets(NamedTuple):
 
 def get_node_data(pathway_name: str) -> pandas.DataFrame:
     return pandas.read_csv(
-        current_directory / ".." / "processed" / pathway_name / f"{pathway_name}_node_prizes.txt", sep="\t", usecols=["NODEID", "sources", "targets"]
+        synthetic_directory / "processed" / pathway_name / f"{pathway_name}_node_prizes.txt", sep="\t", usecols=["NODEID", "sources", "targets"]
     )
 
 
@@ -66,7 +66,7 @@ def main():
     pathway_name = parser().parse_args().pathway
     print("Reading interactome...")
     interactome_df = pandas.read_csv(
-        current_directory / ".." / "processed" / "interactome.tsv",
+        synthetic_directory / "processed" / "interactome.tsv",
         header=None,
         sep="\t",
         names=["Interactor1", "Interactor2", "Weight", "Direction"],
@@ -83,7 +83,7 @@ def main():
 
     # TODO: isolate percentage constant (this currently builds up 0%, 10%, ..., 100%)
     for percentage in map(lambda x: (x + 1) / 10, range(10)):
-        output_directory = current_directory / ".." / "thresholded" / str(percentage) / pathway_name
+        output_directory = synthetic_directory / "thresholded" / str(percentage) / pathway_name
         output_interactome = output_directory / "interactome.txt"
         output_gold_standard = output_directory / "gold_standard_edges.txt"
 
@@ -107,7 +107,7 @@ def main():
         print(f"Attempt number {attempt_number}")
 
         # We're done sampling:
-        (output_directory / "attempt-number.txt").write_text(attempt_number)
+        (output_directory / "attempt-number.txt").write_text(str(attempt_number))
         # we need to trim our data file as well.
         trim_data_file(data_df=node_data_df, gold_standard_df=pathway_df).to_csv(output_directory / "node_prizes.tsv", sep="\t", index=False)
 
```

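The final change in `sampling.py` wraps `attempt_number` in `str()` because `Path.write_text` accepts only `str` and raises `TypeError` for an `int`. A self-contained sketch of the bug and the fix (the counter value is a stand-in):

```python
import tempfile
from pathlib import Path

attempt_number = 3  # stands in for the sampling loop's counter

with tempfile.TemporaryDirectory() as tmp:
    marker = Path(tmp) / "attempt-number.txt"

    # Path.write_text requires a str; passing an int raises TypeError.
    try:
        marker.write_text(attempt_number)
        raise AssertionError("expected TypeError")
    except TypeError:
        pass

    marker.write_text(str(attempt_number))  # the fix applied in this commit
    assert marker.read_text() == "3"
```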
Lines changed: 4 additions & 2 deletions

```diff
@@ -1,12 +1,14 @@
 import argparse
 from pathlib import Path
 
-scripts_directory = Path(__file__).parent.resolve()
+from jsonc_parser.parser import JsoncParser
+
+synthetic_directory = Path(__file__).parent.parent.parent.resolve()
 
 
 def parser():
     parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
 
-    parser.add_argument("pathway", choices=[file.stem for file in (scripts_directory / ".." / "raw" / "pathway-data").iterdir()])
+    parser.add_argument("pathway", choices=JsoncParser.parse_file(synthetic_directory / "pathways.jsonc"))
 
     return parser
```

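The shared parser now derives its valid `choices` from `pathways.jsonc` instead of listing files on disk. A sketch of the argparse pattern with a stand-in list (the real code gets this list from `JsoncParser.parse_file`; no `jsonc_parser` dependency is needed here):

```python
import argparse

# Stand-in for the list parsed from pathways.jsonc
curated_pathways = ["Wnt signaling pathway", "FGF signaling pathway"]

parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
# `choices` makes argparse reject any pathway name not in the curated list
parser.add_argument("pathway", choices=curated_pathways)

args = parser.parse_args(["Wnt signaling pathway"])
print(args.pathway)  # prints "Wnt signaling pathway"
```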