chore: cmt out unusable pathways

tristan-f-r · tristan-f-r · commit a413a03d63b8 · 2026-03-03T09:15:13.000Z
diff --git a/datasets/synthetic_data/Snakefile b/datasets/synthetic_data/Snakefile
@@ -15,8 +15,9 @@ rule all:
     input:
         "raw/9606.protein.links.full.v12.0.txt",
         expand([
-            "processed/pathways/{pathway}/node_prizes.txt",
-        ], pathway=file_compatible_pathways)
+            "thresholded/{threshold}/{pathway}/interactome.txt",
+            "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
+        ], pathway=file_compatible_pathways, threshold=thresholds)
 
 produce_fetch_rules({
     "raw/9606.protein.links.full.v12.0.txt": FetchConfig(("STRING", "9606", "9606.protein.links.full.txt.gz"), uncompress=True),
diff --git a/datasets/synthetic_data/explore/pathway_statistics.py b/datasets/synthetic_data/explore/pathway_statistics.py
@@ -9,6 +9,7 @@
 
 from tools.sample import find_connected_sources_targets
 
+
 # From SPRAS. TODO: import once SPRAS uses pixi
 def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
     mask = df["Direction"] == "U"
@@ -19,46 +20,49 @@ def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
     df = pandas.concat([df, new_df], ignore_index=True)
     return df
 
+
 current_directory = Path(__file__).parent.resolve()
-synthetic_directory = current_directory / '..'
+synthetic_directory = current_directory / ".."
+
 
 def main():
     data_entries = []
 
     # We identify pathways by their gold standard edges, since we have a few other files mixed in with `processed`.
     for pathway_folder in (synthetic_directory / "processed" / "pathways").rglob("*/"):
         gs_edges_graph = networkx.from_pandas_edgelist(
-            convert_undirected_to_directed(pandas.read_csv(
-                pathway_folder / "gs_edges.txt", sep='\t',
-                names=["Interactor1", "Interactor2", "Rank", "Direction"])),
+            convert_undirected_to_directed(
+                pandas.read_csv(pathway_folder / "gs_edges.txt", sep="\t", names=["Interactor1", "Interactor2", "Rank", "Direction"])
+            ),
             "Interactor1",
             "Interactor2",
-            create_using=networkx.DiGraph)
-        node_prizes = pandas.read_csv(pathway_folder / "node_prizes.txt", sep='\t')
+            create_using=networkx.DiGraph,
+        )
+        node_prizes = pandas.read_csv(pathway_folder / "node_prizes.txt", sep="\t")
 
-        sources = list(node_prizes[node_prizes["sources"] == True]['NODEID'])
-        targets = list(node_prizes[node_prizes["targets"] == True]['NODEID'])
+        sources = list(node_prizes[node_prizes["sources"] == True]["NODEID"])
+        targets = list(node_prizes[node_prizes["targets"] == True]["NODEID"])
 
         connected_sources_targets = find_connected_sources_targets(
             sources,
             targets,
             gs_edges_graph,
         )
-        data_entries.append((
-            urllib.parse.unquote(pathway_folder.stem),
-            len(sources),
-            len(targets),
-            (float(len(connected_sources_targets)) / float(len(sources) * len(targets))) \
-                if len(sources) * len(targets) != 0 else 0.0
-        ))
-    
+        data_entries.append(
+            (
+                urllib.parse.unquote(pathway_folder.stem),
+                len(sources),
+                len(targets),
+                (float(len(connected_sources_targets)) / float(len(sources) * len(targets))) if len(sources) * len(targets) != 0 else 0.0,
+            )
+        )
+
     data_df = pandas.DataFrame(data_entries, columns=("Name", "Sources", "Targets", "Connected Percentage"))
-    data_df.to_csv(current_directory / 'full_stats.tsv', sep='\t', index=False)
+    data_df.to_csv(current_directory / "full_stats.tsv", sep="\t", index=False)
 
-    filtered_df = data_df.loc[data_df['Sources'] != 0] \
-        .loc[data_df['Targets'] != 0] \
-        .loc[data_df['Connected Percentage'] != 0]
+    filtered_df = data_df.loc[data_df["Sources"] != 0].loc[data_df["Targets"] != 0].loc[data_df["Connected Percentage"] != 0]
     print(filtered_df)
 
+
 if __name__ == "__main__":
     main()
diff --git a/datasets/synthetic_data/pathways.jsonc b/datasets/synthetic_data/pathways.jsonc
@@ -1,7 +1,10 @@
 [
+    // Every commented-out pathway lacks enough sources, targets, or connections.
+    // To see more, uncomment all commented-out pathways and run explore/pathway_statistics.py.
+
     // Commonly known as the "CCKR signaling map", PathwayCommons also does not map this one correctly:
     // TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP06959
-    "Gastrin_CCK2R_240212",
+    // "Gastrin_CCK2R_240212",
     "Wnt signaling pathway",
     "VEGF signaling pathway",
     "Toll receptor signaling pathway",
@@ -11,24 +14,24 @@
     "JAK/STAT signaling pathway",
     "Interleukin signaling pathway",
     "Interferon-gamma signaling pathway",
-    "Integrin signalling pathway",
-    "Insulin/IGF pathway-protein kinase B signaling cascade",
+    // "Integrin signalling pathway",
+    // "Insulin/IGF pathway-protein kinase B signaling cascade",
     "Inflammation mediated by chemokine and cytokine signaling pathway",
     "Hedgehog signaling pathway",
-    "FGF signaling pathway",
+    // "FGF signaling pathway",
     "FAS signaling pathway",
     // TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00019.
     // We want to add the Endothelin signaling pathway, but it is currently labelled under "untitled."
     "EGF receptor signaling pathway",
     "Cadherin signaling pathway",
     "Apoptosis signaling pathway",
-    "Ras Pathway",
+    // "Ras Pathway",
     "PI3 kinase pathway",
     "p38 MAPK pathway",
-    "Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
-    "p53 pathway",
-    "Hypoxia response via HIF activation",
-    "Oxidative stress response",
-    "B cell activation",
-    "T cell activation"
+    // "Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
+    // "p53 pathway",
+    // "Hypoxia response via HIF activation",
+    // "Oxidative stress response",
+    "B cell activation"
+    // "T cell activation"
 ]
diff --git a/datasets/synthetic_data/scripts/panther_spras_formatting.py b/datasets/synthetic_data/scripts/panther_spras_formatting.py
@@ -75,9 +75,11 @@ def main():
     edges_df = edges_df.drop_duplicates(keep="first", ignore_index=True)
     # We trim the gold standard edges against the interactome
     interactome_df = pd.read_csv(
-        processed_directory / 'interactome.tsv', sep='\t',
-        header=None, names=["Interactor1", "Interactor2", "Weight", "Direction"],
-        dtype={"Interactor1": str, "Interactor2": str}
+        processed_directory / "interactome.tsv",
+        sep="\t",
+        header=None,
+        names=["Interactor1", "Interactor2", "Weight", "Direction"],
+        dtype={"Interactor1": str, "Interactor2": str},
     )
     edges_df = edges_df.merge(interactome_df, how="inner", on=["Interactor1", "Interactor2"])
     # We don't care about extraneous information provided by the interactome.
@@ -105,6 +107,5 @@ def main():
     data_df.to_csv(out_folder / "node_prizes.txt", sep="\t", index=False, header=True)
 
 
-
 if __name__ == "__main__":
     main()
diff --git a/datasets/synthetic_data/scripts/sampling.py b/datasets/synthetic_data/scripts/sampling.py
@@ -69,8 +69,12 @@ def main():
     arg_parser = parser()
     arg_parser.add_argument("--seed", help="The randomness seed to use", type=int, required=False)
     arg_parser.add_argument("--amount", help="The amount of thresholds to use", type=int, default=10)
-    arg_parser.add_argument("--percentage_thresholding_multiplier", help="The percentage multiplier to threshold by, " + \
-                            "to unlink the sampling percentage to the actual required percentage of connections", type=float, default=1.0)
+    arg_parser.add_argument(
+        "--percentage_thresholding_multiplier",
+        help="The percentage multiplier to threshold by, " + "to unlink the sampling percentage to the actual required percentage of connections",
+        type=float,
+        default=1.0,
+    )
 
     args = arg_parser.parse_args()
     pathway_location = args.pathway
diff --git a/datasets/synthetic_data/scripts/util/parser.py b/datasets/synthetic_data/scripts/util/parser.py
@@ -6,16 +6,15 @@
 
 synthetic_directory = Path(__file__).parent.parent.parent.resolve()
 
+
 # TODO: deduplicate from ../Snakefile
 def make_file_safe(input_str: str) -> str:
-    return urllib.parse.quote(input_str, safe='')
+    return urllib.parse.quote(input_str, safe="")
+
 
 def parser():
     parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
 
-    parser.add_argument(
-        "pathway",
-        choices=list(map(make_file_safe, JsoncParser.parse_file(synthetic_directory / "pathways.jsonc")))
-    )
+    parser.add_argument("pathway", choices=list(map(make_file_safe, JsoncParser.parse_file(synthetic_directory / "pathways.jsonc"))))
 
     return parser