fix: lower the percentage thresholding multiplier to 0.9

tristan-f-r · tristan-f-r · commit 4122721fa2cd · 2026-03-03T03:53:07.000Z
diff --git a/datasets/synthetic_data/Snakefile b/datasets/synthetic_data/Snakefile
@@ -1,7 +1,7 @@
 include: "../../cache/Snakefile"
 from jsonc_parser.parser import JsoncParser
 
-pathways = JsoncParser.parse_file("pathways.jsonc")
+pathways = ["Interleukin signaling pathway"]
 
 # TODO: deduplicate from sampling.py
 thresholds = list(map(str, map(lambda x: (x + 1) / 10, range(10))))
@@ -100,7 +100,7 @@ rule threshold:
         expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
         expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
     shell:
-        'uv run scripts/sampling.py "{wildcards.pathway}"'
+        'uv run scripts/sampling.py "{wildcards.pathway}" --percentage_thresholding_multiplier=0.9'
 
 rule make_pathway_map:
     input:
diff --git a/datasets/synthetic_data/scripts/sampling.py b/datasets/synthetic_data/scripts/sampling.py
@@ -66,6 +66,10 @@ def sources_and_targets(pathway_node_prizes_df: pandas.DataFrame) -> SourcesTarg
 def main():
     arg_parser = parser()
     arg_parser.add_argument("--seed", help="The randomness seed to use", type=int, required=False)
+    arg_parser.add_argument("--amount", help="The amount of thresholds to use", type=int, default=10)
+    arg_parser.add_argument("--percentage_thresholding_multiplier", help="The percentage multiplier to threshold by, " + \
+                            "to unlink the sampling percentage to the actual required percentage of connections", type=float, default=1.0)
+
     args = arg_parser.parse_args()
     pathway_name = args.pathway
     if args.seed is not None:
@@ -88,19 +92,22 @@ def main():
     node_data_df = get_node_data(pathway_name)
     sources, targets = sources_and_targets(node_data_df)
 
-    # TODO: isolate percentage constant (this currently builds up 0%, 10%, ..., 100%)
-    for percentage in map(lambda x: (x + 1) / 10, range(10)):
-        output_directory = synthetic_directory / "thresholded" / str(percentage) / pathway_name
+    percentages = list(map(lambda x: (x + 1) / args.amount, range(args.amount)))
+    for percentage_to_sample in percentages:
+        percentage_to_threshold = percentage_to_sample * args.percentage_thresholding_multiplier
+
+        output_directory = synthetic_directory / "thresholded" / str(percentage_to_sample) / pathway_name
         output_interactome = output_directory / "interactome.txt"
         output_gold_standard = output_directory / "gold_standard_edges.txt"
 
-        print(f"Sampling with {percentage * 100:.1f}% of edges...")
+        print(f"Sampling with {percentage_to_sample * 100:.1f}% of edges...")
         attempt_number = 1
         while (
             attempt_sample(
                 pathway_name,
                 pathway_df,
-                percentage,
+                percentage_to_sample,
+                percentage_to_threshold,
                 weight_mapping,
                 interactome_df,
                 sources,
diff --git a/tools/sample.py b/tools/sample.py
@@ -38,7 +38,8 @@ def find_connected_sources_targets(sources: list[str], targets: list[str], graph
 def attempt_sample(
     pathway_name: str,
     pathway_df: pandas.DataFrame,
-    percentage: float,
+    percentage_to_sample: float,
+    percentage_to_require: float,
     weight_mapping: OrderedDict[int, int],
     interactome_df: pandas.DataFrame,
     sources: list[str],
@@ -54,7 +55,7 @@ def attempt_sample(
     returning the connections between {sources} and {targets},
     or None if the target percentage failed.
     """
-    interactome_df = sample_interactome(interactome_df, weight_mapping, percentage)
+    interactome_df = sample_interactome(interactome_df, weight_mapping, percentage_to_sample)
 
     print(f"Merging {pathway_name} with interactome...")
     # While we are merging this graph, we are preparing to compare the connectedness of the prev[ious] and curr[ent] (merged) graph
@@ -71,12 +72,12 @@ def attempt_sample(
     # We ask that at least `percentage` of the sources and targets are connected with one another.
     connection_percentage = float(len(curr_connections)) / float(len(prev_connections)) if len(prev_connections) != 0 else 0
 
-    if percentage <= connection_percentage:
-        print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage * 100:.1f}% threshold.")
+    if percentage_to_require <= connection_percentage:
+        print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage_to_require * 100:.1f}% required percentage threshold.")
         pathway_df.to_csv(output_gold_standard, sep="\t", index=False, header=False)
         interactome_df.to_csv(output_interactome, sep="\t", index=False, header=False)
         return curr_connections
-    print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage * 100:.1f}% threshold.")
+    print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage_to_require * 100:.1f}% required percentage threshold.")
     return None