Skip to content

Commit 4122721

Browse files
committed
fix: lower percentage thresholding multiplier
1 parent 3b54dea commit 4122721

File tree

3 files changed

+20
-12
lines changed

3 files changed

+20
-12
lines changed

datasets/synthetic_data/Snakefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
include: "../../cache/Snakefile"
22
from jsonc_parser.parser import JsoncParser
33

4-
pathways = JsoncParser.parse_file("pathways.jsonc")
4+
pathways = ["Interleukin signaling pathway"]
55

66
# TODO: deduplicate from sampling.py
77
thresholds = list(map(str, map(lambda x: (x + 1) / 10, range(10))))
@@ -100,7 +100,7 @@ rule threshold:
100100
expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
101101
expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
102102
shell:
103-
'uv run scripts/sampling.py "{wildcards.pathway}"'
103+
'uv run scripts/sampling.py "{wildcards.pathway}" --percentage_thresholding_multiplier=0.9'
104104

105105
rule make_pathway_map:
106106
input:

datasets/synthetic_data/scripts/sampling.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ def sources_and_targets(pathway_node_prizes_df: pandas.DataFrame) -> SourcesTarg
6666
def main():
6767
arg_parser = parser()
6868
arg_parser.add_argument("--seed", help="The randomness seed to use", type=int, required=False)
69+
arg_parser.add_argument("--amount", help="The amount of thresholds to use", type=int, default=10)
70+
arg_parser.add_argument("--percentage_thresholding_multiplier", help="The percentage multiplier to threshold by, " + \
71+
"to unlink the sampling percentage to the actual required percentage of connections", type=float, default=1.0)
72+
6973
args = arg_parser.parse_args()
7074
pathway_name = args.pathway
7175
if args.seed is not None:
@@ -88,19 +92,22 @@ def main():
8892
node_data_df = get_node_data(pathway_name)
8993
sources, targets = sources_and_targets(node_data_df)
9094

91-
# TODO: isolate percentage constant (this currently builds up 0%, 10%, ..., 100%)
92-
for percentage in map(lambda x: (x + 1) / 10, range(10)):
93-
output_directory = synthetic_directory / "thresholded" / str(percentage) / pathway_name
95+
percentages = list(map(lambda x: (x + 1) / args.amount, range(args.amount)))
96+
for percentage_to_sample in percentages:
97+
percentage_to_threshold = percentage_to_sample * args.percentage_thresholding_multiplier
98+
99+
output_directory = synthetic_directory / "thresholded" / str(percentage_to_sample) / pathway_name
94100
output_interactome = output_directory / "interactome.txt"
95101
output_gold_standard = output_directory / "gold_standard_edges.txt"
96102

97-
print(f"Sampling with {percentage * 100:.1f}% of edges...")
103+
print(f"Sampling with {percentage_to_sample * 100:.1f}% of edges...")
98104
attempt_number = 1
99105
while (
100106
attempt_sample(
101107
pathway_name,
102108
pathway_df,
103-
percentage,
109+
percentage_to_sample,
110+
percentage_to_threshold,
104111
weight_mapping,
105112
interactome_df,
106113
sources,

tools/sample.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ def find_connected_sources_targets(sources: list[str], targets: list[str], graph
3838
def attempt_sample(
3939
pathway_name: str,
4040
pathway_df: pandas.DataFrame,
41-
percentage: float,
41+
percentage_to_sample: float,
42+
percentage_to_require: float,
4243
weight_mapping: OrderedDict[int, int],
4344
interactome_df: pandas.DataFrame,
4445
sources: list[str],
@@ -54,7 +55,7 @@ def attempt_sample(
5455
returning the connections between {sources} and {targets},
5556
or None if the target percentage failed.
5657
"""
57-
interactome_df = sample_interactome(interactome_df, weight_mapping, percentage)
58+
interactome_df = sample_interactome(interactome_df, weight_mapping, percentage_to_sample)
5859

5960
print(f"Merging {pathway_name} with interactome...")
6061
# While we are merging this graph, we are preparing to compare the connectedness of the prev[ious] and curr[ent] (merged) graph
@@ -71,12 +72,12 @@ def attempt_sample(
7172
# We ask that at least `percentage_to_require` of the sources and targets are connected with one another.
7273
connection_percentage = float(len(curr_connections)) / float(len(prev_connections)) if len(prev_connections) != 0 else 0
7374

74-
if percentage <= connection_percentage:
75-
print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage * 100:.1f}% threshold.")
75+
if percentage_to_require <= connection_percentage:
76+
print(f"Got {connection_percentage * 100:.1f}% connections above the {percentage_to_require * 100:.1f}% required percentage threshold.")
7677
pathway_df.to_csv(output_gold_standard, sep="\t", index=False, header=False)
7778
interactome_df.to_csv(output_interactome, sep="\t", index=False, header=False)
7879
return curr_connections
79-
print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage * 100:.1f}% threshold.")
80+
print(f"Failed {connection_percentage * 100:.1f}% connections below the {percentage_to_require * 100:.1f}% required percentage threshold.")
8081
return None
8182

8283

0 commit comments

Comments
 (0)