Skip to content

Commit a413a03

Browse files
committed
chore: cmt out unusable pathways
1 parent 3a6ea7c commit a413a03

File tree

6 files changed

+56
-44
lines changed

6 files changed

+56
-44
lines changed

datasets/synthetic_data/Snakefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ rule all:
1515
input:
1616
"raw/9606.protein.links.full.v12.0.txt",
1717
expand([
18-
"processed/pathways/{pathway}/node_prizes.txt",
19-
], pathway=file_compatible_pathways)
18+
"thresholded/{threshold}/{pathway}/interactome.txt",
19+
"thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
20+
], pathway=file_compatible_pathways, threshold=thresholds)
2021

2122
produce_fetch_rules({
2223
"raw/9606.protein.links.full.v12.0.txt": FetchConfig(("STRING", "9606", "9606.protein.links.full.txt.gz"), uncompress=True),

datasets/synthetic_data/explore/pathway_statistics.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from tools.sample import find_connected_sources_targets
1111

12+
1213
# From SPRAS. TODO: import once SPRAS uses pixi
1314
def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
1415
mask = df["Direction"] == "U"
@@ -19,46 +20,49 @@ def convert_undirected_to_directed(df: pandas.DataFrame) -> pandas.DataFrame:
1920
df = pandas.concat([df, new_df], ignore_index=True)
2021
return df
2122

23+
2224
current_directory = Path(__file__).parent.resolve()
23-
synthetic_directory = current_directory / '..'
25+
synthetic_directory = current_directory / ".."
26+
2427

2528
def main():
2629
data_entries = []
2730

2831
# We identify pathways by their gold standard edges, since we have a few other files mixed in with `processed`.
2932
for pathway_folder in (synthetic_directory / "processed" / "pathways").rglob("*/"):
3033
gs_edges_graph = networkx.from_pandas_edgelist(
31-
convert_undirected_to_directed(pandas.read_csv(
32-
pathway_folder / "gs_edges.txt", sep='\t',
33-
names=["Interactor1", "Interactor2", "Rank", "Direction"])),
34+
convert_undirected_to_directed(
35+
pandas.read_csv(pathway_folder / "gs_edges.txt", sep="\t", names=["Interactor1", "Interactor2", "Rank", "Direction"])
36+
),
3437
"Interactor1",
3538
"Interactor2",
36-
create_using=networkx.DiGraph)
37-
node_prizes = pandas.read_csv(pathway_folder / "node_prizes.txt", sep='\t')
39+
create_using=networkx.DiGraph,
40+
)
41+
node_prizes = pandas.read_csv(pathway_folder / "node_prizes.txt", sep="\t")
3842

39-
sources = list(node_prizes[node_prizes["sources"] == True]['NODEID'])
40-
targets = list(node_prizes[node_prizes["targets"] == True]['NODEID'])
43+
sources = list(node_prizes[node_prizes["sources"] == True]["NODEID"])
44+
targets = list(node_prizes[node_prizes["targets"] == True]["NODEID"])
4145

4246
connected_sources_targets = find_connected_sources_targets(
4347
sources,
4448
targets,
4549
gs_edges_graph,
4650
)
47-
data_entries.append((
48-
urllib.parse.unquote(pathway_folder.stem),
49-
len(sources),
50-
len(targets),
51-
(float(len(connected_sources_targets)) / float(len(sources) * len(targets))) \
52-
if len(sources) * len(targets) != 0 else 0.0
53-
))
54-
51+
data_entries.append(
52+
(
53+
urllib.parse.unquote(pathway_folder.stem),
54+
len(sources),
55+
len(targets),
56+
(float(len(connected_sources_targets)) / float(len(sources) * len(targets))) if len(sources) * len(targets) != 0 else 0.0,
57+
)
58+
)
59+
5560
data_df = pandas.DataFrame(data_entries, columns=("Name", "Sources", "Targets", "Connected Percentage"))
56-
data_df.to_csv(current_directory / 'full_stats.tsv', sep='\t', index=False)
61+
data_df.to_csv(current_directory / "full_stats.tsv", sep="\t", index=False)
5762

58-
filtered_df = data_df.loc[data_df['Sources'] != 0] \
59-
.loc[data_df['Targets'] != 0] \
60-
.loc[data_df['Connected Percentage'] != 0]
63+
filtered_df = data_df.loc[data_df["Sources"] != 0].loc[data_df["Targets"] != 0].loc[data_df["Connected Percentage"] != 0]
6164
print(filtered_df)
6265

66+
6367
if __name__ == "__main__":
6468
main()

datasets/synthetic_data/pathways.jsonc

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
[
2+
// Every commented-out pathway below lacks enough sources, targets, or connections to be usable.
3+
// To see more, uncomment all of the commented-out pathways and run explore/pathway_statistics.py.
4+
25
// Commonly known as the "CCKR signaling map", PathwayCommons also does not map this one correctly:
36
// TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP06959
4-
"Gastrin_CCK2R_240212",
7+
// "Gastrin_CCK2R_240212",
58
"Wnt signaling pathway",
69
"VEGF signaling pathway",
710
"Toll receptor signaling pathway",
@@ -11,24 +14,24 @@
1114
"JAK/STAT signaling pathway",
1215
"Interleukin signaling pathway",
1316
"Interferon-gamma signaling pathway",
14-
"Integrin signalling pathway",
15-
"Insulin/IGF pathway-protein kinase B signaling cascade",
17+
// "Integrin signalling pathway",
18+
// "Insulin/IGF pathway-protein kinase B signaling cascade",
1619
"Inflammation mediated by chemokine and cytokine signaling pathway",
1720
"Hedgehog signaling pathway",
18-
"FGF signaling pathway",
21+
// "FGF signaling pathway",
1922
"FAS signaling pathway",
2023
// TODO: report to PathwayCommons: see https://apps.pathwaycommons.org/pathways?uri=https%3A%2F%2Fidentifiers.org%2Fpanther.pathway%3AP00019.
2124
// We want to add the Endothelin signaling pathway, but it is currently labelled under "untitled."
2225
"EGF receptor signaling pathway",
2326
"Cadherin signaling pathway",
2427
"Apoptosis signaling pathway",
25-
"Ras Pathway",
28+
// "Ras Pathway",
2629
"PI3 kinase pathway",
2730
"p38 MAPK pathway",
28-
"Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
29-
"p53 pathway",
30-
"Hypoxia response via HIF activation",
31-
"Oxidative stress response",
32-
"B cell activation",
33-
"T cell activation"
31+
// "Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
32+
// "p53 pathway",
33+
// "Hypoxia response via HIF activation",
34+
// "Oxidative stress response",
35+
"B cell activation"
36+
// "T cell activation"
3437
]

datasets/synthetic_data/scripts/panther_spras_formatting.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,11 @@ def main():
7575
edges_df = edges_df.drop_duplicates(keep="first", ignore_index=True)
7676
# We trim the gold standard edges against the interactome
7777
interactome_df = pd.read_csv(
78-
processed_directory / 'interactome.tsv', sep='\t',
79-
header=None, names=["Interactor1", "Interactor2", "Weight", "Direction"],
80-
dtype={"Interactor1": str, "Interactor2": str}
78+
processed_directory / "interactome.tsv",
79+
sep="\t",
80+
header=None,
81+
names=["Interactor1", "Interactor2", "Weight", "Direction"],
82+
dtype={"Interactor1": str, "Interactor2": str},
8183
)
8284
edges_df = edges_df.merge(interactome_df, how="inner", on=["Interactor1", "Interactor2"])
8385
# We don't care about extraneous information provided by the interactome.
@@ -105,6 +107,5 @@ def main():
105107
data_df.to_csv(out_folder / "node_prizes.txt", sep="\t", index=False, header=True)
106108

107109

108-
109110
if __name__ == "__main__":
110111
main()

datasets/synthetic_data/scripts/sampling.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,12 @@ def main():
6969
arg_parser = parser()
7070
arg_parser.add_argument("--seed", help="The randomness seed to use", type=int, required=False)
7171
arg_parser.add_argument("--amount", help="The amount of thresholds to use", type=int, default=10)
72-
arg_parser.add_argument("--percentage_thresholding_multiplier", help="The percentage multiplier to threshold by, " + \
73-
"to unlink the sampling percentage to the actual required percentage of connections", type=float, default=1.0)
72+
arg_parser.add_argument(
73+
"--percentage_thresholding_multiplier",
74+
help="The percentage multiplier to threshold by, " + "to unlink the sampling percentage to the actual required percentage of connections",
75+
type=float,
76+
default=1.0,
77+
)
7478

7579
args = arg_parser.parse_args()
7680
pathway_location = args.pathway

datasets/synthetic_data/scripts/util/parser.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,15 @@
66

77
synthetic_directory = Path(__file__).parent.parent.parent.resolve()
88

9+
910
# TODO: deduplicate from ../Snakefile
1011
def make_file_safe(input_str: str) -> str:
11-
return urllib.parse.quote(input_str, safe='')
12+
return urllib.parse.quote(input_str, safe="")
13+
1214

1315
def parser():
1416
parser = argparse.ArgumentParser(prog="PANTHER pathway parser")
1517

16-
parser.add_argument(
17-
"pathway",
18-
choices=list(map(make_file_safe, JsoncParser.parse_file(synthetic_directory / "pathways.jsonc")))
19-
)
18+
parser.add_argument("pathway", choices=list(map(make_file_safe, JsoncParser.parse_file(synthetic_directory / "pathways.jsonc"))))
2019

2120
return parser

0 commit comments

Comments
 (0)