Skip to content

Commit e237db5

Browse files
authored
Merge pull request #365 from PNNL-CompBio/bladder_pdo
BladderPDO and Schema Fixes
2 parents 943a399 + 5bb3e62 commit e237db5

File tree

7 files changed

+7671
-6632
lines changed

7 files changed

+7671
-6632
lines changed

build/bladderpdo/00_createBladderPDOSampleFile.py

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,49 @@
1+
#!/usr/bin/env python3
12
import synapseclient
23
import pandas as pd
34
import numpy as np
45
import argparse
56
import os
7+
import re
8+
import subprocess
69

10+
# Helper functions
11+
def _clean_geo_id(s):
    """Rewrite a GEO sample ID into the Synapse naming scheme.

    Surrounding whitespace is trimmed first, then these rules run in order:

    * a dot sitting between two digits becomes an underscore
      (``11.2`` -> ``11_2``)
    * the literal ``_tumor`` becomes ``_Parental`` (case-sensitive match)
    * ``_orgP<n>`` becomes ``_Organoid_P<n>`` (matched in any letter case)
    * ``_xenoorgP<n>`` becomes ``_XenoOrganoid_P<n>`` (matched in any
      letter case)
    """
    cleaned = re.sub(r"(?<=\d)\.(?=\d)", "_", s.strip())
    cleaned = cleaned.replace("_tumor", "_Parental")
    # Passage suffixes: group 2 of each pattern is the passage number that
    # gets carried over into the expanded name.
    passage_rules = (
        (r"_(org)P(\d+)", r"_Organoid_P\2"),
        (r"_(xenoorg)P(\d+)", r"_XenoOrganoid_P\2"),
    )
    for pattern, replacement in passage_rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)
    return cleaned
726

27+
28+
def _parse_model_type(sample_id):
    """Infer the model_type label from a marker embedded in a sample ID.

    The ID is lower-cased before matching, so markers are found regardless
    of letter case. Markers are tried in the order listed below and the
    first one present decides the result; an ID containing none of them
    yields "unknown".
    """
    lowered = sample_id.lower()
    marker_to_type = (
        ("_xenoorganoid", "xenograft derived organoid"),
        ("_organoid", "organoid"),
        ("_xenograft", "patient derived xenograft"),
        ("_parental", "tumor"),
    )
    return next(
        (label for marker, label in marker_to_type if marker in lowered),
        "unknown",
    )
40+
41+
#Generate Samples Data
842
def get_bladder_pdo_samples(synLoginObject, maxval):
943

44+
45+
#Part 1: Get Data from Synapse
46+
1047
# download from Synapse..
1148
samples_syn = synLoginObject.get('syn64765486')
1249
# and read the file
@@ -19,7 +56,43 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
1956
samples.loc[:,['other_id_source']] = 'Synapse'
2057
samples.loc[:,['other_names'] ]= ''
2158
samples.loc[:,['cancer_type']]=samples['cancer_type'].str.lower()
22-
samples.loc[:, ['model_type']] = samples['model_type'].str.lower()
59+
samples["model_type"] = samples["other_id"].apply(_parse_model_type)
60+
61+
#Part 2: Get Data from Geo
62+
subprocess.call (["Rscript", "--vanilla", "obtainGSMidLink.R"])
63+
GEO_ids_link = "./gsmlinkDf.csv"
64+
65+
geo_map = pd.read_csv(GEO_ids_link)
66+
geo_ids = geo_map["sampleid"].dropna().map(_clean_geo_id).unique()
67+
missing = sorted(set(geo_ids) - set(samples["other_id"]))
68+
69+
if missing:
70+
print(f"Adding {len(missing)} GEO samples not in Synapse sheet")
71+
72+
rows = []
73+
for oid in missing:
74+
common = oid.split("_")[0]
75+
ctype = (
76+
samples.loc[samples["common_name"] == common, "cancer_type"]
77+
.iloc[0]
78+
if (samples["common_name"] == common).any()
79+
else "bladder urothelial carcinoma"
80+
)
81+
rows.append(
82+
{
83+
"other_id": oid,
84+
"common_name": common,
85+
"cancer_type": ctype,
86+
"model_type": _parse_model_type(oid),
87+
"species": "Homo sapiens(Human)",
88+
"other_id_source": "GEO",
89+
"other_names": "",
90+
}
91+
)
92+
if rows:
93+
samples = pd.concat([samples, pd.DataFrame(rows)], ignore_index=True)
94+
95+
samples = samples.sort_values("other_id").reset_index(drop=True)
2396

2497
samples['improve_sample_id'] = range(maxval+1, maxval+1+samples.shape[0])
2598

@@ -29,11 +102,8 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
29102
if __name__ == "__main__":
30103

31104
parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Sarcoma PDO project into a single samplesheet")
32-
33105
parser.add_argument('-t', '--token', type=str, help='Synapse Token')
34-
35106
parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")
36-
37107
args = parser.parse_args()
38108

39109
print("Logging into Synapse")
@@ -46,5 +116,4 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
46116
prev_max_improve_id = 0
47117

48118
bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)
49-
50119
bladder_pdo_samples.to_csv("/tmp/bladderpdo_samples.csv", index=False)

build/bladderpdo/01_createBladderPDOOmicsFiles.py

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import gzip
88
import subprocess
99
import math
10+
import re
1011

1112
def get_copy_call(a):
1213
"""
@@ -31,7 +32,21 @@ def get_copy_call(a):
3132
else:
3233
return 'amp'
3334

34-
return pd.Series([get_copy_call(a) for a in arr])
35+
36+
def normalise_id(s):
    """
    Convert a GEO sample ID to the 'other_id' spelling used in
    bladderpdo_samples.csv.

    Values that pd.isna() reports as missing are handed back untouched.
    Otherwise the ID is stripped of surrounding whitespace and rewritten:
    a dot between two digits becomes '_', the literal '_tumor' becomes
    '_Parental', '_orgP<n>' becomes '_Organoid_P<n>' and '_xenoorgP<n>'
    becomes '_XenoOrganoid_P<n>' (the last two matched in any letter case).
    """
    if pd.isna(s):
        return s
    dotless = re.sub(r"(?<=\d)\.(?=\d)", "_", s.strip())
    renamed = dotless.replace("_tumor", "_Parental")
    # Group 2 of each pattern is the passage number kept in the new name.
    organoid = re.compile(r"_(org)P(\d+)", flags=re.IGNORECASE)
    xeno_organoid = re.compile(r"_(xenoorg)P(\d+)", flags=re.IGNORECASE)
    expanded = organoid.sub(r"_Organoid_P\2", renamed)
    return xeno_organoid.sub(r"_XenoOrganoid_P\2", expanded)
48+
49+
3550

3651
def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
3752

@@ -40,30 +55,42 @@ def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
4055
transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")
4156
subprocess.call (["/usr/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])
4257

43-
GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")
58+
GEO_ids = pd.read_csv(GEO_id_link_table)
59+
print(GEO_ids)
4460
fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
4561
transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)
4662
transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
4763
mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
4864
# transform data to long format
65+
print(mapped_df)
4966

50-
mapped_df.drop('other_id', axis=1)
67+
mapped_df = mapped_df.drop('other_id', axis=1)
5168
value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
5269
melted_txomics = mapped_df.melt(id_vars = "entrez_id", value_vars = value_variables, var_name='sample_name')
5370
# use info from GEO to get Sample IDS
54-
txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how = 'left', left_on = "sample_name", right_on='RNAid')
71+
m1 = melted_txomics.merge(GEO_ids, how="left", left_on="sample_name", right_on="RNAid")
72+
m1["sampleid"] = m1["sampleid"].apply(normalise_id)
73+
print(m1)
74+
print(m1.sampleid.unique())
5575
# use samplesheet to link sample_ids to improve ids
56-
txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
57-
txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
58-
txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")
59-
final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']]
60-
final_transcriptomics['source'] = "Gene Expression Omnibus"
61-
final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
62-
final_transcriptomics.rename({'value' : 'transcriptomics' })
63-
# remove duplicates
64-
toreturn = final_transcriptomics.drop_duplicates()
65-
66-
return toreturn
76+
tx_with_ids = m1.merge(
77+
samples, left_on="sampleid", right_on="other_id", how="left"
78+
)
79+
print(tx_with_ids)
80+
81+
final_tx = (
82+
tx_with_ids[["entrez_id", "value", "improve_sample_id"]]
83+
.drop_duplicates()
84+
.assign(source="Gene Expression Omnibus",
85+
study="Lee et al. 2018 Bladder PDOs")
86+
)
87+
final_tx.rename(columns= {"value":"transcriptomics"},inplace=True)
88+
final_tx = final_tx.drop_duplicates()
89+
final_tx = final_tx.dropna(subset=["entrez_id"])
90+
final_tx["improve_sample_id"] = final_tx["improve_sample_id"].astype(int)
91+
final_tx["entrez_id"] = final_tx["entrez_id"].astype(int)
92+
93+
return final_tx
6794

6895
def get_bladder_pdo_mutations(synObject, samples, genes):
6996
print(samples.head)
@@ -74,10 +101,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
74101
selectioncols_mutations = mutations_df[['Entrez_Gene_Id',"Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
75102
merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
76103
merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id" : 'entrez_id', 'Variant_Classification' : "variant_classification"}, axis=1)
77-
print(merged_mutations_renamed.head)
78104
final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
79105
final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
80-
print(final_mutations.head)
106+
final_mutations = final_mutations.dropna(subset=["entrez_id"])
107+
final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
108+
final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
81109
return final_mutations
82110

83111
def get_bladder_pdo_copynumber(synObject, samples, genes):
@@ -94,7 +122,9 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
94122
final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']]
95123
final_copynumber['source'] = "Synapse"
96124
final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
97-
125+
final_copynumber = final_copynumber.dropna(subset=["entrez_id"])
126+
final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int)
127+
final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
98128
return final_copynumber
99129

100130

@@ -108,7 +138,7 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
108138
parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
109139
parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
110140
parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
111-
parser.add_argument('-i', '--geolink', help=".csv file that is the output of 'CNV-segfile-anotation.R")
141+
parser.add_argument('-i', '--geolink', default = "./gsmlinkDf.csv", help=".csv file that is the output of 'CNV-segfile-anotation.R")
112142
parser.add_argument('-t', '--token', help='Synapse token')
113143

114144
args = parser.parse_args()
@@ -129,4 +159,4 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
129159
get_bladder_pdo_mutations(synObject, samples, genes).to_csv('/tmp/bladderpdo_mutations.csv', index=False)
130160

131161
if args.copy:
132-
get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copynumber.csv", index=False)
162+
get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copy_number.csv", index=False)

build/bladderpdo/build_exp.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
66
echo "Running 04-drug_dosage_and_curves.py with drugfile $2 and curSampleFile $1"
77
python3 03_createBladderPDOExperimentFile.py --token $SYNAPSE_AUTH_TOKEN --drugfile $2 --curSampleFile $1 --output /tmp/bladderpdo_doserep.tsv
88

9-
python3 fit_curve.py --input /tmp/bladderpdo_doserep.tsv --output /tmp/bladderpdo_experiments.tsv
9+
python3 fit_curve.py --input /tmp/bladderpdo_doserep.tsv --output /tmp/bladderpdo_experiments.tsv
10+
rm /tmp/bladderpdo_doserep.tsv
11+
mv /tmp/bladderpdo_experiments.tsv.0 /tmp/bladderpdo_experiments.tsv

build/improve_drug_mapping.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"metadata": {
33
"builds": [
44
{
5-
"build_date": "01_24_25",
5+
"build_date": "2025-01-24",
66
"version": "2.0.0"
77
},
88
{

0 commit comments

Comments
 (0)