77import gzip
88import subprocess
99import math
10+ import re
1011
1112def get_copy_call (a ):
1213 """
@@ -31,7 +32,21 @@ def get_copy_call(a):
3132 else :
3233 return 'amp'
3334
34- return pd .Series ([get_copy_call (a ) for a in arr ])
35+
def normalise_id(s):
    """
    Make GEO sample IDs line up with 'other_id' in bladderpdo_samples.csv.

    Rewrites applied, in order:
      1. surrounding whitespace is stripped;
      2. a dot sitting between two digits becomes an underscore;
      3. the "_tumor" suffix is renamed to "_Parental";
      4. "_orgP<n>" becomes "_Organoid_P<n>" (case-insensitive);
      5. "_xenoorgP<n>" becomes "_XenoOrganoid_P<n>" (case-insensitive).

    Parameters
    ----------
    s : scalar
        A single sample ID. Missing values (None / NaN, as judged by
        ``pd.isna``) are returned unchanged.

    Returns
    -------
    The normalised ID as a ``str``, or the original missing value.
    """
    if pd.isna(s):
        return s

    # IDs can arrive as numbers when a CSV column is parsed as numeric;
    # coerce to text so the string operations below cannot raise.
    cleaned = str(s).strip()

    cleaned = re.sub(r"(?<=\d)\.(?=\d)", "_", cleaned)  # dots -> underscore
    cleaned = cleaned.replace("_tumor", "_Parental")  # tumour alias

    # The leading "_" anchors each pattern, so "_xenoorgP1" is never caught
    # by the plain "_orgP" rule.
    passage_rules = (
        (r"_(org)P(\d+)", r"_Organoid_P\2"),
        (r"_(xenoorg)P(\d+)", r"_XenoOrganoid_P\2"),
    )
    for pattern, replacement in passage_rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    return cleaned
48+
49+
3550
3651def get_bladder_pdo_transcriptomics (GEO_id_link_table , samples , genes ):
3752
@@ -40,30 +55,42 @@ def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
4055 transcriptomics = pd .read_csv (transcriptomic_txt , compression = 'gzip' , sep = "\t " )
4156 subprocess .call (["/usr/bin/Rscript" , "--vanilla" , "obtainGSMidLink.R" ])
4257
43- GEO_ids_link = pd .read_csv ("./gsmlinkDf.csv" )
58+ GEO_ids = pd .read_csv (GEO_id_link_table )
59+ print (GEO_ids )
4460 fpkm_totals = transcriptomics .iloc [:, 1 :43 ].sum ()
4561 transcriptomics .iloc [:, 1 :43 ] = transcriptomics .iloc [:, 1 :43 ].div (fpkm_totals ).mul (1e6 )
4662 transcriptomics ['ensembl' ] = transcriptomics ['Unnamed: 0' ].str .split ("_" , expand = True )[0 ]
4763 mapped_df = transcriptomics .merge (genes [['entrez_id' , 'other_id' ]].drop_duplicates (), left_on = 'ensembl' , right_on = 'other_id' , how = 'left' )
4864 # transform data to long format
65+ print (mapped_df )
4966
50- mapped_df .drop ('other_id' , axis = 1 )
67+ mapped_df = mapped_df .drop ('other_id' , axis = 1 )
5168 value_variables = transcriptomics .columns [transcriptomics .columns .str .contains ("M" )]
5269 melted_txomics = mapped_df .melt (id_vars = "entrez_id" , value_vars = value_variables , var_name = 'sample_name' )
5370 # use info from GEO to get Sample IDS
54- txomics_with_GEOid = melted_txomics .merge (GEO_ids_link , how = 'left' , left_on = "sample_name" , right_on = 'RNAid' )
71+ m1 = melted_txomics .merge (GEO_ids , how = "left" , left_on = "sample_name" , right_on = "RNAid" )
72+ m1 ["sampleid" ] = m1 ["sampleid" ].apply (normalise_id )
73+ print (m1 )
74+ print (m1 .sampleid .unique ())
5575 # use samplesheet to link sample_ids to improve ids
56- txomics_with_GEOid ['sampleid' ] = txomics_with_GEOid ['sampleid' ].str .replace ("org" , "Organoid_" )
57- txomics_with_GEOid ['sampleid' ] = txomics_with_GEOid ['sampleid' ].str .replace ("tumor" , "Tumor" )
58- txomics_with_improveid = txomics_with_GEOid .merge (samples , left_on = "sampleid" , right_on = "other_id" , how = "left" )
59- final_transcriptomics = txomics_with_improveid [['entrez_id' , 'value' , 'improve_sample_id' ]]
60- final_transcriptomics ['source' ] = "Gene Expression Omnibus"
61- final_transcriptomics ['study' ] = "Lee etal 2018 Bladder PDOs"
62- final_transcriptomics .rename ({'value' : 'transcriptomics' })
63- # remove duplicates
64- toreturn = final_transcriptomics .drop_duplicates ()
65-
66- return toreturn
76+ tx_with_ids = m1 .merge (
77+ samples , left_on = "sampleid" , right_on = "other_id" , how = "left"
78+ )
79+ print (tx_with_ids )
80+
81+ final_tx = (
82+ tx_with_ids [["entrez_id" , "value" , "improve_sample_id" ]]
83+ .drop_duplicates ()
84+ .assign (source = "Gene Expression Omnibus" ,
85+ study = "Lee et al. 2018 Bladder PDOs" )
86+ )
87+ final_tx .rename (columns = {"value" :"transcriptomics" },inplace = True )
88+ final_tx = final_tx .drop_duplicates ()
89+ final_tx = final_tx .dropna (subset = ["entrez_id" ])
90+ final_tx ["improve_sample_id" ] = final_tx ["improve_sample_id" ].astype (int )
91+ final_tx ["entrez_id" ] = final_tx ["entrez_id" ].astype (int )
92+
93+ return final_tx
6794
6895def get_bladder_pdo_mutations (synObject , samples , genes ):
6996 print (samples .head )
@@ -74,10 +101,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
74101 selectioncols_mutations = mutations_df [['Entrez_Gene_Id' ,"Variant_Classification" , "Tumor_Sample_Barcode" , "mutation" ]]
75102 merged_mutations = selectioncols_mutations .merge (samples , left_on = "Tumor_Sample_Barcode" , right_on = "other_id" , how = "left" )
76103 merged_mutations_renamed = merged_mutations .rename ({"Entrez_Gene_Id" : 'entrez_id' , 'Variant_Classification' : "variant_classification" }, axis = 1 )
77- print (merged_mutations_renamed .head )
78104 final_mutations = merged_mutations_renamed [['entrez_id' , "mutation" , "variant_classification" , "improve_sample_id" ]]
79105 final_mutations ['study' ] = "Lee etal 2018 Bladder PDOs"
80- print (final_mutations .head )
106+ final_mutations = final_mutations .dropna (subset = ["entrez_id" ])
107+ final_mutations ["improve_sample_id" ] = final_mutations ["improve_sample_id" ].astype (int )
108+ final_mutations ["entrez_id" ] = final_mutations ["entrez_id" ].astype (int )
81109 return final_mutations
82110
83111def get_bladder_pdo_copynumber (synObject , samples , genes ):
@@ -94,7 +122,9 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
94122 final_copynumber = copynumber_with_correct_colnames [['entrez_id' , 'improve_sample_id' , 'copy_number' , 'copy_call' ]]
95123 final_copynumber ['source' ] = "Synapse"
96124 final_copynumber ['study' ] = "Lee etal 2018 Bladder PDOs"
97-
125+ final_copynumber = final_copynumber .dropna (subset = ["entrez_id" ])
126+ final_copynumber ["improve_sample_id" ] = final_copynumber ["improve_sample_id" ].astype (int )
127+ final_copynumber ["entrez_id" ] = final_copynumber ["entrez_id" ].astype (int )
98128 return final_copynumber
99129
100130
@@ -108,7 +138,7 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
108138 parser .add_argument ('-c' , '--copy' , help = 'Flag to capture copy number data' , action = 'store_true' , default = False )
109139 parser .add_argument ('-m' , '--mutation' , help = 'Flag to capture mutation data' , action = 'store_true' , default = False )
110140 parser .add_argument ('-e' , '--expression' , help = 'Flag to capture transcriptomic data' , action = 'store_true' , default = False )
111- parser .add_argument ('-i' , '--geolink' , help = ".csv file that is the output of 'CNV-segfile-anotation.R" )
141+ parser .add_argument ('-i' , '--geolink' , default = "./gsmlinkDf.csv" , help = ".csv file that is the output of 'CNV-segfile-anotation.R" )
112142 parser .add_argument ('-t' , '--token' , help = 'Synapse token' )
113143
114144 args = parser .parse_args ()
@@ -129,4 +159,4 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
129159 get_bladder_pdo_mutations (synObject , samples , genes ).to_csv ('/tmp/bladderpdo_mutations.csv' , index = False )
130160
131161 if args .copy :
132- get_bladder_pdo_copynumber (synObject , samples , genes ).to_csv ("/tmp/bladderpdo_copynumber .csv" , index = False )
162+ get_bladder_pdo_copynumber (synObject , samples , genes ).to_csv ("/tmp/bladderpdo_copy_number .csv" , index = False )
0 commit comments