Skip to content

Commit d1e4b25

Browse files
adding proteomics function
1 parent 72ede80 commit d1e4b25

File tree

1 file changed

+28
-37
lines changed

1 file changed

+28
-37
lines changed

coderbuild/liver/02-omics-liver.py

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -226,56 +226,46 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
226226
return(improve_mapped_cn_df)
227227

228228

229-
def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
229+
230+
231+
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape a wide proteomics table to long form and map it to entrez / improve ids.

    Parameters
    ----------
    proteomics_data : pd.DataFrame or path accepted by ``pd.read_csv``
        Wide table whose FIRST ROW holds the real column labels (first cell
        labels the gene-symbol column, the remaining cells are sample names).
        Every following row is one gene with one value per sample.
    improve_id_data : pd.DataFrame or path accepted by ``pd.read_csv``
        Must contain the columns ``other_id`` (matched against sample names)
        and ``improve_sample_id``.
    entrez_data : pd.DataFrame or path accepted by ``pd.read_csv``
        Must contain the columns ``other_id`` (matched against gene symbols)
        and ``entrez_id``.

    Returns
    -------
    pd.DataFrame
        One row per (gene, sample) pair that mapped on both sides and has no
        missing values, with columns
        ``['entrez_id', 'proteomics', 'improve_sample_id', 'source', 'study']``.
        ``entrez_id`` and ``improve_sample_id`` are cast to int; ``source`` is
        "Synapse" and ``study`` is "liver".

    Notes
    -----
    The input frame is never modified; all reshaping happens on a copy.
    """
    # read in data (anything that is not already a DataFrame is treated as a csv path)
    if not isinstance(proteomics_data, pd.DataFrame):
        proteomics_data = pd.read_csv(proteomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # first, promote the first row to column names and drop that row.
    # Work on a copy so the caller's DataFrame keeps its original columns.
    header = proteomics_data.iloc[0, :].tolist()
    header[0] = 'gene_symbol'  # first column carries the gene symbols
    wide_prot_df = proteomics_data.iloc[1:].copy()
    wide_prot_df.columns = header

    # melt the df so there is one sample and prot per row.
    # Name the melted columns explicitly rather than relying on the implicit
    # columns.name (which only equals 0 when the input has a default index).
    long_prot_df = pd.melt(
        wide_prot_df,
        id_vars=['gene_symbol'],
        var_name='sample_name',
        value_name='proteomics',
    )

    # map gene names to entrez id's
    mapped_proteomics_df = pd.merge(
        long_prot_df,
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="gene_symbol",
        right_on="other_id",
    )
    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample id's
    # (both lookups expose 'other_id', so this merge yields other_id_x / other_id_y)
    mapped_proteomics_df = pd.merge(
        mapped_proteomics_df,
        improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="sample_name",
        right_on="other_id",
    )

    # clean up column names and data types
    mapped_proteomics_df = mapped_proteomics_df.drop(
        columns=['gene_symbol', 'sample_name', 'other_id_x', 'other_id_y']
    )
    mapped_proteomics_df['source'] = "Synapse"
    mapped_proteomics_df['study'] = "liver"
    mapped_proteomics_df = mapped_proteomics_df.dropna()
    mapped_proteomics_df = mapped_proteomics_df.astype({'entrez_id': 'int', 'improve_sample_id': 'int'})
    mapped_proteomics_df = mapped_proteomics_df[['entrez_id', 'proteomics', 'improve_sample_id', 'source', 'study']]

    return mapped_proteomics_df
279269

280270

281271
if __name__ == "__main__":
@@ -293,6 +283,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
293283
parser.add_argument('-M', '--mutations', action = 'store_true', default=False, help='Generate mutations data')
294284
parser.add_argument('-C', '--copy_number', action = 'store_true', default=False, help='Generate copy number data')
295285

286+
296287
args = parser.parse_args()
297288

298289

0 commit comments

Comments
 (0)