Skip to content

Commit dca728b

Browse files
accidentally deleted transcriptomics function!! adding back
1 parent 36e0f01 commit dca728b

File tree

1 file changed

+49
-0
lines changed

1 file changed

+49
-0
lines changed

coderbuild/liver/02-omics-liver.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,56 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
225225

226226
return(improve_mapped_cn_df)
227227

228+
def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
    """Convert a transcriptomics counts table into long-format, id-mapped TPM values.

    Steps performed:
      1. Load each input with ``pd.read_csv`` unless it is already a DataFrame.
      2. Look up a gene symbol for every id in the first (``'Unnamed: 0'``)
         column via ``mygene`` and replace the ids with those symbols.
      3. Write the table to ``/tmp`` and run ``tpmFromCounts.py`` on it.
      4. Melt the script's output to one (gene, sample) pair per row.
      5. Inner-join gene symbols to ``entrez_data`` and sample names to
         ``improve_id_data``.

    Parameters
    ----------
    transciptomics_data : pandas.DataFrame or path/buffer accepted by ``pd.read_csv``
        Counts table whose gene-id column is named ``'Unnamed: 0'``; the
        remaining columns are matched against ``improve_id_data['other_id']``.
        (The misspelled parameter name is kept for backward compatibility
        with existing keyword callers.)
    improve_id_data : pandas.DataFrame or path/buffer accepted by ``pd.read_csv``
        Must contain ``other_id`` and ``improve_sample_id`` columns.
    entrez_data : pandas.DataFrame or path/buffer accepted by ``pd.read_csv``
        Must contain ``other_id`` (matched against gene symbols) and
        ``entrez_id`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``entrez_id`` (int), ``transcriptomics``,
        ``improve_sample_id`` (int), ``source`` and ``study``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tpmFromCounts.py`` exits with a non-zero status.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import subprocess

    counts_path = "/tmp/counts_for_tpm_conversion.tsv"
    tpm_path = "/tmp/transcriptomics_tpm.tsv"
    genome_build_url = (
        "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/"
        "GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz"
    )

    # read in data
    if not isinstance(transciptomics_data, pd.DataFrame):
        transciptomics_data = pd.read_csv(transciptomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # first, convert genes, which are in ensembl ids, to gene names
    transciptomics_data = transciptomics_data.rename(columns={'Unnamed: 0': 'stable_id'})
    mg = mygene.MyGeneInfo()
    ensembl_ids = transciptomics_data['stable_id'].values
    gene_info_list = mg.getgenes(ensembl_ids, fields='symbol')
    gene_df = pd.DataFrame.from_dict(gene_info_list)
    for_tpm = pd.merge(
        transciptomics_data,
        gene_df[['query', 'symbol']],
        how='inner',
        left_on="stable_id",
        right_on="query",
    )
    # ids for which no symbol was returned cannot be mapped further
    for_tpm = for_tpm.dropna(subset=['symbol'])
    for_tpm = for_tpm.drop(columns=['query', 'stable_id'])
    for_tpm = for_tpm.rename(columns={'symbol': 'stable_id'})
    # NOTE(review): the DataFrame index is written as an extra leading column,
    # as in the original code; confirm tpmFromCounts.py expects that.
    for_tpm.to_csv(counts_path, sep='\t')

    # run tpmFromCounts.py to convert counts to tpm.
    # check=True aborts on failure; previously the exit status was ignored and
    # a stale output file from an earlier run could be read back silently.
    subprocess.run(
        [
            "python3", "tpmFromCounts.py",
            "--counts", counts_path,
            "--genome_build", genome_build_url,
            "--gene_col", "stable_id",
            "--exclude_col", "stable_id",
            "--out_file", tpm_path,
        ],
        check=True,
    )

    # melt the df so there is one sample and gene per row; melt puts the
    # former column names (sample names) into a column called 'variable'
    long_transcriptomics_df = pd.read_csv(tpm_path, sep='\t')
    long_transcriptomics_df = pd.melt(long_transcriptomics_df, id_vars=['stable_id'])
    long_transcriptomics_df = long_transcriptomics_df.rename(columns={'value': 'transcriptomics'})

    # map gene names to entrez ids
    mapped_transcriptomics_df = pd.merge(
        long_transcriptomics_df,
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="stable_id",
        right_on="other_id",
    )
    mapped_transcriptomics_df = mapped_transcriptomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample ids; both sides now carry an
    # 'other_id' column, so pandas suffixes them with _x / _y
    mapped_transcriptomics_df = pd.merge(
        mapped_transcriptomics_df,
        improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="variable",
        right_on="other_id",
    )

    # clean up column names and data types
    mapped_transcriptomics_df = mapped_transcriptomics_df.drop(
        columns=['stable_id', 'variable', 'other_id_x', 'other_id_y']
    )
    mapped_transcriptomics_df['source'] = "Synapse"
    mapped_transcriptomics_df['study'] = "liver"
    mapped_transcriptomics_df = mapped_transcriptomics_df.dropna()
    mapped_transcriptomics_df = mapped_transcriptomics_df.astype(
        {'entrez_id': 'int', 'improve_sample_id': 'int'}
    )
    mapped_transcriptomics_df = mapped_transcriptomics_df[
        ['entrez_id', 'transcriptomics', 'improve_sample_id', 'source', 'study']
    ]

    return mapped_transcriptomics_df
229278

230279

231280
def map_proteomics(proteomics_data, improve_id_data, entrez_data):

0 commit comments

Comments
 (0)