Skip to content

Commit 87c321d

Browse files
committed
Fixed liver proteomic issues for sure this time
1 parent 7f58935 commit 87c321d

File tree

1 file changed

+50
-38
lines changed

1 file changed

+50
-38
lines changed

coderbuild/liver/02-omics-liver.py

Lines changed: 50 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -278,48 +278,60 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
278278

279279

280280
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape liver proteomics data to long format and attach entrez/sample ids.

    Each argument may be either an already-loaded ``pd.DataFrame`` or anything
    ``pd.read_csv`` accepts (e.g. a file path). DataFrames passed in by the
    caller are never modified; all cleaning happens on derived copies.

    Parameters
    ----------
    proteomics_data : pd.DataFrame or path
        Wide table with one gene-symbol column (raw header
        ``"Sample \\nGene symbol"`` or already ``"gene_symbol"``) and one
        column per sample. When read from disk, the second line of the file
        is used as the header and the first column as the index.
    improve_id_data : pd.DataFrame or path
        Must contain ``other_id`` (matched against the sample column names)
        and ``improve_sample_id``.
    entrez_data : pd.DataFrame or path
        Must contain ``other_id`` (matched against the gene symbols) and
        ``entrez_id``.

    Returns
    -------
    pd.DataFrame
        Columns ``entrez_id``, ``proteomics``, ``improve_sample_id``,
        ``source`` and ``study``; one row per (gene, sample) pair that could
        be mapped on both keys.

    Raises
    ------
    KeyError
        If no gene-symbol column can be found in ``proteomics_data``.
    """
    raw_gene_header = "Sample \nGene symbol"

    if not isinstance(proteomics_data, pd.DataFrame):
        # header=1: the first line of the file is skipped and the second
        # line supplies the column names.
        proteomics_data = pd.read_csv(proteomics_data, header=1, index_col=0)
    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)
    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # rename() returns a new frame, so the caller's column labels are left
    # untouched even when a DataFrame was passed in.
    wide_df = proteomics_data.rename(columns=lambda label: str(label).strip())
    wide_df = wide_df.rename(columns={raw_gene_header: "gene_symbol"})
    if "gene_symbol" not in wide_df.columns:
        raise KeyError(
            "proteomics data has no gene symbol column; expected "
            f"{raw_gene_header!r} or 'gene_symbol', found {list(wide_df.columns)}"
        )

    # Rows without a gene symbol cannot be mapped; drop them before the
    # string conversion so missing values do not become the text "nan".
    wide_df = wide_df.dropna(subset=["gene_symbol"])
    wide_df = wide_df.assign(
        gene_symbol=wide_df["gene_symbol"].astype(str).str.strip()
    )

    sample_cols = [col for col in wide_df.columns if col != "gene_symbol"]
    long_df = wide_df.melt(
        id_vars=["gene_symbol"],
        value_vars=sample_cols,
        var_name="sample_name",
        value_name="proteomics",
    )

    # Build the two lookup tables on copies, with whitespace-normalised
    # string keys renamed to match the join columns of the long table.
    gene_lookup = (
        entrez_data[["other_id", "entrez_id"]]
        .assign(other_id=lambda df: df["other_id"].astype(str).str.strip())
        .drop_duplicates()
        .rename(columns={"other_id": "gene_symbol"})
    )
    sample_lookup = (
        improve_id_data[["other_id", "improve_sample_id"]]
        .assign(other_id=lambda df: df["other_id"].astype(str).str.strip())
        .drop_duplicates()
        .rename(columns={"other_id": "sample_name"})
    )

    mapped_df = long_df.merge(gene_lookup, how="inner", on="gene_symbol")
    mapped_df = mapped_df.merge(sample_lookup, how="inner", on="sample_name")

    mapped_df["source"] = "Synapse"
    mapped_df["study"] = "liver"

    # Ids must be present before they can be cast to integers.
    mapped_df = mapped_df.dropna(subset=["entrez_id", "improve_sample_id"])
    mapped_df = mapped_df.astype({"entrez_id": "int", "improve_sample_id": "int"})

    return mapped_df[
        ["entrez_id", "proteomics", "improve_sample_id", "source", "study"]
    ]
323335

324336

325337
if __name__ == "__main__":

0 commit comments

Comments
 (0)