-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_clinical_data_cleaned_brightness.py
More file actions
executable file
·120 lines (94 loc) · 4.56 KB
/
get_clinical_data_cleaned_brightness.py
File metadata and controls
executable file
·120 lines (94 loc) · 4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 4 15:30:59 2022
@author: dhrubas2
"""
## set up necessary directories/paths.
_wpath_ = "/Users/dhrubas2/OneDrive - National Institutes of Health/Projects/TMEcontribution/analysis/submission/Code/analysis/"
_mpath_ = "miscellaneous/py/"
## load necessary packages.
import os, sys
sys.path.append(_wpath_); os.chdir(_wpath_) # current path
if _mpath_ not in sys.path:
sys.path.append(_mpath_) # to load miscellaneous
import numpy as np, pandas as pd
from miscellaneous import date_time, write_xlsx
#%% read clinical data.
data_path = "../data/BrighTNess/"
data_file = "GSE164458_series_matrix.txt"

## the lines of the series matrix carry different numbers of tab-separated
## fields, so the column names are supplied explicitly instead of being read
## from a header: 'field' (used as the index) followed by 500 placeholder
## columns 'ln0'...'ln499'. the placeholder count has to be larger than the
## largest number of fields on any line, otherwise reading fails with an error.
## tip: https://stackoverflow.com/questions/55589500/reading-text-file-with-variable-columns-in-pandas-dataframe
clin_data_txt = pd.read_table(
    f"{data_path}{data_file}",
    names = ["field", *(f"ln{nn}" for nn in range(500))],
    header = None, index_col = 0, dtype = str,
    sep = "\t", engine = "python"
).dropna(axis = 1, how = "all")                                                # drop placeholder columns that are empty on every line

## strip every '!' from the field names.
clin_data_txt.index = [
    field_name.replace("!", "") for field_name in clin_data_txt.index
]
#%% clean data.
## get simplified treatment list per patient ['PBO' stands for placebo].
def get_treatments(arm_description):
    """
    Build a simplified treatment list from a planned-arm description.

    The description is split on '+'. Components containing 'PBO' (placebo)
    are discarded. For each remaining component only the first
    space-delimited word is kept, capitalized (first letter upper case, the
    rest lower case). The kept words are joined back together with '+'.

    Parameters
    ----------
    arm_description : str
        Arm description with its components separated by '+'.

    Returns
    -------
    str
        The '+'-joined treatment names; an empty string when every component
        is a placebo. A non-string input (e.g., nan for a missing
        description) is returned unchanged instead of raising.
    """
    ## missing descriptions arrive as nan (a float) and have no .split().
    if not isinstance(arm_description, str):
        return arm_description

    active_arms = (
        arm for arm in arm_description.split("+") if "PBO" not in arm
    )
    return "+".join(
        arm.strip().split(" ")[0].capitalize() for arm in active_arms
    )
## get sample characteristics.
## 'Sample_characteristics_ch1' names several rows, so selecting that label
## returns all of them; after transposing there is one row per sample and one
## column per characteristics row.
keep = ["Sample_title", "Sample_characteristics_ch1"]
clin_data_samples = clin_data_txt.loc[keep].T.copy()

## index the samples as 'Sample_{sample_id}', where sample_id is the part of
## the sample title before the first '_'.
clin_data_samples.index = clin_data_samples[keep[0]].apply(
    lambda id_: f"Sample_{id_.split('_')[0]}"
).astype(str)
clin_data_samples.drop(columns = keep[0], inplace = True)

## every cell reads '<name>: <value>'. the column names are taken from the
## first sample only.
## NOTE(review): this assumes all samples list their characteristics in the
## same order - confirm against the series matrix.
clin_data_samples.columns = clin_data_samples.iloc[0].apply(
    lambda char_: char_.split(": ")[0]                                         # keep characteristics name
).astype(str).values

## keep the characteristics value. split at the first ': ' only, so a value
## that itself contains ': ' is kept whole rather than cut at the second ': '.
clin_data_samples = clin_data_samples.applymap(
    lambda info_: info_.split(": ", 1)[1]
)
clin_data_samples[clin_data_samples == "NA"] = np.nan                         # replace 'NA' with nan
## simplified treatment list for ease of filtering.
clin_data_samples["treatment"] = (
    clin_data_samples["description_of_planned_arm"].apply(get_treatments)
)

## from Methods/Procedures in Loibl et al. 2018 [PMID: 29501363]:
##     "Patients who did not have surgery were counted as not achieving
##     pathological complete response."
## a sample is flagged as having had surgery when its residual cancer burden
## class is available (i.e., not nan).
clin_data_samples["surgery"] = (
    clin_data_samples["residual_cancer_burden_class"].notna()
)
## convert to appropriate data types.
## every column that can be cast to float becomes float (this includes the
## boolean 'surgery' flag, which turns into 1.0/0.0); all others become str.
## NOTE(review): astype(str) renders missing values as the text 'nan' -
## confirm that downstream code expects this.
for fld_, info_ in clin_data_samples.items():
    try:
        clin_data_samples[fld_] = info_.astype(float)
    except (ValueError, TypeError):                                            # non-numeric column; a bare 'except' would also swallow KeyboardInterrupt and unrelated bugs
        clin_data_samples[fld_] = info_.astype(str)
## reorder columns for convenience.
## only the columns listed here are kept, in this order; a listed column that
## does not exist raises a KeyError.
ordered_cols = [
    "description_of_planned_arm",
    "pretreatment_lymphnode_stage",
    "ac_planned_schedule",
    "smoking_history",
    "ecog_ps_baseline",
    "planned_arm_code",
    "treatment",
    "surgery",
    "residual_cancer_burden_class",
    "pathologic_complete_response"
]
clin_data_samples = clin_data_samples[ordered_cols]

## report the data type assigned to each column.
print(f"\nvariable types = \n{clin_data_samples.dtypes}")
## additional info [all the fields with a single available line].
## a field is kept when all but one of its columns are missing.
clin_data_supp = clin_data_txt[
    clin_data_txt.isna().sum(axis = 1) == (clin_data_txt.shape[1] - 1)
]
## drop the columns that are empty for all kept fields, then squeeze.
## NOTE(review): the code below treats the result as a series, i.e., it
## presumes exactly one column is left after dropna - confirm.
clin_data_supp = clin_data_supp.dropna(axis = 1, how = "all").squeeze()
clin_data_supp[clin_data_supp == "NA"] = np.nan                               # replace 'NA' with nan

## convert to appropriate data types: float where possible, otherwise str.
## values are written back by position, not by label: a field name can occur
## more than once, and a label-based assignment would overwrite every entry
## sharing that name with the value processed last.
for pos_, (fld_, info_) in enumerate(clin_data_supp.items()):
    try:
        clin_data_supp.iloc[pos_] = float(info_)
    except (ValueError, TypeError):                                            # not a number; a bare 'except' would also swallow KeyboardInterrupt and unrelated bugs
        clin_data_supp.iloc[pos_] = str(info_)
#%% save data.
## nothing is written unless the flag below is switched on by hand.
svdat = False                                                                  # set True to save data
if svdat:
    ## date_time() and write_xlsx() come from the project-local
    ## 'miscellaneous' module; the value returned by date_time() is embedded
    ## in the output file name.
    datestamp = date_time()
    clin_file = f"GSE164458_BrighTNess_clinical_info_SRD_{datestamp}.xlsx"

    ## the dict keys are 'samples' and 'additional_info'.
    ## NOTE(review): presumably write_xlsx uses them as sheet names - confirm.
    clin_info = dict(
        samples         = clin_data_samples,
        additional_info = clin_data_supp
    )
    write_xlsx(f"{data_path}{clin_file}", data = clin_info)