2
2
3
3
This code demonstrates the feature selection methods that CANDLE provides.
4
4
5
## Download data
Code
``` python
# download all the data if needed from the repo
data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/'
file_name = 'small_drug_descriptor_data_unique_samples.txt'
drug_descriptor = candle.get_file(file_name, data_url + file_name, cache_subdir='examples')
5
12
6
file_name = 'small_drug_response_data.txt'
response_data = candle.get_file(file_name, data_url + file_name, cache_subdir='examples')

file_name = 'Gene_Expression_Full_Data_Unique_Samples.txt'
gene_expression = candle.get_file(file_name, data_url + file_name, cache_subdir='examples')
8
18
19
file_name = 'CCLE_NCI60_Gene_Expression_Full_Data.txt'
ccle_nci60 = candle.get_file(file_name, data_url + file_name, cache_subdir='examples')
```
Output
``` bash
9
24
Importing candle utils for keras
10
25
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_descriptor_data_unique_samples.txt
11
26
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/small_drug_response_data.txt
12
27
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/Gene_Expression_Full_Data_Unique_Samples.txt
13
28
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Data_For_Testing/CCLE_NCI60_Gene_Expression_Full_Data.txt
29
```

## Download gene set
Code
``` python
# download all the gene_set files needed
data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/'
for gene_set_category in ['c2.cgp', 'c2.cp.biocarta', 'c2.cp.kegg', 'c2.cp.pid', 'c2.cp.reactome', 'c5.bp', 'c5.cc', 'c5.mf', 'c6.all']:
    for gene_name_type in ['entrez', 'symbols']:
        file_name = gene_set_category + '.v7.0.' + gene_name_type + '.gmt'
        local_file = candle.get_file(file_name, data_url + file_name, cache_subdir='examples/Gene_Sets/MSigDB.v7.0')
# extract base directory for gene_set data files
data_dir = local_file.split(file_name)[0]
print('Gene Set data is locally stored at ', data_dir)
```
Output
```
14
46
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.entrez.gmt
15
47
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cgp.v7.0.symbols.gmt
16
48
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c2.cp.biocarta.v7.0.entrez.gmt
@@ -30,8 +62,24 @@ Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_
30
62
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.entrez.gmt
31
63
Origin = http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/uno/Candle_Milestone_16_Version_12_15_2019/Data/Gene_Sets/MSigDB.v7.0/c6.all.v7.0.symbols.gmt
32
64
Gene Set data is locally stored at /Users/hsyoo/projects/CANDLE/Benchmarks/common/../Data/examples/Gene_Sets/MSigDB.v7.0/
65
```

# Select features based on missing values
Code
``` python
print('Testing select_features_by_missing_values')
print('Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)')
data = pd.read_csv(drug_descriptor, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=0, low_memory=False)
print(data)
print('Select features with missing rates smaller than 0.1')
id = candle.select_features_by_missing_values(data, threshold=0.1)
print('Feature IDs', id)
print('Select features with missing rates smaller than 0.3')
id = candle.select_features_by_missing_values(data.values, threshold=0.3)
print('Feature IDs', id)
```
Output
``` bash
35
83
Testing select_features_by_missing_values
36
84
Drug descriptor dataframe includes 10 drugs (rows) and 10 drug descriptor features (columns)
37
85
MW AMW Sv Se ... Mv Psi_e_1d Psi_e_1s VE3sign_X
@@ -51,22 +99,65 @@ Select features with missing rates smaller than 0.1
51
99
Feature IDs [0 1 2 3 4 5 6]
52
100
Select features with missing rates smaller than 0.3
53
101
Feature IDs [0 1 2 3 4 5 6 9]
102
```

# Select features based on variation
Code
``` python
print('Testing select_features_by_variation')
print('Select features with a variance larger than 100')
id = candle.select_features_by_variation(data, variation_measure='var', threshold=100, portion=None, draw_histogram=False)
print('Feature IDs', id)
print('Select the top 2 features with the largest standard deviation')
id = candle.select_features_by_variation(data, variation_measure='std', portion=0.2)
print('Feature IDs', id)
```
Output
```
56
118
Testing select_features_by_variation
57
119
Select features with a variance larger than 100
58
120
Feature IDs [0 3 5]
59
121
Select the top 2 features with the largest standard deviation
60
122
Feature IDs [0 5]
123
```

# Select decorrelated features
Code
``` python
print('Testing select_decorrelated_features')
print('Select features that are not identical to each other and are not all missing.')
id = candle.select_decorrelated_features(data, threshold=None, random_seed=None)
print('Feature IDs', id)
print('Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8')
id = candle.select_decorrelated_features(data, method='spearman', threshold=0.8, random_seed=10)
print('Feature IDs', id)
```
Output
```
63
138
Testing select_decorrelated_features
64
139
Select features that are not identical to each other and are not all missing.
65
140
Feature IDs [0 1 2 3 4 5 6 9]
66
141
Select features whose absolute mutual Spearman correlation coefficient is smaller than 0.8
67
142
Feature IDs [0 2 6 9]
143
```

# Generate cross-validation partitions of data
Code
``` python
print('Testing generate_cross_validation_partition')
print('Generate 5-fold cross-validation partition of 10 samples twice')
p = candle.generate_cross_validation_partition(range(10), n_folds=5, n_repeats=2, portions=None, random_seed=None)
print(p)
print('Drug response data of 5 cell lines treated by various drugs.')
data = pd.read_csv(response_data, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=None, low_memory=False)
print(data)
print('Generate partition indices to divide the data into 4 sets without shared cell lines for 5 times.')
p = candle.generate_cross_validation_partition(data.CELL, n_folds=5, n_repeats=1, portions=[1, 1, 1, 2], random_seed=1)
print(p)
```
Output
```
70
161
Testing generate_cross_validation_partition
71
162
Generate 5-fold cross-validation partition of 10 samples twice
72
163
[[[0, 5], [1, 2, 3, 4, 6, 7, 8, 9]], [[1, 6], [0, 2, 3, 4, 5, 7, 8, 9]], [[2, 7], [0, 1, 3, 4, 5, 6, 8, 9]], [[3, 8], [0, 1, 2, 4, 5, 6, 7, 9]], [[4, 9], [0, 1, 2, 3, 5, 6, 7, 8]], [[5, 8], [0, 1, 2, 3, 4, 6, 7, 9]], [[3, 9], [0, 1, 2, 4, 5, 6, 7, 8]], [[2, 4], [0, 1, 3, 5, 6, 7, 8, 9]], [[1, 7], [0, 2, 3, 4, 5, 6, 8, 9]], [[0, 6], [1, 2, 3, 4, 5, 7, 8, 9]]]
@@ -95,9 +186,34 @@ found 0 categorical variables:
95
186
Standardizing Data across genes.
96
187
Fitting L/S model and finding priors
97
188
Finding parametric adjustments
189
```

# Quantile normalization of gene expression data
Code
``` python
print('Testing quantile_normalization')
print('Gene expression data of 897 cell lines (columns) and 17741 genes (rows).')
data = pd.read_csv(gene_expression, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False)
print(data)
print('Before normalization')
third_quartile = data.quantile(0.75, axis=0)
print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2)))
second_quartile = data.quantile(0.5, axis=0)
print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2)))
first_quartile = data.quantile(0.25, axis=0)
print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2)))
norm_data = candle.quantile_normalization(np.transpose(data))
norm_data = np.transpose(norm_data)
print('After normalization')
third_quartile = norm_data.quantile(0.75, axis=0)
print('Max difference of third quartile between cell lines is ' + str(np.round(a=np.max(third_quartile) - np.min(third_quartile), decimals=2)))
second_quartile = norm_data.quantile(0.5, axis=0)
print('Max difference of median between cell lines is ' + str(np.round(a=np.max(second_quartile) - np.min(second_quartile), decimals=2)))
first_quartile = norm_data.quantile(0.25, axis=0)
print('Max difference of first quartile between cell lines is ' + str(np.round(a=np.max(first_quartile) - np.min(first_quartile), decimals=2)))
```
Output
```
101
217
Testing quantile_normalization
102
218
Gene expression data of 897 cell lines (columns) and 17741 genes (rows).
103
219
CCL_61 CCL_62 CCL_63 ... CCL_1076 CCL_1077 CCL_1078
@@ -123,8 +239,22 @@ After normalization
123
239
Max difference of third quartile between cell lines is 0.01
124
240
Max difference of median between cell lines is 0.02
125
241
Max difference of first quartile between cell lines is 0.06
242
```

# Generate gene-set-level data
``` python
print('Testing generate_gene_set_data')
gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[0] for i in norm_data.index], gene_name_type='entrez',
                                              gene_set_category='c6.all', metric='mean', standardize=True, data_dir=data_dir)
print('Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets')
print(gene_set_data)
gene_set_data = candle.generate_gene_set_data(np.transpose(norm_data), [i[1] for i in norm_data.index], gene_name_type='symbols',
                                              gene_set_category='c2.cp.kegg', metric='sum', standardize=True, data_dir=data_dir)
print('Generate gene-set-level data of 897 cell lines and 186 KEGG pathways')
print(gene_set_data)
```
Output
```
128
258
Testing generate_gene_set_data
129
259
Generate gene-set-level data of 897 cell lines and 189 oncogenic signature gene sets
130
260
GLI1_UP.V1_DN GLI1_UP.V1_UP ... LEF1_UP.V1_DN LEF1_UP.V1_UP
@@ -156,8 +286,57 @@ CCL_1077 -11.576702 ... -31.679962
156
286
CCL_1078 -10.355489 ... -26.232325
157
287
158
288
[897 rows x 186 columns]
289
```

# Combat batch normalization on gene expression data
Code
``` python
print('Testing combat_batch_effect_removal')
print('Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.')
data = pd.read_csv(ccle_nci60, sep='\t', engine='c', na_values=['na', '-', ''], header=0, index_col=[0, 1], low_memory=False)
print(data)
resource = np.array([i.split('.')[0] for i in data.columns])

print('Before removal of batch effect between NCI60 and CCLE datasets')
# Identify NCI60 cell lines and quantile normalize their gene expression data
id = np.where(resource == 'NCI60')[0]
norm_data_NCI60 = candle.quantile_normalization(np.transpose(data.iloc[:, id]))
print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.75, axis=1)), decimals=2)))
print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.5, axis=1)), decimals=2)))
print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(norm_data_NCI60.quantile(0.25, axis=1)), decimals=2)))

# Identify CCLE cell lines and quantile normalize their gene expression data
id = np.where(resource == 'CCLE')[0]
norm_data_CCLE = candle.quantile_normalization(np.transpose(data.iloc[:, id]))
print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.75, axis=1)), decimals=2)))
print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.5, axis=1)), decimals=2)))
print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(norm_data_CCLE.quantile(0.25, axis=1)), decimals=2)))

# Combine normalized data of NCI60 cell lines and CCLE cell lines
norm_data = pd.concat((norm_data_NCI60, norm_data_CCLE), axis=0)
norm_data = np.transpose(norm_data)

# Apply ComBat algorithm to remove the batch effect between NCI60 and CCLE
corrected_data = candle.combat_batch_effect_removal(norm_data, pd.Series([i.split('.')[0] for i in norm_data.columns], index=norm_data.columns))

print('After removal of batch effect between NCI60 and CCLE datasets')

resource = np.array([i.split('.')[0] for i in corrected_data.columns])
id = np.where(resource == 'NCI60')[0]
corrected_data_NCI60 = np.transpose(corrected_data.iloc[:, id])
print('Average third quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.75, axis=1)), decimals=2)))
print('Average median of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.5, axis=1)), decimals=2)))
print('Average first quartile of NCI60 cell lines is ' + str(np.round(a=np.mean(corrected_data_NCI60.quantile(0.25, axis=1)), decimals=2)))

# Identify CCLE cell lines and quantile normalize their gene expression data
id = np.where(resource == 'CCLE')[0]
corrected_data_CCLE = np.transpose(corrected_data.iloc[:, id])
print('Average third quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.75, axis=1)), decimals=2)))
print('Average median of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.5, axis=1)), decimals=2)))
print('Average first quartile of CCLE cell lines is ' + str(np.round(a=np.mean(corrected_data_CCLE.quantile(0.25, axis=1)), decimals=2)))
```
Output
```
161
340
Testing combat_batch_effect_removal
162
341
Gene expression data of 60 NCI60 cell lines and 1018 CCLE cell lines with 17741 genes.
163
342
NCI60.786-0|CCL_1 ... CCLE.ZR7530|CCL_1078
0 commit comments