|
| 1 | +library(ggplot2) |
| 2 | +library(ggpubr) |
| 3 | +library(reshape2) |
| 4 | + |
| 5 | +scale_minMax <- function(x){ |
| 6 | + x_min = min(x) |
| 7 | + x_max = max(x) |
| 8 | + scaled = (x-x_min)/(x_max-x_min) |
| 9 | + return(scaled) |
| 10 | +} |
| 11 | + |
| 12 | +scale_Max <- function(x){ |
| 13 | + x_max = max(x) |
| 14 | + scaled = (x)/(x_max) |
| 15 | + return(scaled) |
| 16 | +} |
| 17 | + |
| 18 | + |
| 19 | +add_emp_pvalue <- function(fcat_df, a_model){ |
| 20 | + ### input: dataframe of merged fcat scores for shuffled and baseline fca scores. |
| 21 | + ### counts the number of observations in the empirical null distribution which are higher than the given fca score (fca_emp_h) |
| 22 | + ### calculates the empirical p-value by dividing the fca_emp_h by the total number of null dist observarions |
| 23 | + fcat_df_shuffle= fcat_df[fcat_df$type == 'shuffle',] |
| 24 | + null_empirical_dist = fcat_df_shuffle$importance[fcat_df_shuffle$model==a_model] |
| 25 | + |
| 26 | + model_fcat_base = fcat_df[fcat_df$type == 'baseline' & fcat_df$model == a_model,] |
| 27 | + model_fcat_base$pvalue = sapply(1:nrow(model_fcat_base), |
| 28 | + function(i) sum(null_empirical_dist>model_fcat_base$importance[i])/length(null_empirical_dist), |
| 29 | + simplify = T) |
| 30 | + return(model_fcat_base) |
| 31 | +} |
| 32 | + |
| 33 | + |
| 34 | +fcat_single_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_single_baseline.csv') |
| 35 | +fcat_single_base$type = 'baseline' |
| 36 | + |
| 37 | +file = '/home/delaram/sciRED/benchmark/scMix/shuffle/single/' |
| 38 | +fcat_single_list = lapply(list.files(file, pattern = "fcat_scMix*", full.names = T), read.csv) |
| 39 | +fcat_single_shuffle <- Reduce(rbind,fcat_single_list) |
| 40 | +fcat_single_shuffle$type = 'shuffle' |
| 41 | +head(fcat_single_shuffle) |
| 42 | + |
| 43 | +fcat_single = rbind(fcat_single_base, fcat_single_shuffle) |
| 44 | +fcat_single$importance_abs = abs(fcat_single$importance) |
| 45 | + |
| 46 | +ggplot(fcat_single, aes(x=model, y=importance, fill=type))+ |
| 47 | + geom_boxplot()+theme_classic()+ |
| 48 | + coord_flip()+scale_fill_manual(values=c("#999999", "maroon")) |
| 49 | + |
| 50 | +fcat_models<- split(fcat_single, fcat_single$model) |
| 51 | +#### scaling various classifier scores |
| 52 | +sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_scale <<- scale(fcat_models[[i]]$importance, center = FALSE)}, simplify = F) |
| 53 | +sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_z_trans <<- scale(fcat_models[[i]]$importance)}, simplify = F) |
| 54 | +sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_minmax <<- scale_minMax(fcat_models[[i]]$importance)}, simplify = F) |
| 55 | +sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_max_scale <<- scale_Max(fcat_models[[i]]$importance)}, simplify = F) |
| 56 | + |
| 57 | + |
| 58 | +###### Figure B for the benchmark panel |
| 59 | +fcat_models_df = Reduce(rbind, fcat_models) |
| 60 | +ggplot(fcat_models_df, aes(x=model, y=importance, fill=type))+geom_boxplot()+ |
| 61 | + theme_classic()+coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+ |
| 62 | + theme(text = element_text(size=18))+xlab('') |
| 63 | + |
| 64 | +ggplot(fcat_models_df, aes(x=model, y=imp_minmax, fill=type))+geom_boxplot()+theme_classic()+ |
| 65 | + coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+ |
| 66 | + theme(text = element_text(size=18))+xlab('')+ylab('Importance score (min-max scaled)') |
| 67 | + |
| 68 | + |
| 69 | +########### sanity check ########### |
| 70 | +fcat_models_df_base= fcat_models_df[fcat_models_df$type == 'baseline',] |
| 71 | +fcat_models_df_shuffle = fcat_models_df[fcat_models_df$type == 'shuffle',] |
| 72 | + |
| 73 | +model_names = names(table(fcat_models_df_shuffle$model)) |
| 74 | +ggplot(fcat_models_df_shuffle, aes(x=importance, fill=model))+ |
| 75 | + geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1') |
| 76 | + |
| 77 | +ggplot(fcat_models_df_shuffle, aes(x=imp_minmax, fill=model))+ |
| 78 | + geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1') |
| 79 | + |
| 80 | +a_model = "RandomForest" |
| 81 | +model_imp_shuffle_values = fcat_models_df_shuffle$importance[fcat_models_df_shuffle$model==a_model] |
| 82 | +ggplot(fcat_models_df_shuffle, aes(x=importance))+geom_histogram( bins=200,fill='grey')+ |
| 83 | + theme_classic()+ggtitle(a_model)+theme(text = element_text(size=18))+xlab('FCA scores for a single model')+ |
| 84 | + ylab("Frequency")+geom_vline(xintercept=0.09, color = "red", size=1, linetype="dashed") |
| 85 | + |
| 86 | + |
| 87 | +cor_df = data.frame(imp=fcat_models_df_base$importance, model=fcat_pvalue_df_base$model) |
| 88 | +cor_df_models<- split(cor_df, cor_df$model) |
| 89 | +sapply(1:length(cor_df_models), function(i) colnames(cor_df_models[[i]])[1]<<-names(cor_df_models)[i]) |
| 90 | +cor_df_merged = Reduce(cbind, cor_df_models) |
| 91 | +cor_df_merged <- cor_df_merged[,colnames(cor_df_merged) %in% names(cor_df_models)] |
| 92 | +cor_mat = cor(cor_df_merged) |
| 93 | +pheatmap::pheatmap(cor_mat, display_numbers = TRUE) |
| 94 | +########### ########### ########### |
| 95 | + |
| 96 | +########### calculating empirical p-values |
| 97 | +fcat_pvalue_list = sapply(1:length(model_names), function(i){add_emp_pvalue(fcat_models_df, model_names[i])}, simplify = F) |
| 98 | +names(fcat_pvalue_list) = model_names |
| 99 | +ggplot(fcat_pvalue_list$DecisionTree, aes(x=pvalue))+geom_histogram(alpha=0.8, bins=100)+theme_classic()+ggtitle(a_model) |
| 100 | + |
| 101 | +fcat_pvalue_df = Reduce(rbind, fcat_pvalue_list) |
| 102 | +head(fcat_pvalue_df) |
| 103 | + |
| 104 | +ggplot(fcat_pvalue_df, aes(x=model, y=pvalue, fill=model))+geom_boxplot(alpha=0.7)+ |
| 105 | + theme_classic()+scale_fill_brewer(palette = 'Set1')+coord_flip() |
| 106 | + |
| 107 | +ggplot(fcat_pvalue_df, aes(x=pvalue, fill=model))+ |
| 108 | + geom_density(alpha=0.5)+theme_classic()+scale_fill_brewer(palette = 'Set1') |
| 109 | + |
| 110 | + |
| 111 | +sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='XGB'] < 0.05) |
| 112 | +sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='RandomForest'] < 0.05) |
| 113 | +sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='DecisionTree'] < 0.05) |
| 114 | + |
| 115 | + |
| 116 | +############################################################################################### |
| 117 | +########################## importance evaluation for model comparison |
| 118 | +################################################################################################ |
| 119 | + |
| 120 | +fcat_mean_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_mean_baseline.csv') |
| 121 | +fcat_mean_base$type = 'baseline' |
| 122 | + |
| 123 | +file = '/home/delaram/sciRED/benchmark/scMix/shuffle/mean/' |
| 124 | +fcat_mean_list = lapply(list.files(file, pattern = "fcat_scMix_mean*", full.names = T), read.csv) |
| 125 | +fcat_mean_shuffle <- Reduce(rbind,fcat_single_list) |
| 126 | +fcat_mean_shuffle$type = 'shuffle' |
| 127 | +head(fcat_mean_shuffle) |
| 128 | + |
| 129 | +fcat_mean = rbind(fcat_mean_base, fcat_mean_shuffle) |
| 130 | +fcat_mean_m = melt(fcat_mean) |
| 131 | +ggplot(fcat_mean_m, aes(y=value, x=type, fill=type))+geom_boxplot()+coord_flip()+ylab('Mean fcat') |
| 132 | + |
| 133 | +fcat_mean_base_m = melt(fcat_mean_base) |
| 134 | +fcat_mean_shuffle_m = melt(fcat_mean_shuffle) |
| 135 | + |
| 136 | +fcat_mean_base_df = data.frame(cov_level=fcat_mean_base_m$X, |
| 137 | + factor=fcat_mean_base_m$variable, |
| 138 | + imp_score=fcat_mean_base_m$value, |
| 139 | + res=fcat_mean_base_m$residual_type) |
| 140 | +head(fcat_mean_base_df) |
| 141 | + |
| 142 | + |
| 143 | +fcat_mean_shuffle_split <- split(fcat_mean_shuffle_m, fcat_mean_shuffle_m$residual_type) |
| 144 | +fcat_mean_base_split <- split(fcat_mean_base_m, fcat_mean_base_m$residual_type) |
| 145 | + |
| 146 | +### this loop is helpful in cases were we try various residuals |
| 147 | +for(i in 1:length(fcat_mean_shuffle_split)){ |
| 148 | + a_mean_df_shuffle = fcat_mean_shuffle_split[[i]] |
| 149 | + a_mean_df_base = fcat_mean_base_split[[i]] |
| 150 | + fcat_mean_base_split[[i]]$pval = sapply(1:nrow(a_mean_df_base), function(i) |
| 151 | + sum(a_mean_df_shuffle$value>a_mean_df_base$value[i])/nrow(a_mean_df_shuffle)) |
| 152 | +} |
| 153 | + |
| 154 | +tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.05))), |
| 155 | + pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.01))), |
| 156 | + pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.001)))) |
| 157 | + |
| 158 | +gridExtra::grid.table(t(tab)) |
| 159 | +dev.off() |
| 160 | + |
| 161 | +tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.05)/180,2))), |
| 162 | + pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.01)/180,2))), |
| 163 | + pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.001)/180,2)))) |
| 164 | +gridExtra::grid.table(t(tab)) |
| 165 | + |
| 166 | +thr = 0.01 |
| 167 | +sapply(1:length(fcat_mean_base_split), function(i) {fcat_mean_base_split[[i]]$sig <<- fcat_mean_base_split[[i]]$pval < thr}) |
| 168 | + |
| 169 | +AvgFacSig_df_model = sapply(1:length(fcat_mean_base_split), function(i){ |
| 170 | + a_model_imp_df = fcat_mean_base_split[[i]] |
| 171 | + a_model_imp_df_cov = split(a_model_imp_df, a_model_imp_df$X) |
| 172 | + AvgFacSig = sapply(1:length(a_model_imp_df_cov), function(i){ |
| 173 | + sum(a_model_imp_df_cov[[i]]$sig) |
| 174 | + }) |
| 175 | + names(AvgFacSig) = names(a_model_imp_df_cov) |
| 176 | + return(AvgFacSig) |
| 177 | +}, simplify = T) |
| 178 | + |
| 179 | +colnames(AvgFacSig_df_model) = names(fcat_mean_base_split) |
| 180 | +AvgFacSig_df_model_m = melt(AvgFacSig_df_model) |
| 181 | +head(AvgFacSig_df_model_m) |
| 182 | + |
| 183 | +ggplot(AvgFacSig_df_model_m, aes(y=value,x=Var2))+geom_boxplot()+ |
| 184 | + theme_classic()+scale_fill_brewer(palette = 'Set1')+ |
| 185 | + coord_flip()+theme(text = element_text(size=17))+xlab('')+ |
| 186 | + ylab('Average #sig matched factors per covariate level')+ |
| 187 | + geom_hline(yintercept=1, color = "red", size=1, linetype="dashed")+ |
| 188 | + ggtitle(paste0('pvalue threshold=',thr)) |
| 189 | + |
0 commit comments