minor additions to analysis files

delipouya · delipouya · commit 1ad7529e702e · 2024-08-01T13:12:47.000-04:00
diff --git a/Benchmark_scMix_estPval.R b/Benchmark_scMix_estPval.R
@@ -0,0 +1,189 @@
+library(ggplot2)
+library(ggpubr)
+library(reshape2)
+
+scale_minMax <- function(x){
+  ### min-max scaling: shift and stretch x so that its minimum becomes 0 and its maximum becomes 1
+  x_range <- range(x)
+  shifted <- x - x_range[1]
+  shifted / (x_range[2] - x_range[1])
+}
+
+scale_Max <- function(x){
+  ### max scaling: express every element of x as a fraction of the largest element
+  largest <- max(x)
+  x / largest
+}
+
+
+add_emp_pvalue <- function(fcat_df, a_model){
+  ### input: fcat_df - merged FCA scores with columns `type` ('baseline'/'shuffle'), `model` and `importance`;
+  ###        a_model - name of the classifier (a value of `model`) to evaluate.
+  ### output: the baseline rows of a_model with an added `pvalue` column: the fraction of that model's
+  ###         shuffled (empirical null) scores that are strictly greater than each baseline score.
+  is_model = fcat_df$model == a_model
+  null_empirical_dist = fcat_df$importance[fcat_df$type == 'shuffle' & is_model]
+  model_fcat_base = fcat_df[fcat_df$type == 'baseline' & is_model,]
+  ### vapply over the scores (not sapply over 1:nrow) stays numeric and also works when there are no baseline rows
+  model_fcat_base$pvalue = vapply(model_fcat_base$importance,
+                                  function(score) mean(null_empirical_dist > score), numeric(1))
+  return(model_fcat_base)
+}
+
+
+fcat_single_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_single_baseline.csv')
+fcat_single_base$type = 'baseline'
+
+### shuffled (null) runs: list.files() takes a regular expression, not a glob, so anchor the file-name prefix
+file = '/home/delaram/sciRED/benchmark/scMix/shuffle/single/'
+fcat_single_list = lapply(list.files(file, pattern = "^fcat_scMix", full.names = TRUE), read.csv)
+fcat_single_shuffle <- do.call(rbind, fcat_single_list)
+fcat_single_shuffle$type = 'shuffle'
+head(fcat_single_shuffle)
+
+fcat_single = rbind(fcat_single_base, fcat_single_shuffle)
+fcat_single$importance_abs = abs(fcat_single$importance)
+
+ggplot(fcat_single, aes(x=model, y=importance, fill=type))+geom_boxplot()+theme_classic()+
+  coord_flip()+scale_fill_manual(values=c("#999999", "maroon"))
+
+fcat_models<- split(fcat_single, fcat_single$model)
+#### scaling various classifier scores within each model (plain loop instead of sapply + `<<-`)
+for (i in seq_along(fcat_models)) {
+  fcat_models[[i]]$imp_scale <- scale(fcat_models[[i]]$importance, center = FALSE)
+  fcat_models[[i]]$imp_z_trans <- scale(fcat_models[[i]]$importance)
+  fcat_models[[i]]$imp_minmax <- scale_minMax(fcat_models[[i]]$importance)
+  fcat_models[[i]]$imp_max_scale <- scale_Max(fcat_models[[i]]$importance)
+}
+###### Figure B for the benchmark panel: baseline vs shuffled importance scores for each classifier
+fcat_models_df = Reduce(rbind, fcat_models)
+ggplot(fcat_models_df, aes(x=model, y=importance, fill=type))+geom_boxplot()+
+  theme_classic()+coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+
+  theme(text = element_text(size=18))+xlab('')
+### same panel drawn on the min-max scaled importance scores (imp_minmax)
+ggplot(fcat_models_df, aes(x=model, y=imp_minmax, fill=type))+geom_boxplot()+theme_classic()+
+  coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+
+  theme(text = element_text(size=18))+xlab('')+ylab('Importance score (min-max scaled)')
+
+
+########### sanity check: distributions of the shuffled (null) importance scores ###########
+fcat_models_df_base= fcat_models_df[fcat_models_df$type == 'baseline',]
+fcat_models_df_shuffle = fcat_models_df[fcat_models_df$type == 'shuffle',]
+
+model_names = names(table(fcat_models_df_shuffle$model))  ### distinct model names present in the shuffled runs
+ggplot(fcat_models_df_shuffle, aes(x=importance, fill=model))+
+  geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1')
+### same histogram on the min-max scaled scores
+ggplot(fcat_models_df_shuffle, aes(x=imp_minmax, fill=model))+
+  geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1')
+
+a_model = "RandomForest"  ### the histogram below shows the null (shuffled) scores of this model only
+model_imp_shuffle_values = fcat_models_df_shuffle$importance[fcat_models_df_shuffle$model==a_model]
+ggplot(data.frame(importance=model_imp_shuffle_values), aes(x=importance))+geom_histogram( bins=200,fill='grey')+
+  theme_classic()+ggtitle(a_model)+theme(text = element_text(size=18))+xlab('FCA scores for a single model')+
+  ylab("Frequency")+geom_vline(xintercept=0.09, color = "red", size=1, linetype="dashed")
+
+### correlation between the classifiers' baseline importance scores (one column per model)
+cor_df = data.frame(imp=fcat_models_df_base$importance, model=fcat_models_df_base$model)
+cor_df_models<- split(cor_df, cor_df$model)
+for (i in seq_along(cor_df_models)) colnames(cor_df_models[[i]])[1] <- names(cor_df_models)[i]
+cor_df_merged = Reduce(cbind, cor_df_models)
+cor_df_merged <- cor_df_merged[,colnames(cor_df_merged) %in% names(cor_df_models), drop=FALSE]
+cor_mat = cor(cor_df_merged)
+pheatmap::pheatmap(cor_mat, display_numbers = TRUE)
+########### ########### ########### 
+
+########### calculating empirical p-values
+fcat_pvalue_list = lapply(model_names, function(a_name) add_emp_pvalue(fcat_models_df, a_name))
+names(fcat_pvalue_list) = model_names
+### the title names the model whose p-values are plotted (a_model holds "RandomForest" at this point)
+ggplot(fcat_pvalue_list$DecisionTree, aes(x=pvalue))+geom_histogram(alpha=0.8, bins=100)+theme_classic()+ggtitle('DecisionTree')
+
+fcat_pvalue_df = Reduce(rbind, fcat_pvalue_list)
+head(fcat_pvalue_df)
+
+ggplot(fcat_pvalue_df, aes(x=model, y=pvalue, fill=model))+geom_boxplot(alpha=0.7)+
+  theme_classic()+scale_fill_brewer(palette = 'Set1')+coord_flip()
+
+ggplot(fcat_pvalue_df, aes(x=pvalue, fill=model))+
+  geom_density(alpha=0.5)+theme_classic()+scale_fill_brewer(palette = 'Set1')
+
+sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='XGB'] < 0.05)
+sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='RandomForest'] < 0.05)
+sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='DecisionTree'] < 0.05)
+
+
+###############################################################################################
+########################## importance evaluation for model comparison
+################################################################################################
+
+fcat_mean_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_mean_baseline.csv')
+fcat_mean_base$type = 'baseline'
+### shuffled runs of the mean fcat: bind the files read from shuffle/mean/ (fcat_mean_list), not the single-model list
+file = '/home/delaram/sciRED/benchmark/scMix/shuffle/mean/'
+fcat_mean_list = lapply(list.files(file, pattern = "^fcat_scMix_mean", full.names = TRUE), read.csv)
+fcat_mean_shuffle <- do.call(rbind, fcat_mean_list)
+fcat_mean_shuffle$type = 'shuffle'
+head(fcat_mean_shuffle)
+
+fcat_mean = rbind(fcat_mean_base, fcat_mean_shuffle)
+fcat_mean_m = melt(fcat_mean)
+ggplot(fcat_mean_m, aes(y=value, x=type, fill=type))+geom_boxplot()+coord_flip()+ylab('Mean fcat')
+
+fcat_mean_base_m = melt(fcat_mean_base)  ### long format; the code below reads its X, residual_type, variable and value columns
+fcat_mean_shuffle_m = melt(fcat_mean_shuffle)
+
+fcat_mean_base_df = data.frame(cov_level=fcat_mean_base_m$X, 
+                                 factor=fcat_mean_base_m$variable,
+                                 imp_score=fcat_mean_base_m$value,
+                                 res=fcat_mean_base_m$residual_type)
+head(fcat_mean_base_df)
+
+### one data frame per residual type, for the shuffled and for the baseline scores
+fcat_mean_shuffle_split <- split(fcat_mean_shuffle_m, fcat_mean_shuffle_m$residual_type)
+fcat_mean_base_split <- split(fcat_mean_base_m, fcat_mean_base_m$residual_type)
+
+### this loop is helpful in cases where we try various residuals; pval = share of shuffled scores above each baseline score
+for(i in seq_along(fcat_mean_shuffle_split)){
+  a_mean_df_shuffle = fcat_mean_shuffle_split[[i]]
+  a_mean_df_base = fcat_mean_base_split[[i]]
+  fcat_mean_base_split[[i]]$pval = vapply(a_mean_df_base$value, function(base_score)
+    mean(a_mean_df_shuffle$value > base_score), numeric(1))
+}
+### number of baseline scores per residual type with an empirical p-value below 0.05, 0.01 and 0.001
+tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.05))),
+          pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.01))),
+          pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.001))))
+
+gridExtra::grid.table(t(tab))
+dev.off()
+### same counts divided by 180 and rounded; NOTE(review): 180 is presumably the number of rows per residual type - confirm (or use nrow(x))
+tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.05)/180,2))),
+          pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.01)/180,2))),
+          pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.001)/180,2))))
+gridExtra::grid.table(t(tab))
+
+thr = 0.01
+### flag the baseline scores whose empirical p-value is below the threshold (plain loop instead of sapply + `<<-`)
+for (i in seq_along(fcat_mean_base_split)) {
+  fcat_mean_base_split[[i]]$sig <- fcat_mean_base_split[[i]]$pval < thr
+}
+
+### rows: covariate levels (column X); columns: residual types; entries: number of significant factors
+AvgFacSig_df_model = sapply(fcat_mean_base_split, function(a_model_imp_df){
+  sig_by_level = split(a_model_imp_df$sig, a_model_imp_df$X)
+  vapply(sig_by_level, sum, integer(1))
+})
+
+colnames(AvgFacSig_df_model) = names(fcat_mean_base_split) 
+AvgFacSig_df_model_m = melt(AvgFacSig_df_model)
+head(AvgFacSig_df_model_m)
+
+### distribution of the per-level counts for each residual type; the dashed line marks a count of 1
+ggplot(AvgFacSig_df_model_m, aes(y=value,x=Var2))+geom_boxplot()+
+  theme_classic()+scale_fill_brewer(palette = 'Set1')+
+  coord_flip()+theme(text = element_text(size=17))+xlab('')+
+  ylab('Average #sig matched factors per covariate level')+
+  geom_hline(yintercept=1, color = "red", size=1, linetype="dashed")+
+  ggtitle(paste0('pvalue threshold=',thr))
+
diff --git a/example_healthyHumanKidney.py b/example_healthyHumanKidney.py
@@ -121,6 +121,11 @@
 pca_scores_varimax_df_merged.to_csv('~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv')
 varimax_loading_df.to_csv('~/sciFA/Results/varimax_loading_df_kidneyMap.csv')
 
+### reload the merged varimax score table from ~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv
+pca_scores_varimax_df_merged = pd.read_csv('~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv')
+#f1_index = pca_scores_varimax_df_merged.columns.get_loc('F1')
+factor_scores = pca_scores_varimax_df_merged.iloc[:, -NUM_COMPONENTS:]  # keeps the last NUM_COMPONENTS columns; assumes those are the factor-score columns - TODO confirm
+factor_scores = factor_scores.values
 
 ########################
 ######## PCA factors
diff --git a/example_healthyHumanLiver.py b/example_healthyHumanLiver.py
@@ -207,8 +207,8 @@
     factor_scores_df[col] = data.obs[col].values
 ### add rownames of data.obs to the factor_scores_df
 factor_scores_df['id'] = data.obs.index.values
-factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
-pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
+#factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
+#pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
 
 
 
@@ -224,7 +224,6 @@
 ## Specificity
 simpson_fcat = met.simpson_diversity_index(fcat)
 
-
 ### label dependent factor metrics
 asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
 asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
@@ -241,4 +240,40 @@
 vis.plot_FIST(fist, title='Scaled metrics for all the factors')
 ### subset the first 15 factors of fist dataframe
 vis.plot_FIST(fist.iloc[0:15,:])
+### plot only factors F10, F19, F26, F28, F30, i.e. rows 9, 18, 25, 27, 29 of fist (0-based positions)
+vis.plot_FIST(fist.iloc[[9, 18, 25, 27, 29],:], 
+              x_axis_label=['F10', 'F19', 'F26', 'F28', 'F30'])
 vis.plot_FIST(fist.iloc[matched_factor_index,:])
+
+
+
+################################################################
+########  Creating the FIS table for a subset of factors ########
+################################################################
+#### Bimodality scores
+### subset factor scores to include factors F10, F19, F26, F28, F30 (0-based columns 9, 18, 25, 27, 29)
+selected_factors = [9, 18, 25, 27, 29]
+factor_scores_subset = factor_scores[:,selected_factors]  # NOTE(review): never used below - every metric is still computed on the full factor_scores; confirm whether the subset was intended
+silhouette_score = met.kmeans_bimodal_score(factor_scores, time_eff=True)
+bimodality_index = met.bimodality_index(factor_scores)
+bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
+bimodality_score = bimodality_index  # overrides the averaged score from the previous line; only bimodality_index ends up in the table
+#### Effect size
+factor_variance = met.factor_variance(factor_scores)
+
+## Specificity
+simpson_fcat = met.simpson_diversity_index(fcat)
+
+### label dependent factor metrics
+asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
+asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
+
+
+########### create factor-interpretability score table (FIST) ######
+metrics_dict = {'Bimodality':bimodality_score, 
+                    'Specificity':simpson_fcat,
+                    'Effect size': factor_variance,
+                    'Homogeneity (cell type)':asv_cell_type,
+                    'Homogeneity (sample)':asv_sample}
+
+fist = met.FIST(metrics_dict)
diff --git a/pathway_analysis.R b/pathway_analysis.R
@@ -157,4 +157,42 @@ ggplot(enrich_res_pos, aes(y=term_name,x=log_p))+geom_bar(stat = 'identity',fill
 varimax_df = read.csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv')
 genes = read.csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv')
 df = data.frame(genes= genes$X0,factor=factor_loading$X28)
-varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
+varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
+
+
+
+##### Figure-1 example figure: ten simulated gene loadings, decreasing from ~1 to ~0.1 with small random noise
+varimax_loading_vis = data.frame(genes=paste0('Gene ',1:10),factor=10:1/10+rnorm(n=10,mean = 0,sd = 0.02))
+
+varimax_loading_vis$genes <- factor(varimax_loading_vis$genes, levels=varimax_loading_vis$genes)
+ggplot(varimax_loading_vis,aes(x=genes, y=factor, color=factor))+geom_point(size=6,alpha=1)+theme_bw()+
+  theme(axis.text.x = element_text(color = "grey20", size = 22, angle = 90, hjust = .5, vjust = .5, face = "plain"),
+        axis.text.y = element_text(color = "grey20", size = 12, angle = 0, hjust = 1, vjust = 0, face = "plain"),  
+        axis.title.x = element_text(color = "grey20", size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain"),
+        axis.title.y = element_text(color = "grey20", size = 25, angle = 90, hjust = .5, vjust = .5, face = "plain"),
+        legend.text = element_text(hjust = 1,angle = 0),
+        legend.position="left", legend.direction="vertical")+
+  ### a single colour scale: an earlier scale_color_gradient() would only be replaced by this one (with a message)
+  #scale_colour_gradientn(colours=c("red", "blue"))+
+  scale_color_gradient2(name='',midpoint = 0, low = "deepskyblue2", mid = "white",
+                        high = "midnightblue", space = "Lab" )+
+  ylab('Factor\nLoading')+xlab('')
+
+
+
+enrich_res_pos = data.frame(term_name=paste0('Pathway ',1:8),log_p=8:1/8+rnorm(n=8,mean = 0,sd = 0.05))
+### boost the top four pathways; the score column is log_p (this data frame has no 'factor' column)
+enrich_res_pos$log_p[1:4] = enrich_res_pos$log_p[1:4]+0.6
+
+enrich_res_pos$term_name <- factor(enrich_res_pos$term_name, 
+                                   levels =  rev(enrich_res_pos$term_name))
+
+title = ''#'stim'#'Male'
+ggplot(enrich_res_pos, aes(y=term_name,x=log_p))+geom_bar(stat = 'identity',fill='lightskyblue3',color='grey3')+xlab('-log(p value)')+
+  theme_classic()+ylab('')+ggtitle(title)+
+  scale_fill_manual(values = c('grey80'))+
+  theme(axis.text.x = element_text(color = "grey20", size = 13, angle = 0, hjust = .5, vjust = .5, face = "plain"),
+        axis.text.y = element_text(color = "grey20", size = 18, angle = 0, hjust = 1, vjust = 0, face = "plain"),  
+        axis.title.x = element_text(color = "grey20", size = 23, angle = 0, hjust = .5, vjust = 0, face = "plain"),
+        axis.title.y = element_text(color = "grey20", size = 17, angle = 90, hjust = .5, vjust = .5, face = "plain"))
+
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-VERSION = '1.2.0' 
+VERSION = '1.2.1' 
 DESCRIPTION = 'single cell interpretable Residual Decomposition'
 LONG_DESCRIPTION = "sciRED is a Python package designed to improve the interpretation of single-cell RNA sequencing data, specifically focusing on signal extraction via factor decomposition. It simplifies the process by removing confounding effects, mapping factors to covariates, identifying unexplained factors, and annotating genes and biological processes. Applying sciRED to various scRNA-seq datasets can unveil diverse biological signals, such as health/disease variation, cell-type identity, sex/age differences, stimulation signals, and rare cell type signatures."