Skip to content

Commit 1ad7529

Browse files
committed
minor additions to analysis files
1 parent 64797b9 commit 1ad7529

File tree

5 files changed

+272
-5
lines changed

5 files changed

+272
-5
lines changed

Benchmark_scMix_estPval.R

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
library(ggplot2)
2+
library(ggpubr)
3+
library(reshape2)
4+
5+
scale_minMax <- function(x){
6+
x_min = min(x)
7+
x_max = max(x)
8+
scaled = (x-x_min)/(x_max-x_min)
9+
return(scaled)
10+
}
11+
12+
scale_Max <- function(x){
13+
x_max = max(x)
14+
scaled = (x)/(x_max)
15+
return(scaled)
16+
}
17+
18+
19+
add_emp_pvalue <- function(fcat_df, a_model){
20+
### input: dataframe of merged fcat scores for shuffled and baseline fca scores.
21+
### counts the number of observations in the empirical null distribution which are higher than the given fca score (fca_emp_h)
22+
### calculates the empirical p-value by dividing the fca_emp_h by the total number of null dist observarions
23+
fcat_df_shuffle= fcat_df[fcat_df$type == 'shuffle',]
24+
null_empirical_dist = fcat_df_shuffle$importance[fcat_df_shuffle$model==a_model]
25+
26+
model_fcat_base = fcat_df[fcat_df$type == 'baseline' & fcat_df$model == a_model,]
27+
model_fcat_base$pvalue = sapply(1:nrow(model_fcat_base),
28+
function(i) sum(null_empirical_dist>model_fcat_base$importance[i])/length(null_empirical_dist),
29+
simplify = T)
30+
return(model_fcat_base)
31+
}
32+
33+
34+
fcat_single_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_single_baseline.csv')
35+
fcat_single_base$type = 'baseline'
36+
37+
file = '/home/delaram/sciRED/benchmark/scMix/shuffle/single/'
38+
fcat_single_list = lapply(list.files(file, pattern = "fcat_scMix*", full.names = T), read.csv)
39+
fcat_single_shuffle <- Reduce(rbind,fcat_single_list)
40+
fcat_single_shuffle$type = 'shuffle'
41+
head(fcat_single_shuffle)
42+
43+
fcat_single = rbind(fcat_single_base, fcat_single_shuffle)
44+
fcat_single$importance_abs = abs(fcat_single$importance)
45+
46+
ggplot(fcat_single, aes(x=model, y=importance, fill=type))+
47+
geom_boxplot()+theme_classic()+
48+
coord_flip()+scale_fill_manual(values=c("#999999", "maroon"))
49+
50+
fcat_models<- split(fcat_single, fcat_single$model)
51+
#### scaling various classifier scores
52+
sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_scale <<- scale(fcat_models[[i]]$importance, center = FALSE)}, simplify = F)
53+
sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_z_trans <<- scale(fcat_models[[i]]$importance)}, simplify = F)
54+
sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_minmax <<- scale_minMax(fcat_models[[i]]$importance)}, simplify = F)
55+
sapply(1:length(fcat_models), function(i) {fcat_models[[i]]$imp_max_scale <<- scale_Max(fcat_models[[i]]$importance)}, simplify = F)
56+
57+
58+
###### Figure B for the benchmark panel
59+
fcat_models_df = Reduce(rbind, fcat_models)
60+
ggplot(fcat_models_df, aes(x=model, y=importance, fill=type))+geom_boxplot()+
61+
theme_classic()+coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+
62+
theme(text = element_text(size=18))+xlab('')
63+
64+
ggplot(fcat_models_df, aes(x=model, y=imp_minmax, fill=type))+geom_boxplot()+theme_classic()+
65+
coord_flip()+scale_fill_manual(values=c("#56B4E9", "maroon"))+
66+
theme(text = element_text(size=18))+xlab('')+ylab('Importance score (min-max scaled)')
67+
68+
69+
########### sanity check ###########
70+
fcat_models_df_base= fcat_models_df[fcat_models_df$type == 'baseline',]
71+
fcat_models_df_shuffle = fcat_models_df[fcat_models_df$type == 'shuffle',]
72+
73+
model_names = names(table(fcat_models_df_shuffle$model))
74+
ggplot(fcat_models_df_shuffle, aes(x=importance, fill=model))+
75+
geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1')
76+
77+
ggplot(fcat_models_df_shuffle, aes(x=imp_minmax, fill=model))+
78+
geom_histogram(alpha=0.5,color='black',bins=100)+theme_classic()+scale_fill_brewer(palette = 'Set1')
79+
80+
a_model = "RandomForest"
81+
model_imp_shuffle_values = fcat_models_df_shuffle$importance[fcat_models_df_shuffle$model==a_model]
82+
ggplot(fcat_models_df_shuffle, aes(x=importance))+geom_histogram( bins=200,fill='grey')+
83+
theme_classic()+ggtitle(a_model)+theme(text = element_text(size=18))+xlab('FCA scores for a single model')+
84+
ylab("Frequency")+geom_vline(xintercept=0.09, color = "red", size=1, linetype="dashed")
85+
86+
87+
cor_df = data.frame(imp=fcat_models_df_base$importance, model=fcat_pvalue_df_base$model)
88+
cor_df_models<- split(cor_df, cor_df$model)
89+
sapply(1:length(cor_df_models), function(i) colnames(cor_df_models[[i]])[1]<<-names(cor_df_models)[i])
90+
cor_df_merged = Reduce(cbind, cor_df_models)
91+
cor_df_merged <- cor_df_merged[,colnames(cor_df_merged) %in% names(cor_df_models)]
92+
cor_mat = cor(cor_df_merged)
93+
pheatmap::pheatmap(cor_mat, display_numbers = TRUE)
94+
########### ########### ###########
95+
96+
########### calculating empirical p-values
97+
fcat_pvalue_list = sapply(1:length(model_names), function(i){add_emp_pvalue(fcat_models_df, model_names[i])}, simplify = F)
98+
names(fcat_pvalue_list) = model_names
99+
ggplot(fcat_pvalue_list$DecisionTree, aes(x=pvalue))+geom_histogram(alpha=0.8, bins=100)+theme_classic()+ggtitle(a_model)
100+
101+
fcat_pvalue_df = Reduce(rbind, fcat_pvalue_list)
102+
head(fcat_pvalue_df)
103+
104+
ggplot(fcat_pvalue_df, aes(x=model, y=pvalue, fill=model))+geom_boxplot(alpha=0.7)+
105+
theme_classic()+scale_fill_brewer(palette = 'Set1')+coord_flip()
106+
107+
ggplot(fcat_pvalue_df, aes(x=pvalue, fill=model))+
108+
geom_density(alpha=0.5)+theme_classic()+scale_fill_brewer(palette = 'Set1')
109+
110+
111+
sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='XGB'] < 0.05)
112+
sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='RandomForest'] < 0.05)
113+
sum(fcat_pvalue_df$pvalue[fcat_pvalue_df$model=='DecisionTree'] < 0.05)
114+
115+
116+
###############################################################################################
117+
########################## importance evaluation for model comparison
118+
################################################################################################
119+
120+
fcat_mean_base = read.csv('/home/delaram/sciRED/benchmark/scMix/baseline/fcat_scMix_mean_baseline.csv')
121+
fcat_mean_base$type = 'baseline'
122+
123+
file = '/home/delaram/sciRED/benchmark/scMix/shuffle/mean/'
124+
fcat_mean_list = lapply(list.files(file, pattern = "fcat_scMix_mean*", full.names = T), read.csv)
125+
fcat_mean_shuffle <- Reduce(rbind,fcat_single_list)
126+
fcat_mean_shuffle$type = 'shuffle'
127+
head(fcat_mean_shuffle)
128+
129+
fcat_mean = rbind(fcat_mean_base, fcat_mean_shuffle)
130+
fcat_mean_m = melt(fcat_mean)
131+
ggplot(fcat_mean_m, aes(y=value, x=type, fill=type))+geom_boxplot()+coord_flip()+ylab('Mean fcat')
132+
133+
fcat_mean_base_m = melt(fcat_mean_base)
134+
fcat_mean_shuffle_m = melt(fcat_mean_shuffle)
135+
136+
fcat_mean_base_df = data.frame(cov_level=fcat_mean_base_m$X,
137+
factor=fcat_mean_base_m$variable,
138+
imp_score=fcat_mean_base_m$value,
139+
res=fcat_mean_base_m$residual_type)
140+
head(fcat_mean_base_df)
141+
142+
143+
fcat_mean_shuffle_split <- split(fcat_mean_shuffle_m, fcat_mean_shuffle_m$residual_type)
144+
fcat_mean_base_split <- split(fcat_mean_base_m, fcat_mean_base_m$residual_type)
145+
146+
### this loop is helpful in cases were we try various residuals
147+
for(i in 1:length(fcat_mean_shuffle_split)){
148+
a_mean_df_shuffle = fcat_mean_shuffle_split[[i]]
149+
a_mean_df_base = fcat_mean_base_split[[i]]
150+
fcat_mean_base_split[[i]]$pval = sapply(1:nrow(a_mean_df_base), function(i)
151+
sum(a_mean_df_shuffle$value>a_mean_df_base$value[i])/nrow(a_mean_df_shuffle))
152+
}
153+
154+
tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.05))),
155+
pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.01))),
156+
pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) sum(x$pval<0.001))))
157+
158+
gridExtra::grid.table(t(tab))
159+
dev.off()
160+
161+
tab=rbind(pval_0.05=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.05)/180,2))),
162+
pval_0.01=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.01)/180,2))),
163+
pval_0.001=data.frame(lapply(fcat_mean_base_split, function(x) round(sum(x$pval<0.001)/180,2))))
164+
gridExtra::grid.table(t(tab))
165+
166+
thr = 0.01
167+
sapply(1:length(fcat_mean_base_split), function(i) {fcat_mean_base_split[[i]]$sig <<- fcat_mean_base_split[[i]]$pval < thr})
168+
169+
AvgFacSig_df_model = sapply(1:length(fcat_mean_base_split), function(i){
170+
a_model_imp_df = fcat_mean_base_split[[i]]
171+
a_model_imp_df_cov = split(a_model_imp_df, a_model_imp_df$X)
172+
AvgFacSig = sapply(1:length(a_model_imp_df_cov), function(i){
173+
sum(a_model_imp_df_cov[[i]]$sig)
174+
})
175+
names(AvgFacSig) = names(a_model_imp_df_cov)
176+
return(AvgFacSig)
177+
}, simplify = T)
178+
179+
colnames(AvgFacSig_df_model) = names(fcat_mean_base_split)
180+
AvgFacSig_df_model_m = melt(AvgFacSig_df_model)
181+
head(AvgFacSig_df_model_m)
182+
183+
ggplot(AvgFacSig_df_model_m, aes(y=value,x=Var2))+geom_boxplot()+
184+
theme_classic()+scale_fill_brewer(palette = 'Set1')+
185+
coord_flip()+theme(text = element_text(size=17))+xlab('')+
186+
ylab('Average #sig matched factors per covariate level')+
187+
geom_hline(yintercept=1, color = "red", size=1, linetype="dashed")+
188+
ggtitle(paste0('pvalue threshold=',thr))
189+

example_healthyHumanKidney.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,11 @@
121121
pca_scores_varimax_df_merged.to_csv('~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv')
122122
varimax_loading_df.to_csv('~/sciFA/Results/varimax_loading_df_kidneyMap.csv')
123123

124+
### read ~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv
125+
pca_scores_varimax_df_merged = pd.read_csv('~/sciFA/Results/pca_scores_varimax_df_merged_kidneyMap.csv')
126+
#f1_index = pca_scores_varimax_df_merged.columns.get_loc('F1')
127+
factor_scores = pca_scores_varimax_df_merged.iloc[:, -NUM_COMPONENTS:]
128+
factor_scores = factor_scores.values
124129

125130
########################
126131
######## PCA factors

example_healthyHumanLiver.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,8 @@
207207
factor_scores_df[col] = data.obs[col].values
208208
### add rownames of data.obs to the factor_scores_df
209209
factor_scores_df['id'] = data.obs.index.values
210-
factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
211-
pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
210+
#factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
211+
#pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
212212

213213

214214

@@ -224,7 +224,6 @@
224224
## Specificity
225225
simpson_fcat = met.simpson_diversity_index(fcat)
226226

227-
228227
### label dependent factor metrics
229228
asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
230229
asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
@@ -241,4 +240,40 @@
241240
vis.plot_FIST(fist, title='Scaled metrics for all the factors')
242241
### subset the first 15 factors of fist dataframe
243242
vis.plot_FIST(fist.iloc[0:15,:])
243+
### include factors F10, F19, F26, F28, F30
244+
vis.plot_FIST(fist.iloc[[9, 18, 25, 27, 29],:],
245+
x_axis_label=['F10', 'F19', 'F26', 'F28', 'F30'])
244246
vis.plot_FIST(fist.iloc[matched_factor_index,:])
247+
248+
249+
250+
################################################################
251+
######## Creating the FIS table for a subset of factors ########
252+
################################################################
253+
#### Bimodality scores
254+
### subset factor scores to include factors F10, F19, F26, F28, F30
255+
selected_factors = [9, 18, 25, 27, 29]
256+
factor_scores_subset = factor_scores[:,selected_factors]
257+
silhouette_score = met.kmeans_bimodal_score(factor_scores, time_eff=True)
258+
bimodality_index = met.bimodality_index(factor_scores)
259+
bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
260+
bimodality_score = bimodality_index
261+
#### Effect size
262+
factor_variance = met.factor_variance(factor_scores)
263+
264+
## Specificity
265+
simpson_fcat = met.simpson_diversity_index(fcat)
266+
267+
### label dependent factor metrics
268+
asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
269+
asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
270+
271+
272+
########### create factor-interpretibility score table (FIST) ######
273+
metrics_dict = {'Bimodality':bimodality_score,
274+
'Specificity':simpson_fcat,
275+
'Effect size': factor_variance,
276+
'Homogeneity (cell type)':asv_cell_type,
277+
'Homogeneity (sample)':asv_sample}
278+
279+
fist = met.FIST(metrics_dict)

pathway_analysis.R

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,4 +157,42 @@ ggplot(enrich_res_pos, aes(y=term_name,x=log_p))+geom_bar(stat = 'identity',fill
157157
varimax_df = read.csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv')
158158
genes = read.csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv')
159159
df = data.frame(genes= genes$X0,factor=factor_loading$X28)
160-
varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
160+
varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
161+
162+
163+
164+
##### Figure-1 example figure
165+
varimax_loading_vis = data.frame(genes=paste0('Gene ',1:10),factor=10:1/10+rnorm(n=10,mean = 0,sd = 0.02))
166+
167+
varimax_loading_vis$genes <- factor(varimax_loading_vis$genes, levels=varimax_loading_vis$genes)
168+
ggplot(varimax_loading_vis,aes(x=genes, y=factor, color=factor))+geom_point(size=6,alpha=1.2)+theme_bw()+
169+
theme(axis.text.x = element_text(color = "grey20", size = 22, angle = 90, hjust = .5, vjust = .5, face = "plain"),
170+
axis.text.y = element_text(color = "grey20", size = 12, angle = 0, hjust = 1, vjust = 0, face = "plain"),
171+
axis.title.x = element_text(color = "grey20", size = 14, angle = 0, hjust = .5, vjust = 0, face = "plain"),
172+
axis.title.y = element_text(color = "grey20", size = 25, angle = 90, hjust = .5, vjust = .5, face = "plain"),
173+
legend.text = element_text(hjust = 1,angle = 0),
174+
legend.position="left", legend.direction="vertical")+
175+
scale_color_gradient(name='Factor\nLoading')+
176+
#scale_colour_gradientn(colours=c("red", "blue"))+
177+
scale_color_gradient2(name='',midpoint = 0, low = "deepskyblue2", mid = "white",
178+
high = "midnightblue", space = "Lab" )+
179+
ylab('Factor\nLoading')+xlab('')
180+
181+
182+
183+
enrich_res_pos = data.frame(term_name=paste0('Pathway ',1:8),log_p=8:1/8+rnorm(n=8,mean = 0,sd = 0.05))
184+
enrich_res_pos$factor[1:4] = enrich_res_pos$factor[1:4]+0.6
185+
186+
enrich_res_pos$term_name <- factor(enrich_res_pos$term_name,
187+
levels = enrich_res_pos$term_name[length(enrich_res_pos$term_name):1])
188+
189+
190+
title = ''#'stim'#'Male'
191+
ggplot(enrich_res_pos, aes(y=term_name,x=log_p))+geom_bar(stat = 'identity',fill='lightskyblue3',color='grey3')+xlab('-log(p value)')+
192+
theme_classic()+ylab('')+ggtitle(title)+
193+
scale_fill_manual(values = c('grey80'))+
194+
theme(axis.text.x = element_text(color = "grey20", size = 13, angle = 0, hjust = .5, vjust = .5, face = "plain"),
195+
axis.text.y = element_text(color = "grey20", size = 18, angle = 0, hjust = 1, vjust = 0, face = "plain"),
196+
axis.title.x = element_text(color = "grey20", size = 23, angle = 0, hjust = .5, vjust = 0, face = "plain"),
197+
axis.title.y = element_text(color = "grey20", size = 17, angle = 90, hjust = .5, vjust = .5, face = "plain"))
198+

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from setuptools import setup, find_packages
22

3-
VERSION = '1.2.0'
3+
VERSION = '1.2.1'
44
DESCRIPTION = 'single cell interpretable Residual Decomposition'
55
LONG_DESCRIPTION = "sciRED is a Python package designed to improve the interpretation of single-cell RNA sequencing data, specifically focusing on signal extraction via factor decomposition. It simplifies the process by removing confounding effects, mapping factors to covariates, identifying unexplained factors, and annotating genes and biological processes. Applying sciRED to various scRNA-seq datasets can unveil diverse biological signals, such as health/disease variation, cell-type identity, sex/age differences, stimulation signals, and rare cell type signatures."
66

0 commit comments

Comments
 (0)