Skip to content

Commit 90a3e7d

Browse files
committed
scripts added for review analysis
1 parent b1b4f0d commit 90a3e7d

39 files changed

+6514
-28
lines changed

data_prep/data_humanLiver.R

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,53 @@ sum_MT_conuts = colSums(GetAssayData(HumanLiverSeurat, layer = 'counts')[mt_indi
1313
sum_counts = colSums(GetAssayData(HumanLiverSeurat, layer = 'counts'))
1414
HumanLiverSeurat[["percent.mt"]] = sum_MT_conuts/sum_counts
1515

16+
meta_data = HumanLiverSeurat@meta.data
17+
pca_df = Embeddings(HumanLiverSeurat, 'pca')
18+
19+
20+
HumanLiverSeurat <- RunUMAP(HumanLiverSeurat, dims = 1:30, reduction = "pca")
21+
umap_df = Embeddings(HumanLiverSeurat, 'umap')
22+
head(HumanLiverSeurat)
23+
head(umap_df)
24+
sum(colnames(HumanLiverSeurat) != rownames(umap_df))
25+
umap_df2 = cbind(umap_df, HumanLiverSeurat@meta.data)
26+
dim(umap_df2)
27+
dim(umap_df)
28+
29+
30+
library(RColorBrewer)
31+
num_colors = length(names(table(umap_df2$cell_type)))
32+
color_palette <- brewer.pal(n = , name = "Set3")
33+
34+
# Generate a 20-color palette from Set3 (or you can use another palette like Paired)
35+
colors_20 <- brewer.pal(12, "Set3") # Max 12 colors for Set3, so combine palettes
36+
colors_20 <- c(colors_20, brewer.pal(8, "Dark2")) # Combine with another palette
37+
38+
# Visualize
39+
barplot(rep(1, 20), col = colors_20, border = NA)
40+
41+
my_colors = c('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
42+
'#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22')
43+
ggplot(umap_df2, aes(x=umap_1, y=umap_2, color=cell_type))+
44+
geom_point(size = 1.4, alpha = 0.8) + # Adjust point size and transparency
45+
#scale_color_brewer(palette = "Set1") + # Use the Set3 color palette
46+
scale_color_manual(values = colors_20) +
47+
theme_classic() + # Use a clean thmy_coloeme
48+
theme(
49+
axis.text.x = element_text(size = 12), # Make x-axis labels readable
50+
axis.text.y = element_text(size = 12), # Make y-axis labels readable
51+
axis.title.x = element_text(size = 14), # Make x-axis title readable
52+
axis.title.y = element_text(size = 14), # Make y-axis title readable
53+
legend.title = element_text(size = 13), # Make legend title readable
54+
legend.text = element_text(size = 11) # Make legend text readable
55+
) +
56+
labs(
57+
x = "UMAP Dimension 1", # Label for x-axis
58+
y = "UMAP Dimension 2", # Label for y-axis
59+
color = 'Cell type',#"Sample", # Legend title
60+
#title = "UMAP embedding of unintegrated data" # Title
61+
)
62+
1663
################################################################
1764
################# DO NOT RUN again #####################
1865

@@ -29,15 +76,15 @@ annotations=c('Hep1','abT cell','Hep2','infMac','Hep3','Hep4','plasma cell',
2976
'hepatic stellate cell')
3077

3178
label_df = data.frame(cluster=paste0('cluster_',1:20),labels=annotations)
32-
Idents(seur) = paste0('cluster_', as.character(seur$res.0.8))
33-
human_liver_annot = data.frame(umi=colnames(seur), cluster=Idents(seur))
79+
Idents(HumanLiverSeurat) = paste0('cluster_', as.character(HumanLiverSeurat$res.0.8))
80+
human_liver_annot = data.frame(umi=colnames(HumanLiverSeurat), cluster=Idents(HumanLiverSeurat))
3481
human_liver_annot = merge(human_liver_annot, label_df, by.x='cluster', by.y='cluster', all.x=T, sort=F)
3582

36-
human_liver_annot_sorted <- human_liver_annot[match(colnames(seur), human_liver_annot$umi),]
37-
sum(human_liver_annot_sorted$umi != colnames(seur))
38-
seur$cell_type = human_liver_annot_sorted$labels
83+
human_liver_annot_sorted <- human_liver_annot[match(colnames(HumanLiverSeurat), human_liver_annot$umi),]
84+
sum(human_liver_annot_sorted$umi != colnames(HumanLiverSeurat))
85+
HumanLiverSeurat$cell_type = human_liver_annot_sorted$labels
3986

40-
seur$sample = unlist(lapply(strsplit(colnames(seur), '_'), '[[', 1))
87+
HumanLiverSeurat$sample = unlist(lapply(strsplit(colnames(HumanLiverSeurat), '_'), '[[', 1))
4188

4289
SaveH5Seurat(seur, filename ='~/sciFA/Data/HumanLiverAtlas.h5Seurat' ,overwrite = TRUE)
4390
Convert('~/sciFA/Data/HumanLiverAtlas.h5Seurat', dest = "h5ad")
@@ -225,7 +272,9 @@ ggplot(tsne_df_merged_2, aes(umap_1,umap_2,color=factor_28))+geom_point(alpha=0.
225272
###########################################################################
226273
factor_loading = read.csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv')
227274
genes = read.csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv')
228-
df = data.frame(genes= genes$X0,factor=factor_loading$X17)
275+
colnames(factor_loading) = paste0('F', 1:ncol(factor_loading))
276+
df = data.frame(genes= genes$X0,factor=factor_loading$F29)
277+
229278
varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
230279
varimax_loading_vis = head(varimax_loading_df_ord, 20)
231280
varimax_loading_vis$genes

data_prep/data_scMixology.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ set.seed(5252)
1717
############### Merging the 3cl data to be imported to python #############
1818
###########################################################################
1919

20-
load("~/sciFA/data/sincell_with_class.RData") ## 3 cell line data
20+
load("~/scLMM/sc_mixology/data//sincell_with_class.RData") ## 3 cell line data
2121
#sce10x_qc contains the read counts after quality control processing from the 10x platform.
2222
#sce4_qc contains the read counts after quality control processing from the CEL-seq2 platform.
2323
#scedrop_qc_qc contains the read counts after quality control proessing from the Drop-seq platform.

data_prep/data_stimPBMC.R

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,196 @@ SaveH5Seurat(Kang18_8vs8_seur, filename = "~/sciFA/Data/PBMC_Lupus_Kang8vs8_data
3333
Convert("~/sciFA/Data/PBMC_Lupus_Kang8vs8_data.h5Seurat", dest = "h5ad")
3434

3535

36+
37+
38+
###########################################################################################
39+
#######################. evaluate correlation with QC parameters. ########################
40+
###########################################################################################
41+
Kang18_8vs8_seur <- readRDS("~/scLMM/LMM-scRNAseq//Data/PBMC_Lupus_Kang8vs8_data_counts.rds")
42+
factor_df = read.csv('~/sciFA/Results/pca_scores_varimax_df_merged_lupusPBMC.csv')
43+
head(factor_df)
44+
##### calculating the mt percentage
45+
#HumanLiverSeurat[["percent.mt"]] <- PercentageFeatureSet(HumanLiverSeurat, pattern = "^MT-")
46+
mt_indices = grep('^MT-',rownames(Kang18_8vs8_seur))
47+
sum_MT_conuts = colSums(GetAssayData(Kang18_8vs8_seur, layer = 'counts')[mt_indices,])
48+
sum_counts = colSums(GetAssayData(Kang18_8vs8_seur, layer = 'counts'))
49+
Kang18_8vs8_seur[["percent.mt"]] = sum_MT_conuts/sum_counts
50+
51+
sum(colnames(Kang18_8vs8_seur)!=factor_df$X)
52+
factor_df$percent.mt = Kang18_8vs8_seur[["percent.mt"]]
53+
54+
qc_columns = c('nCount_originalexp', 'nFeature_originalexp' )
55+
factor_cols = paste0('F', c(1, 3, 4, 8, 13, 15, 16, 17, 19, 21:24, 26:30 ))
56+
to_keep_cols = c(factor_cols, qc_columns)
57+
factor_df_sub = factor_df[,colnames(factor_df) %in% to_keep_cols]
58+
59+
cor_mat = cor(factor_df_sub)[qc_columns, factor_cols]
60+
61+
62+
library(pheatmap)
63+
# make the color pallete
64+
clrsp <- colorRampPalette(c("darkgreen", "white", "purple"))
65+
clrs <- clrsp(200)
66+
breaks1 <- seq(-1, 1, length.out = 200)
67+
rownames(cor_mat)[1:2] = c('Total Counts', 'Total Features')
68+
cor_mat.t = t(cor_mat)
69+
pheatmap(cor_mat.t, cluster_cols = F, breaks = breaks1, color = clrs, display_numbers = T,
70+
cluster_rows = F, fontsize_row = 11, fontsize_col = 12)
71+
72+
73+
74+
factor_number=25
75+
factor_df2 = factor_df
76+
factor_df2 = factor_df2[!is.na(factor_df2$cell),]
77+
#factor_df2 = factor_df2[factor_df2$F22<(20),]
78+
79+
80+
ggplot(factor_df2, aes(y = F29, x = cell,fill = cell)) +
81+
geom_boxplot() +
82+
scale_fill_manual(values = color_palette) +
83+
theme_classic() + # Use a clean theme
84+
theme(
85+
axis.text.x = element_text(size = 12),
86+
axis.text.y = element_text(size = 12),
87+
axis.title.x = element_text(size = 14),
88+
axis.title.y = element_text(size = 14),
89+
legend.title = element_text(size = 13),
90+
legend.text = element_text(size = 11),
91+
) + coord_flip()+
92+
labs(
93+
x = "", # Label for x-axis
94+
y = paste0("F",factor_number," Score"), # Label for y-axis
95+
color = "Cell Type", # Legend title
96+
#title = paste0("Scatter Plot of F1 vs F", factor_number, " Colored by cell type")
97+
)
98+
99+
100+
101+
102+
###################################################################
103+
104+
library(gprofiler2)
105+
106+
get_gprofiler_enrich <- function(markers, model_animal_name){
107+
gostres <- gost(query = markers,
108+
ordered_query = TRUE, exclude_iea =TRUE,
109+
sources=c('GO:BP' ,'REAC'),
110+
organism = model_animal_name)
111+
return(gostres)
112+
}
113+
114+
factor_loading = read.csv('~/sciFA/Results/varimax_loading_df_lupusPBMC.csv')
115+
116+
factor_i = 3
117+
df = data.frame(gene = factor_loading$X, factor = factor_loading[[paste0('F',factor_i)]])
118+
df$gene <- sapply(strsplit(as.character(df$gene), "-"), `[`, 1)
119+
df$gene <- sub("-EN.*", "", df$gene)
120+
# Select top and bottom 20 genes
121+
varimax_loading_vis = rbind(head(df[order(df$factor, decreasing = TRUE), ], 20),
122+
tail(df[order(df$factor, decreasing = TRUE), ], 20))
123+
124+
# Factor levels based on the ordering of genes
125+
varimax_loading_vis$gene <- factor(varimax_loading_vis$gene,
126+
levels = varimax_loading_vis$gene)
127+
128+
# Plot with vertical orientation using coord_flip()
129+
ggplot(varimax_loading_vis, aes(x = gene, y = factor, color = factor)) +
130+
geom_point(size = 2, alpha = 1) +
131+
theme_bw() +
132+
theme(
133+
axis.text.x = element_text(color = "grey20", size = 12, angle=-90), # Adjust font size for horizontal axis
134+
axis.text.y = element_text(color = "grey20", size = 11.5, hjust = 1, vjust = 0.5),
135+
axis.title.x = element_text(color = "grey20", size = 14),
136+
axis.title.y = element_text(color = "grey20", size = 14),
137+
legend.text = element_text(hjust = 1),
138+
legend.position = "left",
139+
legend.direction = "vertical"
140+
) +
141+
# Set gradient colors
142+
scale_color_gradient2(
143+
name = paste0("Factor ", factor_i),
144+
midpoint = 0,
145+
low = "darkgreen", # Color for negative values
146+
mid = "white", # Color for midpoint
147+
high = "darkred", # Color for positive values
148+
space = "Lab"
149+
) +
150+
ylab('Factor loading') +
151+
xlab('') +
152+
ggtitle(paste0("Factor ", factor_i))
153+
#coord_flip() # Flip the coordinates to make the plot vertical
154+
155+
156+
157+
158+
159+
model_animal_name ='hsapiens'
160+
161+
162+
factor_i = 22
163+
df = data.frame(gene = factor_loading$X, factor = factor_loading[[paste0('F',factor_i)]])
164+
#df$gene <- sapply(strsplit(as.character(df$gene), "-"), `[`, 1)
165+
df$gene <- sub("-EN.*", "", df$gene)
166+
167+
168+
df_pos = df[order(df$factor, decreasing = T),]
169+
df_neg = df[order(df$factor, decreasing = F),]
170+
171+
head(df_pos,10)
172+
head(df_neg,10)
173+
num_genes = 200
174+
175+
table_to_vis = df_pos[1:20,]
176+
rownames(table_to_vis) = NULL
177+
colnames(table_to_vis) = c('Gene', 'Score')
178+
table_to_vis$Score = round(table_to_vis$Score, 3)
179+
library(gridExtra)
180+
dev.off()
181+
tt2 <- ttheme_minimal()
182+
gridExtra::grid.table(table_to_vis, theme=tt2)
183+
184+
######## pos enrichment
185+
enrich_res = get_gprofiler_enrich(markers=df_pos$gene[1:num_genes],
186+
model_animal_name = model_animal_name )#'gp__SEA8_T0ld_VHU'
187+
######## neg enrichment
188+
enrich_res = get_gprofiler_enrich(markers=df_neg$gene[1:num_genes],
189+
model_animal_name = model_animal_name )#'gp__SEA8_T0ld_VHU'
190+
head(enrich_res$result,30)
191+
192+
193+
194+
enrich_res_df = data.frame(enrich_res$result)
195+
enrich_res_df$log_p = -log(as.numeric(enrich_res_df$p_value))
196+
enrich_res_df = enrich_res_df[order(enrich_res_df$log_p, decreasing = T),]
197+
#View(enrich_res_df)
198+
199+
num_term_vis = 15
200+
enrich_res_df = enrich_res_df[1:num_term_vis,]
201+
#enrich_res_df = enrich_res_df[c(2, 5,28,29,32,42,46,48,62,65),]
202+
203+
enrich_res_df = enrich_res_df[,colnames(enrich_res_df) %in% c('term_name', 'p_value')]
204+
enrich_res_df$log_p = -log(enrich_res_df$p_value)
205+
title = ''
206+
207+
enrich_res_df$term_name = gsub('metabolic process', 'metabolism',enrich_res_df$term_name)
208+
#enrich_res_df$term_name[9] = "The citric acid (TCA) cycle"
209+
enrich_res_df$term_name <- factor(enrich_res_df$term_name,
210+
levels = enrich_res_df$term_name[length(enrich_res_df$term_name):1])
211+
212+
color = "coral3"
213+
color = "darkseagreen3"
214+
215+
title = ''#'stim'#'Male'
216+
enrich_res_df = enrich_res_df[-7,]
217+
ggplot(enrich_res_df, aes(y=term_name,x=log_p))+
218+
geom_bar(stat = 'identity',fill=color,color='grey10')+
219+
xlab('-log(p value)')+
220+
theme_classic()+ylab('')+ggtitle(title)+
221+
scale_fill_manual(values = c(color))+
222+
theme(axis.text.x = element_text(color = "grey20", size = 13, angle = 0, hjust = .5, vjust = .5, face = "plain"),
223+
axis.text.y = element_text(color = "grey20", size = 12, angle = 0, hjust = 1, vjust = 0, face = "plain"),
224+
axis.title.x = element_text(color = "grey20", size = 17, angle = 0, hjust = .5, vjust = 0, face = "plain"),
225+
axis.title.y = element_text(color = "grey20", size = 17, angle = 90, hjust = .5, vjust = .5, face = "plain"))
226+
227+
228+

example_healthyHumanLiver.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
data = exproc.import_AnnData(data_file_path)
2929
data, gene_idx = proc.get_sub_data(data, num_genes=NUM_GENES) # subset the data to num_genes HVGs
3030

31-
3231
y, genes, num_cells, num_genes = proc.get_data_array(data)
3332

3433
#pd.DataFrame(genes).to_csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv', index=False)
@@ -210,7 +209,12 @@
210209
#factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
211210
#pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
212211

213-
212+
### read the factor scores dataframe
213+
factor_scores_df = pd.read_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv')
214+
### select columns that start with factor_
215+
pattern_cols = [col for col in factor_scores_df.columns if 'factor_' in col]
216+
factor_scores_df_factor = factor_scores_df[pattern_cols]
217+
factor_scores = factor_scores_df_factor.values
214218

215219
####################################
216220
#### Bimodality scores
@@ -240,33 +244,41 @@
240244
vis.plot_FIST(fist, title='Scaled metrics for all the factors')
241245
### subset the first 15 factors of fist dataframe
242246
vis.plot_FIST(fist.iloc[0:15,:])
243-
### include factors F10, F19, F26, F28, F30
244-
vis.plot_FIST(fist.iloc[[9, 18, 25, 27, 29],:],
245-
x_axis_label=['F10', 'F19', 'F26', 'F28', 'F30'])
247+
### include factors F10, F19, F26, F29, F30
248+
vis.plot_FIST(fist.iloc[[9, 18, 25, 28, 29],:],
249+
x_axis_label=['F10', 'F19', 'F26', 'F29', 'F30'])
246250
vis.plot_FIST(fist.iloc[matched_factor_index,:])
247251

248252

249253

254+
### read the factor scores dataframe
255+
factor_scores_df = pd.read_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv')
256+
### select columns that start with factor_
257+
pattern_cols = [col for col in factor_scores_df.columns if 'factor_' in col]
258+
factor_scores_df_factor = factor_scores_df[pattern_cols]
259+
factor_scores = factor_scores_df_factor.values
250260
################################################################
251261
######## Creating the FIS table for a subset of factors ########
252262
################################################################
253263
#### Bimodality scores
254-
### subset factor scores to include factors F10, F19, F26, F28, F30
255-
selected_factors = [9, 18, 25, 27, 29]
264+
### subset factor scores to include factors F10, F19, F26, F29, F30
265+
selected_factors = [9, 18, 25, 27, 28, 29]
256266
factor_scores_subset = factor_scores[:,selected_factors]
257-
silhouette_score = met.kmeans_bimodal_score(factor_scores, time_eff=True)
258-
bimodality_index = met.bimodality_index(factor_scores)
259-
bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
267+
#silhouette_score = met.kmeans_bimodal_score(factor_scores_subset, time_eff=True)
268+
bimodality_index = met.bimodality_index(factor_scores_subset)
269+
#bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
260270
bimodality_score = bimodality_index
261271
#### Effect size
262-
factor_variance = met.factor_variance(factor_scores)
272+
factor_variance = met.factor_variance(factor_scores_subset)
263273

264274
## Specificity
265-
simpson_fcat = met.simpson_diversity_index(fcat)
275+
### subset the FCAT scores to include the selected factors
276+
fcat_subset = fcat.iloc[:,selected_factors]
277+
simpson_fcat = met.simpson_diversity_index(fcat_subset)
266278

267279
### label dependent factor metrics
268-
asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
269-
asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
280+
asv_cell_type = met.average_scaled_var(factor_scores_subset, covariate_vector=y_cell_type, mean_type='arithmetic')
281+
asv_sample = met.average_scaled_var(factor_scores_subset, y_sample, mean_type='arithmetic')
270282

271283

272284
########### create factor-interpretibility score table (FIST) ######
@@ -276,4 +288,5 @@
276288
'Homogeneity (cell type)':asv_cell_type,
277289
'Homogeneity (sample)':asv_sample}
278290

279-
fist = met.FIST(metrics_dict)
291+
fist = met.FIST(metrics_dict)
292+
vis.plot_FIST(fist, x_axis_label=['F10', 'F19', 'F26','F28' ,'F29', 'F30'])

example_healthyRatLiver.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,15 @@
212212
x_axis_tick_fontsize=32, y_axis_tick_fontsize=34)
213213

214214

215+
fcat = pd.concat([fcat_strain, fcat_cell_type], axis=0)
216+
fcat = fcat[fcat.index != 'NA'] ### remove the rownames called NA from table
217+
218+
vis.plot_FCAT(fcat, title='', color='coolwarm',
219+
x_axis_fontsize=20, y_axis_fontsize=20, title_fontsize=22,
220+
x_axis_tick_fontsize=32, y_axis_tick_fontsize=34)
221+
222+
223+
215224
### using Otsu's method to calculate the threshold
216225
threshold = efca.get_otsu_threshold(fcat.values.flatten())
217226

0 commit comments

Comments
 (0)