BaderLab
diff --git a/‎data_prep/data_humanLiver.R‎
Lines changed: 56 additions & 7 deletions b/‎data_prep/data_humanLiver.R‎
Lines changed: 56 additions & 7 deletions
diff --git a/‎data_prep/data_scMixology.R‎
Lines changed: 1 addition & 1 deletion b/‎data_prep/data_scMixology.R‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎data_prep/data_stimPBMC.R‎
Lines changed: 193 additions & 0 deletions b/‎data_prep/data_stimPBMC.R‎
Lines changed: 193 additions & 0 deletions
diff --git a/‎example_healthyHumanLiver.py‎
Lines changed: 28 additions & 15 deletions b/‎example_healthyHumanLiver.py‎
Lines changed: 28 additions & 15 deletions
diff --git a/‎example_healthyRatLiver.py‎
Lines changed: 9 additions & 0 deletions b/‎example_healthyRatLiver.py‎
Lines changed: 9 additions & 0 deletions
@@ -13,6 +13,53 @@ sum_MT_conuts = colSums(GetAssayData(HumanLiverSeurat, layer = 'counts')[mt_indi
 sum_counts = colSums(GetAssayData(HumanLiverSeurat, layer = 'counts'))
 HumanLiverSeurat[["percent.mt"]]  = sum_MT_conuts/sum_counts
 
+meta_data = HumanLiverSeurat@meta.data
+pca_df = Embeddings(HumanLiverSeurat, 'pca')
+
+
+HumanLiverSeurat <- RunUMAP(HumanLiverSeurat, dims = 1:30, reduction = "pca")
+umap_df = Embeddings(HumanLiverSeurat, 'umap')
+head(HumanLiverSeurat)
+head(umap_df)
+sum(colnames(HumanLiverSeurat) != rownames(umap_df))
+umap_df2 = cbind(umap_df, HumanLiverSeurat@meta.data)
+dim(umap_df2)
+dim(umap_df)
+
+
+library(RColorBrewer)
+# Number of distinct cell-type labels present in the UMAP/metadata table
+num_colors = length(names(table(umap_df2$cell_type)))
+# Bug fix: `n` was left empty (`brewer.pal(n = , ...)`), which fails with a
+# missing-argument error. Set3 provides between 3 and 12 colours, so the
+# requested number is clamped to that range.
+color_palette <- brewer.pal(n = max(3, min(num_colors, 12)), name = "Set3")
+
+# Build a 20-colour palette: Set3 tops out at 12 colours, so 8 colours from
+# Dark2 are appended to reach 20.
+colors_20 <- brewer.pal(12, "Set3")
+colors_20 <- c(colors_20, brewer.pal(8, "Dark2"))
+
+# Visualize the combined palette as 20 equal-height bars
+barplot(rep(1, 20), col = colors_20, border = NA)
+
+my_colors = c('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', 
+              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22')
+ggplot(umap_df2, aes(x=umap_1, y=umap_2, color=cell_type))+
+  geom_point(size = 1.4, alpha = 0.8) +  # Adjust point size and transparency
+  #scale_color_brewer(palette = "Set1") +  # Alternative: use the Set1 brewer palette
+  scale_color_manual(values = colors_20) +
+  theme_classic() +  # Use a clean theme
+  theme(
+    axis.text.x = element_text(size = 12),  # Make x-axis labels readable
+    axis.text.y = element_text(size = 12),  # Make y-axis labels readable
+    axis.title.x = element_text(size = 14), # Make x-axis title readable
+    axis.title.y = element_text(size = 14), # Make y-axis title readable
+    legend.title = element_text(size = 13), # Make legend title readable
+    legend.text = element_text(size = 11)   # Make legend text readable
+  ) +
+  labs(
+    x = "UMAP Dimension 1",  # Label for x-axis
+    y = "UMAP Dimension 2",  # Label for y-axis
+    color = 'Cell type',#"Sample",      # Legend title
+    #title = "UMAP embedding of unintegrated data"  # Title
+  )
+
 ################################################################
 ################# DO NOT RUN again #####################
 
@@ -29,15 +76,15 @@ annotations=c('Hep1','abT cell','Hep2','infMac','Hep3','Hep4','plasma cell',
               'hepatic stellate cell')
 
 label_df = data.frame(cluster=paste0('cluster_',1:20),labels=annotations)
-Idents(seur) = paste0('cluster_', as.character(seur$res.0.8))
-human_liver_annot = data.frame(umi=colnames(seur), cluster=Idents(seur))
+Idents(HumanLiverSeurat) = paste0('cluster_', as.character(HumanLiverSeurat$res.0.8))
+human_liver_annot = data.frame(umi=colnames(HumanLiverSeurat), cluster=Idents(HumanLiverSeurat))
 human_liver_annot = merge(human_liver_annot, label_df, by.x='cluster', by.y='cluster', all.x=T, sort=F)
 
-human_liver_annot_sorted <- human_liver_annot[match(colnames(seur), human_liver_annot$umi),]
-sum(human_liver_annot_sorted$umi != colnames(seur))
-seur$cell_type = human_liver_annot_sorted$labels
+human_liver_annot_sorted <- human_liver_annot[match(colnames(HumanLiverSeurat), human_liver_annot$umi),]
+sum(human_liver_annot_sorted$umi != colnames(HumanLiverSeurat))
+HumanLiverSeurat$cell_type = human_liver_annot_sorted$labels
 
-seur$sample = unlist(lapply(strsplit(colnames(seur), '_'), '[[', 1))
+HumanLiverSeurat$sample = unlist(lapply(strsplit(colnames(HumanLiverSeurat), '_'), '[[', 1))
 
 SaveH5Seurat(seur, filename ='~/sciFA/Data/HumanLiverAtlas.h5Seurat' ,overwrite = TRUE)
 Convert('~/sciFA/Data/HumanLiverAtlas.h5Seurat', dest = "h5ad")
@@ -225,7 +272,9 @@ ggplot(tsne_df_merged_2, aes(umap_1,umap_2,color=factor_28))+geom_point(alpha=0.
 ###########################################################################
 factor_loading = read.csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv')
 genes = read.csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv')
-df = data.frame(genes= genes$X0,factor=factor_loading$X17)
+colnames(factor_loading) = paste0('F', 1:ncol(factor_loading))
+df = data.frame(genes= genes$X0,factor=factor_loading$F29)
+
 varimax_loading_df_ord = df[order(df$factor, decreasing = F),]
 varimax_loading_vis = head(varimax_loading_df_ord, 20)
 varimax_loading_vis$genes
 
@@ -17,7 +17,7 @@ set.seed(5252)
 ############### Merging the 3cl data to be imported to python #############
 ###########################################################################
 
-load("~/sciFA/data/sincell_with_class.RData") ## 3 cell line data
+load("~/scLMM/sc_mixology/data//sincell_with_class.RData") ## 3 cell line data
 #sce10x_qc contains the read counts after quality control processing from the 10x platform. 
 #sce4_qc contains the read counts after quality control processing from the CEL-seq2 platform. 
 #scedrop_qc_qc contains the read counts after quality control proessing from the Drop-seq platform.
 
@@ -33,3 +33,196 @@ SaveH5Seurat(Kang18_8vs8_seur, filename = "~/sciFA/Data/PBMC_Lupus_Kang8vs8_data
 Convert("~/sciFA/Data/PBMC_Lupus_Kang8vs8_data.h5Seurat", dest = "h5ad")
 
 
+
+
+###########################################################################################
+#######################. evaluate correlation with QC parameters. ########################
+###########################################################################################
+# Load the count object and the per-cell factor scores, then correlate a
+# subset of factors with the per-cell QC covariates and draw a heatmap.
+Kang18_8vs8_seur <- readRDS("~/scLMM/LMM-scRNAseq//Data/PBMC_Lupus_Kang8vs8_data_counts.rds")
+factor_df = read.csv('~/sciFA/Results/pca_scores_varimax_df_merged_lupusPBMC.csv')
+head(factor_df)
+##### calculating the mt percentage
+#HumanLiverSeurat[["percent.mt"]] <- PercentageFeatureSet(HumanLiverSeurat, pattern = "^MT-")
+mt_indices = grep('^MT-',rownames(Kang18_8vs8_seur))
+# drop = FALSE keeps a matrix even when exactly one MT gene matches, so
+# colSums() still works on the subset.
+sum_MT_conuts = colSums(GetAssayData(Kang18_8vs8_seur, layer = 'counts')[mt_indices, , drop = FALSE])
+sum_counts = colSums(GetAssayData(Kang18_8vs8_seur, layer = 'counts'))
+# Stored as a fraction of total counts (MT counts / all counts), not 0-100
+Kang18_8vs8_seur[["percent.mt"]]  = sum_MT_conuts/sum_counts
+
+# Number of cells whose name differs between the Seurat object and factor_df;
+# this should print 0 before columns are copied across by position.
+sum(colnames(Kang18_8vs8_seur)!=factor_df$X)
+# Bug fix: `[[ ]]` on the object returns a one-column data.frame, which ended
+# up nested inside factor_df. `$` returns the metadata column as a vector.
+factor_df$percent.mt = Kang18_8vs8_seur$percent.mt
+
+qc_columns = c('nCount_originalexp', 'nFeature_originalexp' )
+factor_cols = paste0('F', c(1, 3, 4, 8, 13, 15, 16, 17, 19, 21:24, 26:30 ))
+to_keep_cols = c(factor_cols, qc_columns)
+factor_df_sub = factor_df[,colnames(factor_df) %in% to_keep_cols]
+
+# Keep only the QC-by-factor corner of the full correlation matrix
+cor_mat = cor(factor_df_sub)[qc_columns, factor_cols]
+
+
+library(pheatmap)
+# make the color palette
+clrsp <- colorRampPalette(c("darkgreen", "white", "purple"))   
+clrs <- clrsp(200) 
+breaks1 <- seq(-1, 1, length.out = 200)
+# Display names for the two rows in qc_columns, in the same order
+rownames(cor_mat)[1:2] = c('Total Counts', 'Total Features')
+cor_mat.t = t(cor_mat)
+pheatmap(cor_mat.t, cluster_cols = FALSE, breaks = breaks1, color =  clrs, display_numbers = TRUE, 
+         cluster_rows = FALSE, fontsize_row = 11, fontsize_col = 12)
+
+
+
+# Factor to plot. The plotted column and the axis label are both derived from
+# this value so they cannot drift apart. Previously factor_number was 25 while
+# the plot hard-coded F29, so the axis read "F25 Score" over F29 data; the
+# plotted factor (F29) is kept.
+factor_number=29
+factor_col = paste0('F', factor_number)
+factor_df2 = factor_df
+factor_df2 = factor_df2[!is.na(factor_df2$cell),]
+#factor_df2 = factor_df2[factor_df2$F22<(20),]
+
+
+# NOTE(review): color_palette is not defined in the visible part of this
+# script - confirm it exists (with enough colours for every level of `cell`).
+ggplot(factor_df2, aes(y = .data[[factor_col]], x = cell,fill = cell)) +
+  geom_boxplot() +  
+  scale_fill_manual(values = color_palette) +  
+  theme_classic() +  # Use a clean theme
+  theme(
+    axis.text.x = element_text(size = 12),  
+    axis.text.y = element_text(size = 12),  
+    axis.title.x = element_text(size = 14), 
+    axis.title.y = element_text(size = 14), 
+    legend.title = element_text(size = 13), 
+    legend.text = element_text(size = 11)
+  ) + coord_flip()+
+  labs(
+    x = "",  # Label for x-axis
+    y = paste0(factor_col, " Score"),  # Label for y-axis
+    # The mapped aesthetic is `fill`, so the legend title must be set on `fill`
+    fill = "Cell Type"  # Legend title
+    #title = paste0("Scatter Plot of F1 vs F", factor_number,  " Colored by cell type")
+  )
+
+
+
+
+###################################################################
+
+library(gprofiler2)
+
+# Run a g:Profiler enrichment query restricted to the GO:BP and REAC sources.
+#   markers           gene identifiers, passed to gost() as an ordered query
+#   model_animal_name organism id passed to gost(), e.g. 'hsapiens'
+# Returns the object produced by gost().
+get_gprofiler_enrich <- function(markers, model_animal_name) {
+  gost(
+    query = markers,
+    organism = model_animal_name,
+    sources = c('GO:BP', 'REAC'),
+    ordered_query = TRUE,
+    exclude_iea = TRUE
+  )
+}
+
+# Plot the 20 highest and 20 lowest gene loadings of one varimax factor.
+factor_loading = read.csv('~/sciFA/Results/varimax_loading_df_lupusPBMC.csv')
+
+factor_i = 3
+df = data.frame(gene = factor_loading$X, factor = factor_loading[[paste0('F',factor_i)]])
+# Strip only the trailing "-EN..." suffix from each name. The earlier step that
+# split on the first "-" cut hyphenated gene symbols down to their first
+# fragment and could create duplicate names, which factor() below rejects as
+# duplicated levels. The factor-22 section further down already disables it.
+df$gene <- sub("-EN.*", "", df$gene)
+# Select top and bottom 20 genes (sort once, then take both ends)
+df_ord = df[order(df$factor, decreasing = TRUE), ]
+varimax_loading_vis = rbind(head(df_ord, 20), tail(df_ord, 20))
+
+# Factor levels based on the ordering of genes
+varimax_loading_vis$gene <- factor(varimax_loading_vis$gene, 
+                                   levels = varimax_loading_vis$gene)
+
+# Genes on the x-axis in loading order; coord_flip() at the end is left
+# disabled, so the plot stays horizontal
+ggplot(varimax_loading_vis, aes(x = gene, y = factor, color = factor)) +
+  geom_point(size = 2, alpha = 1) +
+  theme_bw() +
+  theme(
+    axis.text.x = element_text(color = "grey20", size = 12, angle=-90),  # Adjust font size for horizontal axis
+    axis.text.y = element_text(color = "grey20", size = 11.5, hjust = 1, vjust = 0.5),
+    axis.title.x = element_text(color = "grey20", size = 14),
+    axis.title.y = element_text(color = "grey20", size = 14),
+    legend.text = element_text(hjust = 1),
+    legend.position = "left",
+    legend.direction = "vertical"
+  ) +
+  # Set gradient colors
+  scale_color_gradient2(
+    name = paste0("Factor ", factor_i),
+    midpoint = 0,
+    low = "darkgreen",   # Color for negative values
+    mid = "white",       # Color for midpoint
+    high = "darkred",    # Color for positive values
+    space = "Lab"
+  ) +
+  ylab('Factor loading') +
+  xlab('') +
+  ggtitle(paste0("Factor ", factor_i))
+  #coord_flip()  # Flip the coordinates to make the plot vertical
+
+
+
+
+
+# Rank genes by their loading on one factor, show the top of the list as a
+# table, and run enrichment on both ends of the ranking.
+model_animal_name ='hsapiens'
+
+
+factor_i = 22
+df = data.frame(gene = factor_loading$X, factor = factor_loading[[paste0('F',factor_i)]])
+#df$gene <- sapply(strsplit(as.character(df$gene), "-"), `[`, 1)
+df$gene <- sub("-EN.*", "", df$gene)
+
+
+df_pos = df[order(df$factor, decreasing = TRUE),]
+df_neg = df[order(df$factor, decreasing = FALSE),]
+
+head(df_pos,10)
+head(df_neg,10)
+num_genes = 200
+
+# head() rather than [1:20, ] so a short table is not padded with NA rows
+table_to_vis = head(df_pos, 20)
+rownames(table_to_vis) = NULL
+colnames(table_to_vis) = c('Gene', 'Score')
+table_to_vis$Score = round(table_to_vis$Score, 3)
+library(gridExtra)
+# dev.off() raises an error when only the null device is open, so close the
+# current device only if there is one
+if (dev.cur() > 1) dev.off()
+tt2 <- ttheme_minimal()
+gridExtra::grid.table(table_to_vis, theme=tt2)
+
+# Keep both results under their own names; previously the positive result was
+# overwritten by the negative one straight away. head() avoids passing NA
+# markers when fewer than num_genes genes exist.
+######## pos enrichment
+enrich_res_pos = get_gprofiler_enrich(markers=head(df_pos$gene, num_genes), 
+                                      model_animal_name = model_animal_name )#'gp__SEA8_T0ld_VHU'
+######## neg enrichment
+enrich_res_neg = get_gprofiler_enrich(markers=head(df_neg$gene, num_genes), 
+                                      model_animal_name = model_animal_name )#'gp__SEA8_T0ld_VHU'
+# Later code reads `enrich_res`; it held the negative-side result before, so
+# that is preserved. Switch to enrich_res_pos to plot the positive side.
+enrich_res = enrich_res_neg
+head(enrich_res$result,30)
+
+
+
+# Bar chart of the most significant enrichment terms, ordered by -log(p value).
+enrich_res_df = data.frame(enrich_res$result)
+enrich_res_df$log_p = -log(as.numeric(enrich_res_df$p_value))
+enrich_res_df = enrich_res_df[order(enrich_res_df$log_p, decreasing = TRUE),]
+#View(enrich_res_df)
+
+num_term_vis = 15
+# head() rather than [1:num_term_vis, ]: with fewer than num_term_vis terms the
+# index form pads the table with all-NA rows
+enrich_res_df = head(enrich_res_df, num_term_vis)
+#enrich_res_df = enrich_res_df[c(2, 5,28,29,32,42,46,48,62,65),]
+
+# log_p is kept here instead of being dropped and recomputed from p_value
+enrich_res_df = enrich_res_df[,colnames(enrich_res_df) %in% c('term_name', 'p_value', 'log_p')]
+
+enrich_res_df$term_name = gsub('metabolic process', 'metabolism',enrich_res_df$term_name)
+#enrich_res_df$term_name[9] = "The citric acid (TCA) cycle"
+# Reversed levels so the first (most significant) term is the last level
+enrich_res_df$term_name <- factor(enrich_res_df$term_name, 
+                                   levels =  rev(enrich_res_df$term_name))
+
+#color = "coral3"
+color = "darkseagreen3"
+
+title = ''#'stim'#'Male'
+# NOTE(review): drops the 7th term by position - presumably a manual choice
+# for one particular run; confirm before reusing with another factor
+enrich_res_df = enrich_res_df[-7,]
+# `fill` is a constant here, not a mapped aesthetic, so no fill scale is needed
+ggplot(enrich_res_df, aes(y=term_name,x=log_p))+
+  geom_bar(stat = 'identity',fill=color,color='grey10')+
+  xlab('-log(p value)')+
+  theme_classic()+ylab('')+ggtitle(title)+
+  theme(axis.text.x = element_text(color = "grey20", size = 13, angle = 0, hjust = .5, vjust = .5, face = "plain"),
+        axis.text.y = element_text(color = "grey20", size = 12, angle = 0, hjust = 1, vjust = 0, face = "plain"),  
+        axis.title.x = element_text(color = "grey20", size = 17, angle = 0, hjust = .5, vjust = 0, face = "plain"),
+        axis.title.y = element_text(color = "grey20", size = 17, angle = 90, hjust = .5, vjust = .5, face = "plain"))
+
+
+
@@ -28,7 +28,6 @@
 data = exproc.import_AnnData(data_file_path)
 data, gene_idx = proc.get_sub_data(data, num_genes=NUM_GENES) # subset the data to num_genes HVGs
 
-
 y, genes, num_cells, num_genes = proc.get_data_array(data)
 
 #pd.DataFrame(genes).to_csv('/home/delaram/sciFA/Results/genes_humanlivermap.csv', index=False)
@@ -210,7 +209,12 @@
 #factor_scores_df.to_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv', index=False)
 #pd.DataFrame(factor_loading).to_csv('/home/delaram/sciFA/Results/factor_loading_humanlivermap.csv', index=False)
 
-
+### read the factor scores dataframe
+factor_scores_df = pd.read_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv')
+### select columns whose name contains 'factor_' (substring match, not only a prefix)
+pattern_cols = [col for col in factor_scores_df.columns if 'factor_' in col]
+factor_scores_df_factor = factor_scores_df[pattern_cols]
+factor_scores = factor_scores_df_factor.values
 
 ####################################
 #### Bimodality scores
@@ -240,33 +244,41 @@
 vis.plot_FIST(fist, title='Scaled metrics for all the factors')
 ### subset the first 15 factors of fist dataframe
 vis.plot_FIST(fist.iloc[0:15,:])
-### include factors F10, F19, F26, F28, F30
-vis.plot_FIST(fist.iloc[[9, 18, 25, 27, 29],:], 
-              x_axis_label=['F10', 'F19', 'F26', 'F28', 'F30'])
+### include factors F10, F19, F26, F29, F30
+vis.plot_FIST(fist.iloc[[9, 18, 25, 28, 29],:], 
+              x_axis_label=['F10', 'F19', 'F26', 'F29', 'F30'])
 vis.plot_FIST(fist.iloc[matched_factor_index,:])
 
 
 
+### read the factor scores dataframe
+factor_scores_df = pd.read_csv('/home/delaram/sciFA/Results/factor_scores_umap_df_humanlivermap.csv')
+### select columns whose name contains 'factor_' (substring match, not only a prefix)
+pattern_cols = [col for col in factor_scores_df.columns if 'factor_' in col]
+factor_scores_df_factor = factor_scores_df[pattern_cols]
+factor_scores = factor_scores_df_factor.values
 ################################################################
 ########  Creating the FIS table for a subset of factors ########
 ################################################################
 #### Bimodality scores
-### subset factor scores to include factors F10, F19, F26, F28, F30
-selected_factors = [9, 18, 25, 27, 29]
+### subset factor scores to include factors F10, F19, F26, F28, F29, F30 (0-based column positions below)
+selected_factors = [9, 18, 25, 27, 28, 29]
 factor_scores_subset = factor_scores[:,selected_factors]
-silhouette_score = met.kmeans_bimodal_score(factor_scores, time_eff=True)
-bimodality_index = met.bimodality_index(factor_scores)
-bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
+#silhouette_score = met.kmeans_bimodal_score(factor_scores_subset, time_eff=True)
+bimodality_index = met.bimodality_index(factor_scores_subset)
+#bimodality_score = np.mean([silhouette_score, bimodality_index], axis=0)
 bimodality_score = bimodality_index
 #### Effect size
-factor_variance = met.factor_variance(factor_scores)
+factor_variance = met.factor_variance(factor_scores_subset)
 
 ## Specificity
-simpson_fcat = met.simpson_diversity_index(fcat)
+### subset the FCAT scores to include the selected factors
+fcat_subset = fcat.iloc[:,selected_factors]
+simpson_fcat = met.simpson_diversity_index(fcat_subset)
 
 ### label dependent factor metrics
-asv_cell_type = met.average_scaled_var(factor_scores, covariate_vector=y_cell_type, mean_type='arithmetic')
-asv_sample = met.average_scaled_var(factor_scores, y_sample, mean_type='arithmetic')
+asv_cell_type = met.average_scaled_var(factor_scores_subset, covariate_vector=y_cell_type, mean_type='arithmetic')
+asv_sample = met.average_scaled_var(factor_scores_subset, y_sample, mean_type='arithmetic')
 
 
 ########### create factor-interpretibility score table (FIST) ######
@@ -276,4 +288,5 @@
                     'Homogeneity (cell type)':asv_cell_type,
                     'Homogeneity (sample)':asv_sample}
 
-fist = met.FIST(metrics_dict)
+fist = met.FIST(metrics_dict)
+vis.plot_FIST(fist, x_axis_label=['F10', 'F19', 'F26','F28' ,'F29', 'F30'])
@@ -212,6 +212,15 @@
               x_axis_tick_fontsize=32, y_axis_tick_fontsize=34)
 
 
+fcat = pd.concat([fcat_strain, fcat_cell_type], axis=0)
+fcat = fcat[fcat.index != 'NA'] ### remove the rownames called NA from table
+
+vis.plot_FCAT(fcat, title='', color='coolwarm',
+              x_axis_fontsize=20, y_axis_fontsize=20, title_fontsize=22,
+              x_axis_tick_fontsize=32, y_axis_tick_fontsize=34)
+
+
+
 ### using Otsu's method to calculate the threshold
 threshold = efca.get_otsu_threshold(fcat.values.flatten())