Merge pull request #28 from stemangiola/add_scaled_counts

stemangiola · web-flow · commit 886c1595f550 · 2022-11-06T11:16:27.000+11:00
add scale framework
diff --git a/NAMESPACE b/NAMESPACE
@@ -20,8 +20,10 @@ importFrom(dplyr,filter)
 importFrom(dplyr,pull)
 importFrom(dplyr,tbl)
 importFrom(glue,glue)
+importFrom(magrittr,equals)
 importFrom(purrr,map)
 importFrom(purrr,map_int)
 importFrom(purrr,reduce)
+importFrom(purrr,when)
 importFrom(stringr,str_remove)
 importFrom(tidySingleCellExperiment,inner_join)
diff --git a/R/query.R b/R/query.R
@@ -6,7 +6,7 @@
 #' @param genes An optional character vector of genes to return the counts for. By default counts for all genes will be returned.
 #'
 #' @importFrom dplyr pull filter
-#' @importFrom tidySingleCellExperiment inner_join 
+#' @importFrom tidySingleCellExperiment inner_join
 #' @importFrom purrr reduce map map_int
 #' @importFrom BiocGenerics cbind
 #' @importFrom glue glue
@@ -15,13 +15,20 @@
 #' @importFrom stringr str_remove
 #' @importFrom SingleCellExperiment SingleCellExperiment
 #' @importFrom SummarizedExperiment colData assayNames<-
+#' @importFrom purrr when
+#' @importFrom magrittr equals
 #'
 #' @export
 #'
 #'
 get_SingleCellExperiment = function(
   .data,
-  repository = "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data",
+  assay = "counts",
+  repository = when(
+  	assay,
+  	equals(., "counts") ~ "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data",
+  	equals(., "counts_per_million") ~ "/vast/projects/RCP/human_cell_atlas/splitted_DB2_data_scaled"
+  ),
   genes = NULL
 ){
   # We have to convert to an in-memory table here, or some of the dplyr operations will fail when passed a database connection
@@ -41,17 +48,17 @@ get_SingleCellExperiment = function(
 		files_to_read |>
 		map(~ {
 			cat(".")
-		  
+
 		  sce = glue("{repository}/{.x}") |>
 			  loadHDF5SummarizedExperiment()
-		  
+
 		  if (!is.null(genes)){
 		    # Optionally subset the genes
 		    sce = sce[
-		      intersect(genes, rownames(sce))  
+		      intersect(genes, rownames(sce))
 		    ]
 		  }
-		  
+
 		  sce |>
 			  inner_join(
   				# Needed because cell IDs are not unique outside the file_id or file_id_db
@@ -71,15 +78,15 @@ get_SingleCellExperiment = function(
 		sces |>
 		do.call(cbind, args=_)
 
-	# Rename assay
-	assayNames(sce) = "counts"
+	# Rename assay THIS WILL NOT BE NEEDED EVENTUALLY
+	assayNames(sce) = assay
 
 	# Return
 	sce
 }
 
 #' @importFrom SeuratObject as.sparse
-#' @exportS3Method 
+#' @exportS3Method
 as.sparse.DelayedMatrix = function(x){
   # This is glue to ensure the SCE -> Seurat conversion works properly with
   # DelayedArray types
@@ -89,7 +96,7 @@ as.sparse.DelayedMatrix = function(x){
 #' Given a data frame of HCA metadata, returns a Seurat object corresponding to the samples in that data frame
 #'
 #' @inheritDotParams get_SingleCellExperiment
-#' @importFrom Seurat as.Seurat 
+#' @importFrom Seurat as.Seurat
 #' @export
 get_seurat = function(
   ...
diff --git a/README.Rmd b/README.Rmd
@@ -12,6 +12,8 @@ library(dplyr)
 library(dbplyr)
 library(SingleCellExperiment)
 library(tidySingleCellExperiment)
+options("restore_SingleCellExperiment_show" = TRUE)
+
 ```
 
 Load the data
@@ -29,10 +31,9 @@ get_metadata() |>
 	arrange(desc(n))
 ```
 
-Query
+Query raw counts
 
 ```{r}
-options("restore_SingleCellExperiment_show" = TRUE)
 sce = 
 	get_metadata() |> 
     filter(
@@ -47,4 +48,21 @@ sce =
 sce
 ```
 
+Query counts scaled per million. This is helpful if just a few genes are of interest
+
+```{r}
+sce = 
+	get_metadata() |> 
+    filter(
+    	 ethnicity == "African" & 
+        assay %LIKE% "%10x%" & 
+        tissue == "lung parenchyma" & 
+        cell_type %LIKE% "%CD4%"
+    ) |> 
+	
+	get_SingleCellExperiment(assay = "counts_per_million")
+	
+sce
+```
+
 
diff --git a/README.md b/README.md
@@ -9,6 +9,7 @@ library(dplyr)
 library(dbplyr)
 library(SingleCellExperiment)
 library(tidySingleCellExperiment)
+options("restore_SingleCellExperiment_show" = TRUE)
 ```
 
 Load the data
@@ -17,7 +18,7 @@ Load the data
 get_metadata()
 ```
 
-    ## # Source:   table<metadata> [?? x 48]
+    ## # Source:   table<metadata> [?? x 52]
     ## # Database: sqlite 3.39.2 [/vast/projects/RCP/human_cell_atlas/metadata.sqlite]
     ##    .cell   .sample .samp…¹ assay assay…² cell_…³ cell_…⁴ devel…⁵ devel…⁶ disease
     ##    <chr>   <chr>   <chr>   <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
@@ -31,7 +32,7 @@ get_metadata()
     ##  8 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… basal … CL:000… 31-yea… HsapDv… normal 
     ##  9 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… lumina… CL:000… 31-yea… HsapDv… normal 
     ## 10 AAACGG… 5f20d7… D17PrP… 10x … EFO:00… basal … CL:000… 31-yea… HsapDv… normal 
-    ## # … with more rows, 38 more variables: disease_ontology_term_id <chr>,
+    ## # … with more rows, 42 more variables: disease_ontology_term_id <chr>,
     ## #   ethnicity <chr>, ethnicity_ontology_term_id <chr>, file_id <chr>,
     ## #   is_primary_data.x <chr>, organism <chr>, organism_ontology_term_id <chr>,
     ## #   sample_placeholder <chr>, sex <chr>, sex_ontology_term_id <chr>,
@@ -65,10 +66,9 @@ get_metadata() |>
     ## 10 thymus                   17
     ## # … with more rows
 
-Query
+Query raw counts
 
 ``` r
-options("restore_SingleCellExperiment_show" = TRUE)
 sce = 
     get_metadata() |> 
     filter(
@@ -90,15 +90,53 @@ sce
 ```
 
     ## class: SingleCellExperiment 
-    ## dim: 28024 1571 
+    ## dim: 60661 1571 
+    ## metadata(0):
+    ## assays(1): counts
+    ## rownames(60661): TSPAN6 TNMD ... RP11-175I6.6 PRSS43P
+    ## rowData names(0):
+    ## colnames(1571): ACAGCCGGTCCGTTAA_F02526 GGGAATGAGCCCAGCT_F02526 ...
+    ##   TACAACGTCAGCATTG_SC84 CATTCGCTCAATACCG_F02526
+    ## colData names(51): .sample .sample_name ... cell_annotation_azimuth_l2
+    ##   cell_annotation_blueprint_singler
+    ## reducedDimNames(0):
+    ## mainExpName: NULL
+    ## altExpNames(0):
+
+Query counts scaled per million. This is helpful if just a few genes are
+of interest
+
+``` r
+sce = 
+    get_metadata() |> 
+    filter(
+         ethnicity == "African" & 
+        assay %LIKE% "%10x%" & 
+        tissue == "lung parenchyma" & 
+        cell_type %LIKE% "%CD4%"
+    ) |> 
+    
+    get_SingleCellExperiment(assay = "counts_per_million")
+```
+
+    ## Reading 1 files.
+
+    ## .
+
+``` r
+sce
+```
+
+    ## class: SingleCellExperiment 
+    ## dim: 60661 1571 
     ## metadata(0):
-    ## assays(1): X
-    ## rownames(28024): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
+    ## assays(1): counts_per_million
+    ## rownames(60661): TSPAN6 TNMD ... RP11-175I6.6 PRSS43P
     ## rowData names(0):
     ## colnames(1571): ACAGCCGGTCCGTTAA_F02526 GGGAATGAGCCCAGCT_F02526 ...
     ##   TACAACGTCAGCATTG_SC84 CATTCGCTCAATACCG_F02526
-    ## colData names(47): .sample .sample_name ... file_id_db
-    ##   stringr..str_remove.stringr..str_remove..cell...sample...._...
+    ## colData names(51): .sample .sample_name ... cell_annotation_azimuth_l2
+    ##   cell_annotation_blueprint_singler
     ## reducedDimNames(0):
     ## mainExpName: NULL
     ## altExpNames(0):
diff --git a/dev/DB2_files.R b/dev/DB2_files.R
@@ -128,6 +128,17 @@ transformation =
 	pull(transformation) |>
 	unique()
 
+# Ad hoc override: these files are not declared as log-transformed, so force transformation to "log"
+if(file_id %in% c(
+	"1e81a742-e457-4fc6-9c39-c55189ec9dc2",
+	"b51bfa2d-22b2-4c65-9803-d36d4de973fa",
+	"4e4bbb2d-f341-4523-a5a0-5407d8b03e0e",
+	"c48402e4-e7db-4c82-a9e9-51e285e5165c",
+	"82ad3285-e5d4-46d1-89c0-3acf91a9e33f",
+	"7addb561-c1bf-4fb5-ad10-16dd65b3643a",
+	"575513b2-6e53-41e2-85a9-bc08a6233ce4"
+)) transformation = "log"
+
 X =
 	X |>
 	when(
diff --git a/dev/scale_files.R b/dev/scale_files.R
@@ -0,0 +1,55 @@
+library(zellkonverter)
+library(Seurat)
+library(SingleCellExperiment) # load early to avoid masking dplyr::count()
+library(tidySingleCellExperiment)
+library(dplyr)
+library(cellxgenedp)
+library(tidyverse)
+#library(tidySingleCellExperiment)
+library(stringr)
+library(scMerge)
+library(glue)
+library(DelayedArray)
+library(HDF5Array)
+library(tidyseurat)
+library(celldex)
+library(SingleR)
+library(glmGamPoi)
+
+
+# # # CREATE MAKEFILE
+# tab = "\t"
+# root_directory = "/vast/projects/RCP/human_cell_atlas"
+# split_data_directory = glue("{root_directory}/splitted_DB2_data")
+# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled")
+#
+# dir(split_data_directory) |>
+# 	map( ~ glue("{scaled_data_directory}/{.x}:{split_data_directory}/{.x}\n{tab}Rscript scale_files.R {split_data_directory}/{.x} {scaled_data_directory}/{.x}")
+# ) |>
+# 	prepend(glue("CATEGORY=scale_data\nMEMORY=20000\nCORES=1\nWALL_TIME=30000")) |>
+# 	unlist()  |>
+# 	write_lines(glue("dev/scale_files.makeflow"))
+
+
+
+
+
+# Read arguments
+args = commandArgs(trailingOnly=TRUE)
+input_file = args[[1]]
+output_file = args[[2]]
+
+# Create directory
+output_file |>  dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
+
+# Read the input HDF5-backed SummarizedExperiment
+data = loadHDF5SummarizedExperiment(input_file	)
+
+sce = SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data, assay.type = "X")))
+rownames(sce) = rownames(data)
+colnames(sce) = colnames(data)
+
+rm(data)
+gc()
+
+sce |>	saveHDF5SummarizedExperiment(output_file, replace=TRUE)
diff --git a/man/get_SingleCellExperiment.Rd b/man/get_SingleCellExperiment.Rd