add H5Seurat to Anndata

stemangiola · stemangiola · commit ba26a3f785bc · 2023-03-28T13:55:43.000+11:00
diff --git a/R/dev.R b/R/dev.R
@@ -128,33 +128,33 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 }
 
 #' Converts a series of HDF5Array-serialized SingleCellExperiments to AnnData
-#' @param src A character scalar. The path to a directory containing one or more
+#' @param input_directory A character scalar. The path to a directory containing one or more
 #'  directories created by [HDF5Array::saveHDF5SummarizedExperiment()].
-#' @param dest A character scalar. The path to a directory in which to save the
+#' @param output_directory A character scalar. The path to a directory in which to save the
 #'  created anndata files.
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
 #' \donttest{
-#' dir_to_anndata(
+#' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
 #' )
-#' dir_to_anndata(
+#' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
 #' )
 #' }
-dir_to_anndata = function(src, dest){
-    dir.create(dest, showWarnings = FALSE)
+hdf5_to_anndata = function(input_directory, output_directory){
+    dir.create(output_directory, showWarnings = FALSE)
     # This is a quick utility script to convert the SCE files into AnnData format for use in Pythonlist.files("/vast/projects/RCP/human_cell_atlas/splitted_DB2_data", full.names = FALSE) |>  purrr::walk(function(dir){
     basilisk::basiliskRun(fun = function(sce) {
-        list.dirs(src)[-1] |>
+        list.dirs(input_directory)[-1] |>
             purrr::map_chr(function(sce_dir){
                 cli::cli_alert_info("Processing {sce_dir}.")
                 prefix <- basename(sce_dir)
                 out_path <- glue::glue("{prefix}.h5ad") |>
-                    file.path(dest, name=_)
+                    file.path(output_directory, name=_)
                 
                 if (file.exists(out_path)) {
                     cli::cli_alert_info("{out_path} already exists. Skipping")
@@ -184,3 +184,72 @@ dir_to_anndata = function(src, dest){
             }, .progress = "Converting files")
     }, env = zellkonverter::zellkonverterAnnDataEnv())
 }
+
+
+#' Converts a series of H5Seurat-serialized Seurat objects to AnnData
+#' @param input_directory A character scalar. The path to a directory containing one or more
+#'  files created by [SeuratDisk::SaveH5Seurat()].
+#' @param output_directory A character scalar. The path to a directory in which to save the
+#'  created anndata files.
+#' @param assays Which assays to load from each file; passed on to `LoadH5Seurat()`.
+#' @keywords internal
+#' @return A character vector of anndata file paths, one per input file (existing files are skipped)
+#' @examples
+#' \donttest{
+#' h5seurat_to_anndata(
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
+#' )
+#' h5seurat_to_anndata(
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
+#' )
+#' }
+h5seurat_to_anndata = function(input_directory, output_directory, assays = "RNA"){
+  
+  # LoadH5Seurat() is called unqualified below, so SeuratDisk must be attached
+  if(!"SeuratDisk" %in% (.packages()))
+    stop("CuratedCellAtlas says: please manually load the SeuratDisk package first. Execute `library(SeuratDisk)`")
+    
+  dir.create(output_directory, showWarnings = FALSE)
+  # Convert each H5Seurat file into AnnData format for use in Python
+  basilisk::basiliskRun(fun = function(sce) {
+    dir(input_directory, full.names = TRUE) |>
+      purrr::map_chr(function(seurat_file){
+        cli::cli_alert_info("Processing {seurat_file}.")
+        prefix <- basename(seurat_file)
+        out_path <- glue::glue("{prefix}.h5ad") |>
+          file.path(output_directory, name=_)
+        
+        if (file.exists(out_path)) {
+          cli::cli_alert_info("{out_path} already exists. Skipping")
+        }
+        else {
+          sce <- 
+            LoadH5Seurat(seurat_file, assays = assays) |> 
+            Seurat::as.SingleCellExperiment()
+          
+          single_column <- length(colnames(sce)) == 1
+          if (single_column){
+            # Hack, so that single-column SCEs will convert 
+            # correctly
+            cli::cli_alert_info(
+              "{seurat_file} has only 1 column. Duplicating column."
+            )
+            sce <- cbind(sce, sce)
+          }
+          ad <- zellkonverter::SCE2AnnData(sce)
+          if (single_column){
+            # Remove the duplicate from the AnnData object itself (editing
+            # `sce` here would not affect the already-converted `ad`)
+            ad <- ad$`__getitem__`(0L)$copy()
+          }
+          # TODO: customize chunking here, when anndata supports it
+          # (see https://github.com/scverse/anndata/issues/961)
+          ad$write_h5ad(out_path)
+        }
+        out_path
+      }, .progress = "Converting files")
+  }, env = zellkonverter::zellkonverterAnnDataEnv())
+}
+