stemangiola
diff --git a/‎R/dev.R‎
Lines changed: 76 additions & 8 deletions b/‎R/dev.R‎
Lines changed: 76 additions & 8 deletions
diff --git a/‎dev/DB2_files.R‎
Lines changed: 140 additions & 66 deletions b/‎dev/DB2_files.R‎
Lines changed: 140 additions & 66 deletions
diff --git a/‎dev/DB_files.R‎
Lines changed: 2 additions & 2 deletions b/‎dev/DB_files.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dev/annotate_files.R‎
Lines changed: 1 addition & 1 deletion b/‎dev/annotate_files.R‎
Lines changed: 1 addition & 1 deletion
@@ -128,33 +128,33 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 }
 
 #' Converts a series of HDF5Array-serialized SingleCellExperiments to AnnData
-#' @param src A character scalar. The path to a directory containing one or more
+#' @param input_directory A character scalar. The path to a directory containing one or more
 #'  directories created by [HDF5Array::saveHDF5SummarizedExperiment()].
-#' @param dest A character scalar. The path to a directory in which to save the
+#' @param output_directory A character scalar. The path to a directory in which to save the
 #'  created anndata files.
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
 #' \donttest{
-#' dir_to_anndata(
+#' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
 #' )
-#' dir_to_anndata(
+#' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
 #' )
 #' }
-dir_to_anndata <- function(src, dest){
-    dir.create(dest, showWarnings = FALSE)
+hdf5_to_anndata = function(input_directory, output_directory){
+    dir.create(output_directory, showWarnings = FALSE)
     # This is a quick utility script to convert the SCE files into AnnData format for use in Pythonlist.files("/vast/projects/RCP/human_cell_atlas/splitted_DB2_data", full.names = FALSE) |>  purrr::walk(function(dir){
     basilisk::basiliskRun(fun = function(sce) {
-        list.dirs(src)[-1] |>
+        list.dirs(input_directory)[-1] |>
             purrr::map_chr(function(sce_dir){
                 cli::cli_alert_info("Processing {sce_dir}.")
                 prefix <- basename(sce_dir)
                 out_path <- glue::glue("{prefix}.h5ad") |>
-                    file.path(dest, name=_)
+                    file.path(output_directory, name=_)
 
                 if (file.exists(out_path)) {
                     cli::cli_alert_info("{out_path} already exists. Skipping")
@@ -185,6 +185,73 @@ dir_to_anndata <- function(src, dest){
     }, env = zellkonverter::zellkonverterAnnDataEnv())
 }
 
+#' Converts a series of H5Seurat-serialized Seurat objects to AnnData
+#'
+#' Every entry of `input_directory` is loaded with `LoadH5Seurat()`, converted
+#' to a SingleCellExperiment and then to AnnData, and written into
+#' `output_directory` as `<input file name>.h5ad`. Outputs that already exist
+#' are left untouched and skipped.
+#'
+#' The SeuratDisk package must be attached (`library(SeuratDisk)`) before
+#' calling this function; otherwise it stops with an error.
+#' @param input_directory A character scalar. The path to a directory containing one or more
+#'  files created by [SeuratDisk::SaveH5Seurat()].
+#' @param output_directory A character scalar. The path to a directory in which to save the
+#'  created anndata files. It is created if it does not exist yet.
+#' @param assays Forwarded as the `assays` argument of `LoadH5Seurat()`.
+#'  Defaults to `"RNA"`.
+#' @keywords internal
+#' @return A character vector with one output path per entry of
+#'  `input_directory`, including outputs that already existed and were skipped.
+#' @examples
+#' \donttest{
+#' h5seurat_to_anndata(
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
+#' )
+#' h5seurat_to_anndata(
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
+#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
+#' )
+#' }
+h5seurat_to_anndata <- function(input_directory, output_directory, assays = "RNA") {
+
+  # LoadH5Seurat() is called unqualified below, so SeuratDisk has to be
+  # attached by the caller beforehand.
+  if (!"SeuratDisk" %in% (.packages())) {
+    stop("CuratedCellAtlas says: please manually load the SeuratDisk package first. Execute `library(SeuratDisk)`")
+  }
+
+  # recursive = TRUE so that a nested output path that does not exist yet is
+  # created instead of silently failing here and erroring later on write.
+  dir.create(output_directory, showWarnings = FALSE, recursive = TRUE)
+
+  # Convert each H5Seurat file to AnnData (.h5ad) for use in Python. The
+  # conversion runs inside the zellkonverter AnnData environment.
+  basilisk::basiliskRun(fun = function() {
+    dir(input_directory, full.names = TRUE) |>
+      purrr::map_chr(function(seurat_file) {
+        cli::cli_alert_info("Processing {seurat_file}.")
+        # The full input file name (extension included) is the output prefix
+        prefix <- basename(seurat_file)
+        out_path <- glue::glue("{prefix}.h5ad") |>
+          file.path(output_directory, name = _)
+
+        if (file.exists(out_path)) {
+          cli::cli_alert_info("{out_path} already exists. Skipping")
+        } else {
+          sce <-
+            LoadH5Seurat(seurat_file, assays = assays) |>
+            Seurat::as.SingleCellExperiment()
+
+          # ncol() also catches a one-column object without column names
+          if (ncol(sce) == 1L) {
+            # Hack, so that single-column SCEs will convert
+            # correctly
+            cli::cli_alert_info(
+              "{seurat_file} has only 1 column. Duplicating column."
+            )
+            sce <- cbind(sce, sce)
+          }
+          ad <- zellkonverter::SCE2AnnData(sce)
+          # NOTE(review): this function previously ran `sce$X <- sce$X[1]`
+          # at this point to "remove the duplicate column". That only touched
+          # the R object after the conversion and never reached `ad`, so it
+          # was dropped as dead code. The duplicated column of a
+          # single-column input is therefore still written to the output;
+          # subsetting `ad` to its first observation is presumably what was
+          # intended -- TODO confirm and implement.
+
+          # TODO: customize chunking here, when anndata supports it
+          # (see https://github.com/scverse/anndata/issues/961)
+          ad$write_h5ad(out_path)
+        }
+        out_path
+      }, .progress = "Converting files")
+  }, env = zellkonverter::zellkonverterAnnDataEnv())
+}
+
 #' Makes a "downsampled" metadata file that only contains the minimal data
 #' needed to run the vignette.
 #' @param output Character scalar. Path to the output file.
@@ -234,3 +301,4 @@ downsample_metadata <- function(output = "sample_meta.parquet"){
 
     NULL
 }
+
@@ -10,13 +10,13 @@ library(scMerge)
 library(glue)
 library(DelayedArray)
 library(HDF5Array)
-library(HCAquery)
+# library(CuratedAtlasQueryR)
 library(openssl)
 
 
 # CREATE MAKEFILE
 tab = "\t"
-root_directory = "/vast/scratch/users/mangiola.s/human_cell_atlas"
+root_directory = "/vast/projects/cellxgene_curated"
 splitted_light_data_directory = "/vast/projects/RCP/human_cell_atlas/splitted_light_data" #glue("{root_directory}/splitted_light_data")
 DB_data_directory = glue("{root_directory}/splitted_DB_data")
 gene_names = glue("{root_directory}/gene_names.rds")
 
@@ -16,7 +16,7 @@ library(celldex)
 library(SingleR)
 library(glmGamPoi)
 source("utility.R")
-library(HCAquery)
+library(CuratedAtlasQueryR)
 library(BiocParallel)
 library(scuttle)