Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 928e459

Browse files
authored
Merge pull request #111 from stemangiola/allow-H5Seurat-for_Anndata
Allow h5 seurat for anndata
2 parents cfeaed2 + 9062d67 commit 928e459

15 files changed

+659
-243
lines changed

R/dev.R

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,33 +128,33 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
128128
}
129129

130130
#' Converts a series of HDF5Array-serialized SingleCellExperiments to AnnData
131-
#' @param src A character scalar. The path to a directory containing one or more
131+
#' @param input_directory A character scalar. The path to a directory containing one or more
132132
#' directories created by [HDF5Array::saveHDF5SummarizedExperiment()].
133-
#' @param dest A character scalar. The path to a directory in which to save the
133+
#' @param output_directory A character scalar. The path to a directory in which to save the
134134
#' created anndata files.
135135
#' @keywords internal
136136
#' @return A character vector of the newly-created anndata files
137137
#' @examples
138138
#' \donttest{
139-
#' dir_to_anndata(
139+
#' hdf5_to_anndata(
140140
#' "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
141141
#' "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
142142
#' )
143-
#' dir_to_anndata(
143+
#' hdf5_to_anndata(
144144
#' "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
145145
#' "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
146146
#' )
147147
#' }
148-
dir_to_anndata <- function(src, dest){
149-
dir.create(dest, showWarnings = FALSE)
148+
hdf5_to_anndata = function(input_directory, output_directory){
149+
dir.create(output_directory, showWarnings = FALSE)
150150
# Convert each serialized SCE directory into an AnnData (.h5ad) file for use in Python
151151
basilisk::basiliskRun(fun = function(sce) {
152-
list.dirs(src)[-1] |>
152+
list.dirs(input_directory)[-1] |>
153153
purrr::map_chr(function(sce_dir){
154154
cli::cli_alert_info("Processing {sce_dir}.")
155155
prefix <- basename(sce_dir)
156156
out_path <- glue::glue("{prefix}.h5ad") |>
157-
file.path(dest, name=_)
157+
file.path(output_directory, name=_)
158158

159159
if (file.exists(out_path)) {
160160
cli::cli_alert_info("{out_path} already exists. Skipping")
@@ -185,6 +185,73 @@ dir_to_anndata <- function(src, dest){
185185
}, env = zellkonverter::zellkonverterAnnDataEnv())
186186
}
187187

188+
#' Converts a series of H5-serialized Seurat objects to AnnData
#' @param input_directory A character scalar. The path to a directory containing one or more
#'   files created by [SeuratDisk::SaveH5Seurat()].
#' @param output_directory A character scalar. The path to a directory in which to save the
#'   created anndata files.
#' @param assays A character vector of assay names. It is forwarded to
#'   `LoadH5Seurat()` as its `assays` argument. Defaults to `"RNA"`.
#' @keywords internal
#' @return A character vector with one anndata file path per entry of
#'   `input_directory`, including paths that already existed and were skipped.
#' @examples
#' \donttest{
#' h5seurat_to_anndata(
#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
#' )
#' h5seurat_to_anndata(
#'     "/vast/projects/cellxgene_curated/splitted_DB2_data_scaled_0.2.1",
#'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
#' )
#' }
h5seurat_to_anndata <- function(input_directory, output_directory, assays = "RNA"){

    # `LoadH5Seurat()` is called unqualified below, so SeuratDisk has to be
    # attached by the caller before this function is used.
    if(!"SeuratDisk" %in% (.packages()))
        stop("CuratedCellAtlas says: please manually load the SeuratDisk package first. Execute `library(SeuratDisk)`")

    # `recursive = TRUE` so that a nested output path can be created in one go
    dir.create(output_directory, showWarnings = FALSE, recursive = TRUE)

    # Convert every file of `input_directory` into an AnnData (.h5ad) file,
    # inside the Python environment shipped with zellkonverter
    basilisk::basiliskRun(fun = function() {
        dir(input_directory, full.names = TRUE) |>
            purrr::map_chr(function(seurat_file){
                cli::cli_alert_info("Processing {seurat_file}.")
                prefix <- basename(seurat_file)
                out_path <- file.path(output_directory, paste0(prefix, ".h5ad"))

                if (file.exists(out_path)) {
                    cli::cli_alert_info("{out_path} already exists. Skipping")
                }
                else {
                    sce <-
                        LoadH5Seurat(seurat_file, assays = assays) |>
                        Seurat::as.SingleCellExperiment()

                    single_column <- ncol(sce) == 1L
                    if (single_column){
                        # Hack, so that single-column SCEs will convert
                        # correctly
                        cli::cli_alert_info(
                            "{seurat_file} has only 1 column. Duplicating column."
                        )
                        sce <- cbind(sce, sce)
                    }
                    ad <- zellkonverter::SCE2AnnData(sce)
                    if (single_column){
                        # Remove the duplicate cell from the AnnData object
                        # itself. The previous code assigned to `sce`, which is
                        # no longer used after the conversion, so the written
                        # file kept the duplicate.
                        # NOTE(review): this is the equivalent of Python
                        # `ad[0].copy()` (first observation, materialised) -
                        # confirm against the anndata version in the
                        # zellkonverter environment.
                        ad <- ad$`__getitem__`(0L)$copy()
                    }
                    # TODO: customize chunking here, when anndata supports it
                    # (see https://github.com/scverse/anndata/issues/961)
                    ad$write_h5ad(out_path)
                }
                out_path
            }, .progress = "Converting files")
    }, env = zellkonverter::zellkonverterAnnDataEnv())
}
254+
188255
#' Makes a "downsampled" metadata file that only contains the minimal data
189256
#' needed to run the vignette.
190257
#' @param output Character scalar. Path to the output file.
@@ -234,3 +301,4 @@ downsample_metadata <- function(output = "sample_meta.parquet"){
234301

235302
NULL
236303
}
304+

dev/DB2_files.R

Lines changed: 140 additions & 66 deletions
Large diffs are not rendered by default.

dev/DB_files.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ library(scMerge)
1010
library(glue)
1111
library(DelayedArray)
1212
library(HDF5Array)
13-
library(HCAquery)
13+
# library(CuratedAtlasQueryR)
1414
library(openssl)
1515

1616

1717
# CREATE MAKEFILE
1818
tab = "\t"
19-
root_directory = "/vast/scratch/users/mangiola.s/human_cell_atlas"
19+
root_directory = "/vast/projects/cellxgene_curated"
2020
splitted_light_data_directory = "/vast/projects/RCP/human_cell_atlas/splitted_light_data" #glue("{root_directory}/splitted_light_data")
2121
DB_data_directory = glue("{root_directory}/splitted_DB_data")
2222
gene_names = glue("{root_directory}/gene_names.rds")

dev/annotate_files.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ library(celldex)
1616
library(SingleR)
1717
library(glmGamPoi)
1818
source("utility.R")
19-
library(HCAquery)
19+
library(CuratedAtlasQueryR)
2020
library(BiocParallel)
2121
library(scuttle)
2222

0 commit comments

Comments
 (0)