Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 5478974

Browse files
authored
Merge pull request #19 from stemangiola/avoid-gene-addition-on-the-fly
Avoid gene addition on the fly
2 parents c45b4a5 + 57613b0 commit 5478974

File tree

6 files changed

+271
-190
lines changed

6 files changed

+271
-190
lines changed

DESCRIPTION

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Depends:
1111
R (>= 4.1.0)
1212
Imports:
1313
dplyr,
14+
SummarizedExperiment,
15+
SingleCellExperiment,
1416
tidySingleCellExperiment,
1517
purrr,
1618
zellkonverter,

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ importFrom(HDF5Array,HDF5RealizationSink)
88
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
99
importFrom(RSQLite,SQLITE_RO)
1010
importFrom(RSQLite,SQLite)
11+
importFrom(SingleCellExperiment,SingleCellExperiment)
12+
importFrom(SummarizedExperiment,colData)
1113
importFrom(dplyr,as_tibble)
1214
importFrom(dplyr,pull)
1315
importFrom(dplyr,tbl)

R/query.R

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#' @importFrom stringr str_remove
1616
#' @importFrom HDF5Array HDF5RealizationSink
1717
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
18+
#' @importFrom SingleCellExperiment SingleCellExperiment
19+
#' @importFrom SummarizedExperiment colData
1820
#'
1921
#' @export
2022
#'
@@ -53,24 +55,9 @@ get_SingleCellExperiment = function(.data, repository = "/vast/projects/RCP/huma
5355
unlist() |>
5456
unique()
5557

56-
sce =
57-
sce |>
58-
map(~ {
59-
missing_genes = all_genes |> setdiff(rownames(.x))
60-
61-
missing_matrix =
62-
HDF5RealizationSink(c(length(missing_genes),ncol(.x)), as.sparse = TRUE) |>
63-
as("DelayedArray")
64-
65-
rownames(missing_matrix) = missing_genes
66-
colnames(missing_matrix) = colnames(.x)
67-
68-
missing_sce = SingleCellExperiment(list(X=missing_matrix), colData=colData(.x))
69-
missing_sce@int_colData = .x@int_colData
70-
71-
# Make cell name unique
72-
.x |> rbind(missing_sce )
73-
})
58+
# Drop files with only one cell, which cause the error
59+
# "the DFrame objects to combine must have the same column names"
60+
sce = sce[map_int(sce, ncol)>1]
7461

7562
cat("\n")
7663

dev/DB2_files.R

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@ library(HCAquery)
1717
options(scipen = 999)
1818
#
1919

20-
# # CREATE MAKEFILE
21-
# tab = "\t"
22-
# root_directory = "/vast/projects/RCP/human_cell_atlas"
23-
# raw_data_directory = glue("{root_directory}/raw_data")
24-
# splitted_DB2_data_directory = glue("{root_directory}/splitted_DB2_data")
25-
# file_cell_types_directory = glue("{root_directory}/file_cell_types")
26-
# input_files_path = dir(file_cell_types_directory, full.names = TRUE)
27-
# #
20+
# CREATE MAKEFILE
21+
tab = "\t"
22+
root_directory = "/vast/projects/RCP/human_cell_atlas"
23+
raw_data_directory = glue("{root_directory}/raw_data")
24+
splitted_DB2_data_directory = glue("{root_directory}/splitted_DB2_data")
25+
file_cell_types_directory = glue("{root_directory}/file_cell_types")
26+
input_files_path = dir(file_cell_types_directory, full.names = TRUE)
27+
gene_names = glue("{root_directory}/gene_names.rds")
28+
#
29+
#
2830
# ## metadata = readRDS(metadata_path)
2931
#
3032
# get_metadata() |>
@@ -67,7 +69,7 @@ options(scipen = 999)
6769
# c(
6870
# glue("CATEGORY=split_data{..4}\nMEMORY={..3}\nCORES=1\nWALL_TIME=30000"),
6971
# glue(
70-
# "{..1}:{..2}\n{tab}Rscript DB2_files.R {..2} {..1}"
72+
# "{..1}:{..2}\n{tab}Rscript DB2_files.R {..2} {gene_names} {..1}"
7173
# )
7274
# )
7375
# )) |>
@@ -80,7 +82,8 @@ options(scipen = 999)
8082
# Read arguments
8183
args = commandArgs(trailingOnly = TRUE)
8284
input_file = args[[1]]
83-
output_file = args[[2]]
85+
all_gene_names = args[[2]]
86+
output_file = args[[3]]
8487

8588
output_file |> dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
8689
file_id = basename(input_file) |> tools::file_path_sans_ext() |> str_split("___") %>% .[[1]] %>% .[1]
@@ -157,6 +160,21 @@ colnames(sce) = colnames(X)
157160
rm(X)
158161
gc()
159162

163+
# Add missing genes
164+
missing_genes = readRDS(all_gene_names) |> setdiff(rownames(sce))
165+
missing_matrix =
166+
HDF5RealizationSink(c(length(missing_genes),ncol(sce)), as.sparse = TRUE) |>
167+
as("DelayedArray")
168+
169+
rownames(missing_matrix) = missing_genes
170+
colnames(missing_matrix) = colnames(sce)
171+
172+
missing_sce = SingleCellExperiment(list(X=missing_matrix), colData=colData(sce))
173+
missing_sce@int_colData = sce@int_colData
174+
175+
# Append the rows for the missing genes to the experiment
176+
sce = sce |> rbind(missing_sce)
177+
160178
sce |> saveHDF5SummarizedExperiment(output_file, replace=TRUE)
161179

162180

0 commit comments

Comments
 (0)