Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 09719b2

Browse files
committed
Remove tidy_SCE dependency, and do the processing in a separate function
1 parent 649300b commit 09719b2

File tree

6 files changed

+217
-120
lines changed

6 files changed

+217
-120
lines changed

DESCRIPTION

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ Imports:
1818
dplyr,
1919
SummarizedExperiment,
2020
SingleCellExperiment,
21-
tidySingleCellExperiment,
2221
purrr (>= 1.0.0),
2322
BiocGenerics,
2423
glue,

NAMESPACE

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,32 @@ S3method(as.sparse,DelayedMatrix)
44
export(get_SingleCellExperiment)
55
export(get_metadata)
66
export(get_seurat)
7+
importClassesFrom(SingleCellExperiment,SingleCellExperiment)
78
importFrom(BiocGenerics,cbind)
89
importFrom(DBI,dbConnect)
910
importFrom(HDF5Array,HDF5RealizationSink)
1011
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
1112
importFrom(RSQLite,SQLITE_RO)
1213
importFrom(RSQLite,SQLite)
14+
importFrom(S4Vectors,DataFrame)
1315
importFrom(Seurat,as.Seurat)
1416
importFrom(SeuratObject,as.sparse)
1517
importFrom(SingleCellExperiment,SingleCellExperiment)
1618
importFrom(SingleCellExperiment,simplifyToSCE)
1719
importFrom(SummarizedExperiment,"assayNames<-")
20+
importFrom(SummarizedExperiment,"colData<-")
1821
importFrom(SummarizedExperiment,colData)
1922
importFrom(assertthat,assert_that)
2023
importFrom(assertthat,has_name)
2124
importFrom(cli,cli_abort)
2225
importFrom(cli,cli_alert_info)
2326
importFrom(cli,cli_alert_success)
2427
importFrom(dplyr,as_tibble)
28+
importFrom(dplyr,collect)
2529
importFrom(dplyr,filter)
2630
importFrom(dplyr,full_join)
31+
importFrom(dplyr,inner_join)
32+
importFrom(dplyr,mutate)
2733
importFrom(dplyr,pull)
2834
importFrom(dplyr,tbl)
2935
importFrom(dplyr,tibble)
@@ -46,4 +52,4 @@ importFrom(purrr,transpose)
4652
importFrom(rappdirs,user_cache_dir)
4753
importFrom(rlang,.data)
4854
importFrom(stats,setNames)
49-
importFrom(tidySingleCellExperiment,inner_join)
55+
importFrom(tibble,column_to_rownames)

R/query.R

Lines changed: 100 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
3535
#' meta <- get_metadata() |> head(2)
3636
#' sce <- get_SingleCellExperiment(meta)
3737
#'
38-
#' @importFrom dplyr pull filter as_tibble
39-
#' @importFrom tidySingleCellExperiment inner_join
38+
#' @importFrom dplyr pull filter as_tibble inner_join collect
39+
#' @importFrom tibble column_to_rownames
4040
#' @importFrom purrr reduce map map_int imap keep
4141
#' @importFrom BiocGenerics cbind
4242
#' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
4949
#' @importFrom cli cli_alert_success cli_alert_info
5050
#' @importFrom rlang .data
5151
#' @importFrom stats setNames
52+
#' @importFrom S4Vectors DataFrame
5253
#'
5354
#' @export
5455
#'
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
7879
## We have to convert to an in-memory table here, or some of the dplyr
7980
## operations will fail when passed a database connection
8081
cli_alert_info("Realising metadata.")
81-
raw_data <- as_tibble(data)
82+
raw_data <- collect(data)
8283
inherits(raw_data, "tbl") |> assert_that()
8384
has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
8485

8586
cache_directory |> dir.create(showWarnings = FALSE)
8687

87-
files_to_read <-
88-
raw_data |>
89-
pull(.data$file_id_db) |>
88+
cells_of_interest <- raw_data |>
89+
pull(.data$.cell) |>
9090
unique() |>
9191
as.character()
9292

@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
9595
# The repository is optional. If not provided we load only from the cache
9696
if (!is.null(repository)) {
9797
cli_alert_info("Synchronising files")
98+
files_to_read <-
99+
raw_data |>
100+
pull(.data$file_id_db) |>
101+
unique() |>
102+
as.character()
98103
parsed_repo <- parse_url(repository)
99104
(parsed_repo$scheme %in% c("http", "https")) |> assert_that()
100105
sync_assay_files(
@@ -104,46 +109,28 @@ get_SingleCellExperiment <- function(
104109
subdirs = subdirs
105110
)
106111
}
107-
files_to_read <-
108-
raw_data |>
109-
pull(.data$file_id_db) |>
110-
unique() |>
111-
as.character()
112112

113-
subdirs |>
113+
cli_alert_info("Reading files.")
114+
sce <- subdirs |>
114115
imap(function(current_subdir, current_assay) {
116+
dir_prefix = file.path(
117+
cache_directory,
118+
current_subdir
119+
)
120+
115121
# Load each file
116-
sces <-
117-
files_to_read |>
118-
map(function(.x) {
119-
sce_path <- file.path(
120-
cache_directory,
121-
current_subdir,
122-
.x
123-
)
124-
125-
file.exists(sce_path) |>
126-
assert_that(
127-
msg = "Your cache does not contain a file you
128-
attempted to query. Please provide the repository
129-
parameter so that files can be synchronised from the
130-
internet"
131-
)
132-
133-
sce <- loadHDF5SummarizedExperiment(sce_path)
134-
135-
if (!is.null(features)) {
136-
# Optionally subset the genes
137-
sce <- sce[
138-
rownames(sce) |> intersect(features)
139-
]
140-
}
141-
142-
sce
143-
}, .progress = list(name = "Reading files")) |>
122+
sces <- raw_data |>
123+
dplyr::group_by(file_id_db) |>
124+
dplyr::summarise(sces = list(group_to_sce(
125+
dplyr::cur_group_id(),
126+
dplyr::cur_data_all(),
127+
dir_prefix,
128+
features
129+
))) |>
130+
dplyr::pull(sces) |>
144131
# Drop files with one cell, which causes the DFrame objects to
145132
# combine must have the same column names
146-
keep(~ ncol(.) > 1) |>
133+
# keep(~ ncol(.) > 1) |>
147134
# Combine each sce by column, since each sce has a different set
148135
# of cells
149136
do.call(cbind, args = _) |>
@@ -156,15 +143,79 @@ get_SingleCellExperiment <- function(
156143
aside(cli_alert_info("Compiling Single Cell Experiment.")) |>
157144
# Combine the assays into one list
158145
reduce(c) |>
159-
SingleCellExperiment(assays = _) |>
160-
aside(cli_alert_info("Attaching metadata.")) |>
161-
# Join back to metadata, which will become coldata annotations
162-
inner_join(
146+
SingleCellExperiment(assays = _)
147+
148+
cli_alert_info("Attaching metadata.")
149+
150+
colData(sce) <- raw_data |>
163151
# Needed because cell IDs are not unique outside the file_id or
164152
# file_id_db
165-
filter(raw_data, .data$file_id_db %in% files_to_read),
166-
by = ".cell"
153+
filter(.data$file_id_db %in% files_to_read) |>
154+
inner_join(
155+
colData(sce) |> as_tibble(rownames = ".cell"),
156+
by = ".cell"
157+
) |>
158+
column_to_rownames(".cell") |>
159+
as("DataFrame")
160+
161+
sce
162+
}
163+
164+
#' Converts a data frame into a single SCE
165+
#'
166+
#' @param prefix Prefix to be added to the column names
167+
#' @param df The data frame to be converted
168+
#' @param dir_prefix The path to the single cell experiment, minus the final segment
169+
#' @param features The list of genes/rows of interest
170+
#'
171+
#' @return A SingleCellExperiment object
172+
#' @importFrom dplyr mutate
173+
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
174+
#' @importFrom SummarizedExperiment colData<-
175+
#' @importFrom tibble column_to_rownames
176+
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
177+
#'
178+
group_to_sce = function(i, df, dir_prefix, features){
179+
sce_path <- df$file_id_db |>
180+
head(1) |>
181+
file.path(
182+
dir_prefix,
183+
suffix = _
167184
)
185+
186+
file.exists(sce_path) |>
187+
assert_that(
188+
msg = "Your cache does not contain a file you
189+
attempted to query. Please provide the repository
190+
parameter so that files can be synchronised from the
191+
internet"
192+
)
193+
194+
sce <- loadHDF5SummarizedExperiment(sce_path)
195+
# The cells we select here are those that are both available in the SCE
196+
# object, and requested for this particular file
197+
cells = colnames(sce) |> intersect(df$.cell)
198+
# We need to make the cell names globally unique, which we can guarantee
199+
# by adding a suffix that is derived from file_id_db, which is the grouping
200+
# variable
201+
new_cellnames = paste0(cells, "_", i)
202+
new_coldata = df |>
203+
mutate(original_cell_id = .cell, .cell = new_cellnames) |>
204+
column_to_rownames(".cell") |>
205+
as("DataFrame")
206+
207+
features |>
208+
is.null() |>
209+
{`if`} (
210+
sce[, cells],
211+
{
212+
# Optionally subset the genes
213+
genes = rownames(sce) |> intersect(features)
214+
sce[genes, cells]
215+
}
216+
) |>
217+
`colnames<-`(new_cellnames) |>
218+
`colData<-`(new_coldata)
168219
}
169220

170221
#' Synchronises one or more remote assays with a local copy
@@ -181,15 +232,15 @@ get_SingleCellExperiment <- function(
181232
#'
182233
#' @return A character vector of files that have been downloaded
183234
#' @importFrom purrr pmap_chr transpose
184-
#' @importFrom httr modify_url GET write_disk stop_for_status
235+
#' @importFrom httr modify_url GET write_disk stop_for_status parse_url
185236
#' @importFrom dplyr tibble transmute filter full_join
186237
#' @importFrom glue glue
187238
#' @importFrom assertthat assert_that
188239
#' @importFrom cli cli_alert_success cli_alert_info cli_abort
189240
#' @noRd
190241
#'
191242
sync_assay_files <- function(
192-
url = httr::parse_url(REMOTE_URL),
243+
url = parse_url(REMOTE_URL),
193244
cache_dir,
194245
subdirs,
195246
files

0 commit comments

Comments
 (0)