Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 34e6900

Browse files
authored
Merge pull request #39 from stemangiola/no-tidy-sce
No tidy SCE
2 parents a168755 + 4fbfc46 commit 34e6900

File tree

6 files changed

+365
-94
lines changed

6 files changed

+365
-94
lines changed

DESCRIPTION

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ Imports:
1818
dplyr,
1919
SummarizedExperiment,
2020
SingleCellExperiment,
21-
tidySingleCellExperiment,
2221
purrr (>= 1.0.0),
2322
BiocGenerics,
2423
glue,

NAMESPACE

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,32 @@ S3method(as.sparse,DelayedMatrix)
44
export(get_SingleCellExperiment)
55
export(get_metadata)
66
export(get_seurat)
7+
importClassesFrom(SingleCellExperiment,SingleCellExperiment)
78
importFrom(BiocGenerics,cbind)
89
importFrom(DBI,dbConnect)
910
importFrom(HDF5Array,HDF5RealizationSink)
1011
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
1112
importFrom(RSQLite,SQLITE_RO)
1213
importFrom(RSQLite,SQLite)
14+
importFrom(S4Vectors,DataFrame)
1315
importFrom(Seurat,as.Seurat)
1416
importFrom(SeuratObject,as.sparse)
1517
importFrom(SingleCellExperiment,SingleCellExperiment)
1618
importFrom(SingleCellExperiment,simplifyToSCE)
1719
importFrom(SummarizedExperiment,"assayNames<-")
20+
importFrom(SummarizedExperiment,"colData<-")
1821
importFrom(SummarizedExperiment,colData)
1922
importFrom(assertthat,assert_that)
2023
importFrom(assertthat,has_name)
2124
importFrom(cli,cli_abort)
2225
importFrom(cli,cli_alert_info)
2326
importFrom(cli,cli_alert_success)
2427
importFrom(dplyr,as_tibble)
28+
importFrom(dplyr,collect)
2529
importFrom(dplyr,filter)
2630
importFrom(dplyr,full_join)
31+
importFrom(dplyr,inner_join)
32+
importFrom(dplyr,mutate)
2733
importFrom(dplyr,pull)
2834
importFrom(dplyr,tbl)
2935
importFrom(dplyr,tibble)
@@ -46,4 +52,4 @@ importFrom(purrr,transpose)
4652
importFrom(rappdirs,user_cache_dir)
4753
importFrom(rlang,.data)
4854
importFrom(stats,setNames)
49-
importFrom(tidySingleCellExperiment,inner_join)
55+
importFrom(tibble,column_to_rownames)

R/query.R

Lines changed: 101 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
3535
#' meta <- get_metadata() |> head(2)
3636
#' sce <- get_SingleCellExperiment(meta)
3737
#'
38-
#' @importFrom dplyr pull filter as_tibble
39-
#' @importFrom tidySingleCellExperiment inner_join
38+
#' @importFrom dplyr pull filter as_tibble inner_join collect
39+
#' @importFrom tibble column_to_rownames
4040
#' @importFrom purrr reduce map map_int imap keep
4141
#' @importFrom BiocGenerics cbind
4242
#' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
4949
#' @importFrom cli cli_alert_success cli_alert_info
5050
#' @importFrom rlang .data
5151
#' @importFrom stats setNames
52+
#' @importFrom S4Vectors DataFrame
5253
#'
5354
#' @export
5455
#'
@@ -64,7 +65,7 @@ get_SingleCellExperiment <- function(
6465
assays %in% names(assay_map) |>
6566
all() |>
6667
assert_that(
67-
msg = 'assays must be a character vector containing "counts" and/or
68+
msg = 'assays must be a character vector containing "counts" and/or
6869
"cpm"'
6970
)
7071
(!anyDuplicated(assays)) |> assert_that()
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
7879
## We have to convert to an in-memory table here, or some of the dplyr
7980
## operations will fail when passed a database connection
8081
cli_alert_info("Realising metadata.")
81-
raw_data <- as_tibble(data)
82+
raw_data <- collect(data)
8283
inherits(raw_data, "tbl") |> assert_that()
8384
has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
8485

8586
cache_directory |> dir.create(showWarnings = FALSE)
8687

87-
files_to_read <-
88-
raw_data |>
89-
pull(.data$file_id_db) |>
88+
cells_of_interest <- raw_data |>
89+
pull(.data$.cell) |>
9090
unique() |>
9191
as.character()
9292

@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
9595
# The repository is optional. If not provided we load only from the cache
9696
if (!is.null(repository)) {
9797
cli_alert_info("Synchronising files")
98+
files_to_read <-
99+
raw_data |>
100+
pull(.data$file_id_db) |>
101+
unique() |>
102+
as.character()
98103
parsed_repo <- parse_url(repository)
99104
(parsed_repo$scheme %in% c("http", "https")) |> assert_that()
100105
sync_assay_files(
@@ -104,67 +109,96 @@ get_SingleCellExperiment <- function(
104109
subdirs = subdirs
105110
)
106111
}
107-
files_to_read <-
108-
raw_data |>
109-
pull(.data$file_id_db) |>
110-
unique() |>
111-
as.character()
112112

113-
subdirs |>
113+
cli_alert_info("Reading files.")
114+
sces <- subdirs |>
114115
imap(function(current_subdir, current_assay) {
115-
# Load each file
116-
sces <-
117-
files_to_read |>
118-
map(function(.x) {
119-
sce_path <- file.path(
120-
cache_directory,
121-
current_subdir,
122-
.x
123-
)
116+
# Build up an SCE for each assay
117+
dir_prefix <- file.path(
118+
cache_directory,
119+
current_subdir
120+
)
121+
122+
raw_data |>
123+
dplyr::group_by(file_id_db) |>
124+
# Load each file and attach metadata
125+
dplyr::summarise(sces = list(group_to_sce(
126+
dplyr::cur_group_id(),
127+
dplyr::cur_data_all(),
128+
dir_prefix,
129+
features
130+
))) |>
131+
dplyr::pull(sces) |>
132+
# Combine each sce by column, since each sce has a different set
133+
# of cells
134+
do.call(cbind, args = _)
135+
})
124136

125-
file.exists(sce_path) |>
126-
assert_that(
127-
msg = "Your cache does not contain a file you
128-
attempted to query. Please provide the repository
137+
cli_alert_info("Compiling Single Cell Experiment.")
138+
# Combine all the assays
139+
sce <- sces[[1]]
140+
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
141+
SummarizedExperiment::assays(sce)[[1]]
142+
})
143+
144+
sce
145+
}
146+
147+
#' Converts a data frame into a single SCE
148+
#'
149+
#' @param prefix Prefix to be added to the column names
150+
#' @param df The data frame to be converted
151+
#' @param dir_prefix The path to the single cell experiment, minus the final segment
152+
#' @param features The list of genes/rows of interest
153+
#'
154+
#' @return A SingleCellExperiment object
155+
#' @importFrom dplyr mutate
156+
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
157+
#' @importFrom SummarizedExperiment colData<-
158+
#' @importFrom tibble column_to_rownames
159+
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
160+
#'
161+
group_to_sce <- function(i, df, dir_prefix, features) {
162+
sce_path <- df$file_id_db |>
163+
head(1) |>
164+
file.path(
165+
dir_prefix,
166+
suffix = _
167+
)
168+
169+
file.exists(sce_path) |>
170+
assert_that(
171+
msg = "Your cache does not contain a file you
172+
attempted to query. Please provide the repository
129173
parameter so that files can be synchronised from the
130174
internet"
131-
)
132-
133-
sce <- loadHDF5SummarizedExperiment(sce_path)
175+
)
134176

135-
if (!is.null(features)) {
136-
# Optionally subset the genes
137-
sce <- sce[
138-
rownames(sce) |> intersect(features)
139-
]
140-
}
177+
sce <- loadHDF5SummarizedExperiment(sce_path)
178+
# The cells we select here are those that are both available in the SCE
179+
# object, and requested for this particular file
180+
cells <- colnames(sce) |> intersect(df$.cell)
181+
# We need to make the cell names globally unique, which we can guarantee
182+
# by adding a suffix that is derived from file_id_db, which is the grouping
183+
# variable
184+
new_cellnames <- paste0(cells, "_", i)
185+
new_coldata <- df |>
186+
mutate(original_cell_id = .cell, .cell = new_cellnames) |>
187+
column_to_rownames(".cell") |>
188+
as("DataFrame")
141189

142-
sce
143-
}, .progress = list(name = "Reading files")) |>
144-
# Drop files with one cell, which causes the DFrame objects to
145-
# combine must have the same column names
146-
keep(~ ncol(.) > 1) |>
147-
# Combine each sce by column, since each sce has a different set
148-
# of cells
149-
do.call(cbind, args = _) |>
150-
# We only need the assay, since we ultimately need to combine
151-
# them We need to use :: here since we already have an assays
152-
# argument
153-
SummarizedExperiment::assays() |>
154-
setNames(current_assay)
190+
features |>
191+
is.null() |>
192+
{
193+
`if`
194+
}(
195+
sce[, cells], {
196+
# Optionally subset the genes
197+
genes <- rownames(sce) |> intersect(features)
198+
sce[genes, cells]
155199
}) |>
156-
aside(cli_alert_info("Compiling Single Cell Experiment.")) |>
157-
# Combine the assays into one list
158-
reduce(c) |>
159-
SingleCellExperiment(assays = _) |>
160-
aside(cli_alert_info("Attaching metadata.")) |>
161-
# Join back to metadata, which will become coldata annotations
162-
inner_join(
163-
# Needed because cell IDs are not unique outside the file_id or
164-
# file_id_db
165-
filter(raw_data, .data$file_id_db %in% files_to_read),
166-
by = ".cell"
167-
)
200+
`colnames<-`(new_cellnames) |>
201+
`colData<-`(value = new_coldata)
168202
}
169203

170204
#' Synchronises one or more remote assays with a local copy
@@ -181,17 +215,19 @@ get_SingleCellExperiment <- function(
181215
#'
182216
#' @return A character vector of files that have been downloaded
183217
#' @importFrom purrr pmap_chr transpose
184-
#' @importFrom httr modify_url GET write_disk stop_for_status
218+
#' @importFrom httr modify_url GET write_disk stop_for_status parse_url
185219
#' @importFrom dplyr tibble transmute filter full_join
186220
#' @importFrom glue glue
187221
#' @importFrom assertthat assert_that
188222
#' @importFrom cli cli_alert_success cli_alert_info cli_abort
189223
#' @noRd
190224
#'
191-
sync_assay_files <- function(url = httr::parse_url(REMOTE_URL),
192-
cache_dir,
193-
subdirs,
194-
files) {
225+
sync_assay_files <- function(
226+
url = parse_url(REMOTE_URL),
227+
cache_dir,
228+
subdirs,
229+
files
230+
) {
195231
# Find every combination of file name, sample id, and assay, since each
196232
# will be a separate file we need to download
197233
expand.grid(

0 commit comments

Comments
 (0)