Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 54712d4

Browse files
authored
Merge pull request #143 from myushen/master
update validation columns for get_sce
2 parents 77fa66b + ad959fc commit 54712d4

File tree

8 files changed

+123
-167
lines changed

8 files changed

+123
-167
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: CuratedAtlasQueryR
33
Title: Queries the Human Cell Atlas
4-
Version: 1.3.4
4+
Version: 1.3.5
55
Authors@R: c(
66
person(
77
"Stefano",

R/counts.R

Lines changed: 85 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -46,37 +46,67 @@ get_SingleCellExperiment <- function(...){
4646
}
4747

4848
#' Gets a SingleCellExperiment from curated metadata
49-
#'
50-
#' @inheritDotParams get_summarized_experiment
49+
#'
50+
#' @param data A data frame containing, at minimum, `cell_`, `file_id_db` columns,
51+
#' which correspond to a single cell ID, file subdivision for internal use.
52+
#' They can be obtained from the [get_metadata()] function.
53+
#' @inheritDotParams get_data_container
5154
#' @examples
5255
#' meta <- get_metadata() |> head(2)
5356
#' sce <- get_single_cell_experiment(meta)
5457
#' @export
55-
get_single_cell_experiment <- function(...){
56-
get_summarized_experiment(..., type = "sce")
58+
get_single_cell_experiment <- function(data, ...){
59+
raw_data <- collect(data)
60+
assert_that(
61+
inherits(raw_data, "tbl"),
62+
has_name(raw_data, c("cell_", "file_id_db"))
63+
)
64+
get_data_container(data, ..., repository = COUNTS_URL, grouping_column = "file_id_db")
5765
}
5866

5967
#' Gets a Pseudobulk from curated metadata
6068
#'
61-
#' @inheritDotParams get_summarized_experiment
69+
#' @param data A data frame containing, at minimum, `cell_`, `file_id`,
70+
#' `sample_`, `cell_type_harmonised` columns, which correspond to a single cell ID,
71+
#' file subdivision for internal use, a singlel cell sample ID and harmonised cell type.
72+
#' They can be obtained from the [get_metadata()] function.
73+
#' @inheritDotParams get_data_container
6274
#' @examples
6375
#' \dontrun{
6476
#' meta <- get_metadata() |> filter(tissue_harmonised == "lung")
6577
#' pseudobulk <- meta |> get_pseudobulk()
6678
#' }
6779
#' @export
68-
get_pseudobulk <- function(...) {
69-
get_summarized_experiment(..., type = "pseudobulk")
80+
get_pseudobulk <- function(data, ...) {
81+
raw_data <- collect(data)
82+
assert_that(
83+
inherits(raw_data, "tbl"),
84+
has_name(raw_data, c("cell_", "file_id", "sample_", "cell_type_harmonised"))
85+
)
86+
get_data_container(data, ..., repository = pseudobulk_url, grouping_column = "file_id")
7087
}
7188

72-
#' Gets a Summarized Experiment from curated metadata
89+
#' Gets data from curated metadata container
7390
#'
7491
#' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
7592
#' returns a [`SummarizedExperiment::SummarizedExperiment-class`] object
7693
#' corresponding to the samples in that data frame
77-
#' @inheritParams param_validation
78-
#' @param type A character vector of length one. Either "sce" for Single Cell Experiment,
79-
#' or "pseudobulk".
94+
#' @param data A data frame containing, at minimum, `cell_`, `file_id_db`, `file_id` column, which
95+
#' correspond to a single cell ID, file subdivision for internal use, and a single cell sample ID.
96+
#' They can be obtained from the [get_metadata()] function.
97+
#' @param assays A character vector whose elements must be either "counts"
98+
#' and/or "cpm" and/or "quantile_normalised", representing the corresponding assay(s) you want to request.
99+
#' By default only the count assay is downloaded. If you are interested in
100+
#' comparing a limited amount of genes, the "cpm" assay is more appropriate.
101+
#' @param cache_directory An optional character vector of length one. If
102+
#' provided, it should indicate a local file path where any remotely accessed
103+
#' files should be copied.
104+
#' @param repository A character vector of length one. If provided, it should be
105+
#' an HTTP URL pointing to the location where the single cell data is stored.
106+
#' @param grouping_column A character vector of metadata column for grouping. "file_id_db" for
107+
#' for Single Cell Experiment, "file_id" for pseudobulk.
108+
#' @param features An optional character vector of features (ie genes) to return
109+
#' the counts for. By default counts for all features will be returned.
80110
#' @importFrom dplyr pull filter as_tibble inner_join collect
81111
#' @importFrom tibble column_to_rownames
82112
#' @importFrom purrr reduce map map_int imap keep
@@ -88,23 +118,43 @@ get_pseudobulk <- function(...) {
88118
#' @importFrom cli cli_alert_success cli_alert_info
89119
#' @importFrom rlang .data
90120
#' @importFrom S4Vectors DataFrame
91-
get_summarized_experiment <- function(
121+
get_data_container <- function(
92122
data,
93123
assays = "counts",
94124
cache_directory = get_default_cache_dir(),
95-
repository = COUNTS_URL,
96-
features = NULL,
97-
type = "sce"
125+
repository,
126+
grouping_column,
127+
features = NULL
128+
98129
) {
99-
repository <- if (type == "sce") COUNTS_URL else pseudobulk_url
100-
validated <- param_validation(data, assays, cache_directory, repository, features)
130+
# Parameter validation
131+
assays %in% names(assay_map) |>
132+
all() |>
133+
assert_that(
134+
msg = 'assays must be a character vector containing "counts" and/or
135+
"cpm"'
136+
)
137+
assert_that(
138+
!anyDuplicated(assays),
139+
inherits(cache_directory, "character"),
140+
is.null(repository) || is.character(repository),
141+
is.null(features) || is.character(features)
142+
)
101143

102-
# Extract variables from validation
103-
raw_data <- validated$raw_data
104-
versioned_cache_directory <- validated$versioned_cache_directory
105-
subdirs <- validated$subdirs
144+
# Data parameter validation (last, because it's slower)
145+
## Evaluate the promise now so that we get a sensible error message
146+
force(data)
147+
## We have to convert to an in-memory table here, or some of the dplyr
148+
## operations will fail when passed a database connection
149+
cli_alert_info("Realising metadata.")
150+
raw_data <- collect(data)
151+
versioned_cache_directory <- cache_directory
152+
versioned_cache_directory |> dir.create(
153+
showWarnings = FALSE,
154+
recursive = TRUE
155+
)
106156

107-
file_id_col <- if (type == "sce") "file_id_db" else "file_id"
157+
subdirs <- assay_map[assays]
108158

109159
# The repository is optional. If not provided we load only from the cache
110160
if (!is.null(repository)) {
@@ -116,7 +166,7 @@ get_summarized_experiment <- function(
116166

117167
files_to_read <-
118168
raw_data |>
119-
pull(.data[[file_id_col]]) |>
169+
pull(.data[[grouping_column]]) |>
120170
unique() |>
121171
as.character() |>
122172
sync_assay_files(
@@ -135,16 +185,16 @@ get_summarized_experiment <- function(
135185
versioned_cache_directory,
136186
current_subdir
137187
)
138-
188+
139189
experiment_list <- raw_data |>
140-
dplyr::group_by(.data[[file_id_col]]) |>
190+
dplyr::group_by(.data[[grouping_column]]) |>
141191
dplyr::summarise(experiments = list(
142-
group_to_sme(
192+
group_to_data_container(
143193
dplyr::cur_group_id(),
144194
dplyr::cur_data_all(),
145195
dir_prefix,
146196
features,
147-
type
197+
grouping_column
148198
)
149199
)) |>
150200
dplyr::pull(experiments)
@@ -159,6 +209,7 @@ get_summarized_experiment <- function(
159209
cli_alert_info("Compiling Experiment.")
160210
# Combine all the assays
161211
experiment <- experiments[[1]]
212+
162213
SummarizedExperiment::assays(experiment) <- map(experiments, function(exp) {
163214
SummarizedExperiment::assays(exp)[[1]]
164215
})
@@ -178,6 +229,7 @@ get_summarized_experiment <- function(
178229
#' @param dir_prefix The path to the single cell experiment, minus the final
179230
#' segment
180231
#' @param features The list of genes/rows of interest
232+
#' @param grouping_column A character vector of metadata column for grouping
181233
#' @return A `SummarizedExperiment` object
182234
#' @importFrom dplyr mutate filter
183235
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
@@ -188,10 +240,9 @@ get_summarized_experiment <- function(
188240
#' @importFrom glue glue
189241
#' @importFrom stringr str_replace_all
190242
#' @noRd
191-
group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
243+
group_to_data_container <- function(i, df, dir_prefix, features, grouping_column) {
192244
# Set file name based on type
193-
file_id_col <- if (type == "sce") "file_id_db" else "file_id"
194-
experiment_path <- df[[file_id_col]] |>
245+
experiment_path <- df[[grouping_column]] |>
195246
head(1) |>
196247
file.path(
197248
dir_prefix,
@@ -213,7 +264,7 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
213264
# Fix for https://github.com/tidyverse/dplyr/issues/6746
214265
force(i)
215266

216-
if (type == "sce") {
267+
if (grouping_column == "file_id_db") {
217268
# Process specific to SCE
218269
cells <- colnames(experiment) |> intersect(df$cell_)
219270

@@ -247,7 +298,8 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
247298
`colnames<-`(new_coldata$cell_) |>
248299
`colData<-`(value = new_coldata)
249300
}
250-
else if (type == "pseudobulk") {
301+
else if (grouping_column == "file_id") {
302+
# Process specific to Pseudobulk
251303
# remove cell-level annotations
252304
cell_level_anno <- c("cell_", "cell_type", "confidence_class", "file_id_db",
253305
"cell_annotation_blueprint_singler",
@@ -257,7 +309,7 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
257309
"sample_id_db")
258310

259311
new_coldata <- df |>
260-
select(-dplyr::all_of(cell_level_anno)) |>
312+
select(-dplyr::all_of(intersect(names(df), cell_level_anno))) |>
261313
distinct() |>
262314
mutate(
263315
sample_identifier = glue("{sample_}___{cell_type_harmonised}"),
@@ -368,66 +420,3 @@ check_gene_overlap <- function(obj_list) {
368420
common_genes
369421
}
370422

371-
#' Validate parameters for Summarized Experiment analysis
372-
#' @param data A data frame containing, at minimum, `cell_`, `file_id_db`, `file_id` column, which
373-
#' correspond to a single cell ID, file subdivision for internal use, and a single cell sample ID.
374-
#' They can be obtained from the [get_metadata()] function.
375-
#' @param assays A character vector whose elements must be either "counts"
376-
#' and/or "cpm" and/or "quantile_normalised", representing the corresponding assay(s) you want to request.
377-
#' By default only the count assay is downloaded. If you are interested in
378-
#' comparing a limited amount of genes, the "cpm" assay is more appropriate.
379-
#' @param repository A character vector of length one. If provided, it should be
380-
#' an HTTP URL pointing to the location where the single cell data is stored.
381-
#' @param cache_directory An optional character vector of length one. If
382-
#' provided, it should indicate a local file path where any remotely accessed
383-
#' files should be copied.
384-
#' @param features An optional character vector of features (ie genes) to return
385-
#' the counts for. By default counts for all features will be returned.
386-
#' @return A list of elements:
387-
#' \itemize{
388-
#' \item{raw_data}{Data after being converted to an in-memory data frame }
389-
#' \item{versioned_cache_directory}{The path to the cache directory}
390-
#' \item{subdirs}{Vector of subdirectory names from the `assays` input}
391-
#' }
392-
#' @importFrom dplyr collect
393-
#' @importFrom assertthat assert_that has_name
394-
#' @importFrom cli cli_alert_info
395-
#' @importFrom rlang .data
396-
#' @keywords internal
397-
param_validation <- function(data,
398-
assays,
399-
cache_directory,
400-
repository,
401-
features
402-
) {
403-
# Parameter validation
404-
assays %in% names(assay_map) |>
405-
all() |>
406-
assert_that(msg = 'assays must be a character vector containing "counts" and/or
407-
"cpm" and/or "quantile_normalised"')
408-
assert_that(
409-
!anyDuplicated(assays),
410-
inherits(cache_directory, "character"),
411-
is.null(repository) || is.character(repository),
412-
is.null(features) || is.character(features)
413-
)
414-
415-
# Data parameter validation (last, because it's slower)
416-
## Evaluate the promise now so that we get a sensible error message
417-
force(data)
418-
## We have to convert to an in-memory table here, or some of the dplyr
419-
## operations will fail when passed a database connection
420-
cli_alert_info("Realising metadata.")
421-
raw_data <- collect(data)
422-
assert_that(inherits(raw_data, "tbl"),
423-
has_name(raw_data, c("file_id", "cell_", "file_id_db")))
424-
425-
versioned_cache_directory <- cache_directory
426-
versioned_cache_directory |> dir.create(showWarnings = FALSE,
427-
recursive = TRUE)
428-
429-
subdirs <- assay_map[assays]
430-
list(raw_data = raw_data, versioned_cache_directory = versioned_cache_directory,
431-
subdirs = subdirs)
432-
}
433-

man/get_SingleCellExperiment.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_pseudobulk.Rd

Lines changed: 11 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_seurat.Rd

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)