@@ -46,37 +46,67 @@ get_SingleCellExperiment <- function(...){
4646}
4747
4848# ' Gets a SingleCellExperiment from curated metadata
49- # '
50- # ' @inheritDotParams get_summarized_experiment
49+ # '
50+ # ' @param data A data frame containing, at minimum, `cell_`, `file_id_db` columns,
51+ # ' which correspond to a single cell ID, file subdivision for internal use.
52+ # ' They can be obtained from the [get_metadata()] function.
53+ # ' @inheritDotParams get_data_container
5154# ' @examples
5255# ' meta <- get_metadata() |> head(2)
5356# ' sce <- get_single_cell_experiment(meta)
5457# ' @export
55- get_single_cell_experiment <- function (... ){
56- get_summarized_experiment(... , type = " sce" )
58+ get_single_cell_experiment <- function (data , ... ){
59+ raw_data <- collect(data )
60+ assert_that(
61+ inherits(raw_data , " tbl" ),
62+ has_name(raw_data , c(" cell_" , " file_id_db" ))
63+ )
64+ get_data_container(data , ... , repository = COUNTS_URL , grouping_column = " file_id_db" )
5765}
5866
5967# ' Gets a Pseudobulk from curated metadata
6068# '
61- # ' @inheritDotParams get_summarized_experiment
69+ # ' @param data A data frame containing, at minimum, `cell_`, `file_id`,
70+ # ' `sample_`, `cell_type_harmonised` columns, which correspond to a single cell ID,
71+ # ' file subdivision for internal use, a singlel cell sample ID and harmonised cell type.
72+ # ' They can be obtained from the [get_metadata()] function.
73+ # ' @inheritDotParams get_data_container
6274# ' @examples
6375# ' \dontrun{
6476# ' meta <- get_metadata() |> filter(tissue_harmonised == "lung")
6577# ' pseudobulk <- meta |> get_pseudobulk()
6678# ' }
6779# ' @export
68- get_pseudobulk <- function (... ) {
69- get_summarized_experiment(... , type = " pseudobulk" )
80+ get_pseudobulk <- function (data , ... ) {
81+ raw_data <- collect(data )
82+ assert_that(
83+ inherits(raw_data , " tbl" ),
84+ has_name(raw_data , c(" cell_" , " file_id" , " sample_" , " cell_type_harmonised" ))
85+ )
86+ get_data_container(data , ... , repository = pseudobulk_url , grouping_column = " file_id" )
7087}
7188
72- # ' Gets a Summarized Experiment from curated metadata
89+ # ' Gets data from curated metadata container
7390# '
7491# ' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
7592# ' returns a [`SummarizedExperiment::SummarizedExperiment-class`] object
7693# ' corresponding to the samples in that data frame
77- # ' @inheritParams param_validation
78- # ' @param type A character vector of length one. Either "sce" for Single Cell Experiment,
79- # ' or "pseudobulk".
94+ # ' @param data A data frame containing, at minimum, `cell_`, `file_id_db`, `file_id` column, which
95+ # ' correspond to a single cell ID, file subdivision for internal use, and a single cell sample ID.
96+ # ' They can be obtained from the [get_metadata()] function.
97+ # ' @param assays A character vector whose elements must be either "counts"
98+ # ' and/or "cpm" and/or "quantile_normalised", representing the corresponding assay(s) you want to request.
99+ # ' By default only the count assay is downloaded. If you are interested in
100+ # ' comparing a limited amount of genes, the "cpm" assay is more appropriate.
101+ # ' @param cache_directory An optional character vector of length one. If
102+ # ' provided, it should indicate a local file path where any remotely accessed
103+ # ' files should be copied.
104+ # ' @param repository A character vector of length one. If provided, it should be
105+ # ' an HTTP URL pointing to the location where the single cell data is stored.
106+ # ' @param grouping_column A character vector of metadata column for grouping. "file_id_db" for
107+ # ' for Single Cell Experiment, "file_id" for pseudobulk.
108+ # ' @param features An optional character vector of features (ie genes) to return
109+ # ' the counts for. By default counts for all features will be returned.
80110# ' @importFrom dplyr pull filter as_tibble inner_join collect
81111# ' @importFrom tibble column_to_rownames
82112# ' @importFrom purrr reduce map map_int imap keep
@@ -88,23 +118,43 @@ get_pseudobulk <- function(...) {
88118# ' @importFrom cli cli_alert_success cli_alert_info
89119# ' @importFrom rlang .data
90120# ' @importFrom S4Vectors DataFrame
91- get_summarized_experiment <- function (
121+ get_data_container <- function (
92122 data ,
93123 assays = " counts" ,
94124 cache_directory = get_default_cache_dir(),
95- repository = COUNTS_URL ,
96- features = NULL ,
97- type = " sce"
125+ repository ,
126+ grouping_column ,
127+ features = NULL
128+
98129) {
99- repository <- if (type == " sce" ) COUNTS_URL else pseudobulk_url
100- validated <- param_validation(data , assays , cache_directory , repository , features )
130+ # Parameter validation
131+ assays %in% names(assay_map ) | >
132+ all() | >
133+ assert_that(
134+ msg = ' assays must be a character vector containing "counts" and/or
135+ "cpm"'
136+ )
137+ assert_that(
138+ ! anyDuplicated(assays ),
139+ inherits(cache_directory , " character" ),
140+ is.null(repository ) || is.character(repository ),
141+ is.null(features ) || is.character(features )
142+ )
101143
102- # Extract variables from validation
103- raw_data <- validated $ raw_data
104- versioned_cache_directory <- validated $ versioned_cache_directory
105- subdirs <- validated $ subdirs
144+ # Data parameter validation (last, because it's slower)
145+ # # Evaluate the promise now so that we get a sensible error message
146+ force(data )
147+ # # We have to convert to an in-memory table here, or some of the dplyr
148+ # # operations will fail when passed a database connection
149+ cli_alert_info(" Realising metadata." )
150+ raw_data <- collect(data )
151+ versioned_cache_directory <- cache_directory
152+ versioned_cache_directory | > dir.create(
153+ showWarnings = FALSE ,
154+ recursive = TRUE
155+ )
106156
107- file_id_col <- if ( type == " sce " ) " file_id_db " else " file_id "
157+ subdirs <- assay_map [ assays ]
108158
109159 # The repository is optional. If not provided we load only from the cache
110160 if (! is.null(repository )) {
@@ -116,7 +166,7 @@ get_summarized_experiment <- function(
116166
117167 files_to_read <-
118168 raw_data | >
119- pull(.data [[file_id_col ]]) | >
169+ pull(.data [[grouping_column ]]) | >
120170 unique() | >
121171 as.character() | >
122172 sync_assay_files(
@@ -135,16 +185,16 @@ get_summarized_experiment <- function(
135185 versioned_cache_directory ,
136186 current_subdir
137187 )
138-
188+
139189 experiment_list <- raw_data | >
140- dplyr :: group_by(.data [[file_id_col ]]) | >
190+ dplyr :: group_by(.data [[grouping_column ]]) | >
141191 dplyr :: summarise(experiments = list (
142- group_to_sme (
192+ group_to_data_container (
143193 dplyr :: cur_group_id(),
144194 dplyr :: cur_data_all(),
145195 dir_prefix ,
146196 features ,
147- type
197+ grouping_column
148198 )
149199 )) | >
150200 dplyr :: pull(experiments )
@@ -159,6 +209,7 @@ get_summarized_experiment <- function(
159209 cli_alert_info(" Compiling Experiment." )
160210 # Combine all the assays
161211 experiment <- experiments [[1 ]]
212+
162213 SummarizedExperiment :: assays(experiment ) <- map(experiments , function (exp ) {
163214 SummarizedExperiment :: assays(exp )[[1 ]]
164215 })
@@ -178,6 +229,7 @@ get_summarized_experiment <- function(
178229# ' @param dir_prefix The path to the single cell experiment, minus the final
179230# ' segment
180231# ' @param features The list of genes/rows of interest
232+ # ' @param grouping_column A character vector of metadata column for grouping
181233# ' @return A `SummarizedExperiment` object
182234# ' @importFrom dplyr mutate filter
183235# ' @importFrom HDF5Array loadHDF5SummarizedExperiment
@@ -188,10 +240,9 @@ get_summarized_experiment <- function(
188240# ' @importFrom glue glue
189241# ' @importFrom stringr str_replace_all
190242# ' @noRd
191- group_to_sme <- function (i , df , dir_prefix , features , type = " sce " ) {
243+ group_to_data_container <- function (i , df , dir_prefix , features , grouping_column ) {
192244 # Set file name based on type
193- file_id_col <- if (type == " sce" ) " file_id_db" else " file_id"
194- experiment_path <- df [[file_id_col ]] | >
245+ experiment_path <- df [[grouping_column ]] | >
195246 head(1 ) | >
196247 file.path(
197248 dir_prefix ,
@@ -213,7 +264,7 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
213264 # Fix for https://github.com/tidyverse/dplyr/issues/6746
214265 force(i )
215266
216- if (type == " sce " ) {
267+ if (grouping_column == " file_id_db " ) {
217268 # Process specific to SCE
218269 cells <- colnames(experiment ) | > intersect(df $ cell_ )
219270
@@ -247,7 +298,8 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
247298 `colnames<-`(new_coldata $ cell_ ) | >
248299 `colData<-`(value = new_coldata )
249300 }
250- else if (type == " pseudobulk" ) {
301+ else if (grouping_column == " file_id" ) {
302+ # Process specific to Pseudobulk
251303 # remove cell-level annotations
252304 cell_level_anno <- c(" cell_" , " cell_type" , " confidence_class" , " file_id_db" ,
253305 " cell_annotation_blueprint_singler" ,
@@ -257,7 +309,7 @@ group_to_sme <- function(i, df, dir_prefix, features, type = "sce") {
257309 " sample_id_db" )
258310
259311 new_coldata <- df | >
260- select(- dplyr :: all_of(cell_level_anno )) | >
312+ select(- dplyr :: all_of(intersect(names( df ), cell_level_anno ) )) | >
261313 distinct() | >
262314 mutate(
263315 sample_identifier = glue(" {sample_}___{cell_type_harmonised}" ),
@@ -368,66 +420,3 @@ check_gene_overlap <- function(obj_list) {
368420 common_genes
369421}
370422
371- # ' Validate parameters for Summarized Experiment analysis
372- # ' @param data A data frame containing, at minimum, `cell_`, `file_id_db`, `file_id` column, which
373- # ' correspond to a single cell ID, file subdivision for internal use, and a single cell sample ID.
374- # ' They can be obtained from the [get_metadata()] function.
375- # ' @param assays A character vector whose elements must be either "counts"
376- # ' and/or "cpm" and/or "quantile_normalised", representing the corresponding assay(s) you want to request.
377- # ' By default only the count assay is downloaded. If you are interested in
378- # ' comparing a limited amount of genes, the "cpm" assay is more appropriate.
379- # ' @param repository A character vector of length one. If provided, it should be
380- # ' an HTTP URL pointing to the location where the single cell data is stored.
381- # ' @param cache_directory An optional character vector of length one. If
382- # ' provided, it should indicate a local file path where any remotely accessed
383- # ' files should be copied.
384- # ' @param features An optional character vector of features (ie genes) to return
385- # ' the counts for. By default counts for all features will be returned.
386- # ' @return A list of elements:
387- # ' \itemize{
388- # ' \item{raw_data}{Data after being converted to an in-memory data frame }
389- # ' \item{versioned_cache_directory}{The path to the cache directory}
390- # ' \item{subdirs}{Vector of subdirectory names from the `assays` input}
391- # ' }
392- # ' @importFrom dplyr collect
393- # ' @importFrom assertthat assert_that has_name
394- # ' @importFrom cli cli_alert_info
395- # ' @importFrom rlang .data
396- # ' @keywords internal
397- param_validation <- function (data ,
398- assays ,
399- cache_directory ,
400- repository ,
401- features
402- ) {
403- # Parameter validation
404- assays %in% names(assay_map ) | >
405- all() | >
406- assert_that(msg = ' assays must be a character vector containing "counts" and/or
407- "cpm" and/or "quantile_normalised"' )
408- assert_that(
409- ! anyDuplicated(assays ),
410- inherits(cache_directory , " character" ),
411- is.null(repository ) || is.character(repository ),
412- is.null(features ) || is.character(features )
413- )
414-
415- # Data parameter validation (last, because it's slower)
416- # # Evaluate the promise now so that we get a sensible error message
417- force(data )
418- # # We have to convert to an in-memory table here, or some of the dplyr
419- # # operations will fail when passed a database connection
420- cli_alert_info(" Realising metadata." )
421- raw_data <- collect(data )
422- assert_that(inherits(raw_data , " tbl" ),
423- has_name(raw_data , c(" file_id" , " cell_" , " file_id_db" )))
424-
425- versioned_cache_directory <- cache_directory
426- versioned_cache_directory | > dir.create(showWarnings = FALSE ,
427- recursive = TRUE )
428-
429- subdirs <- assay_map [assays ]
430- list (raw_data = raw_data , versioned_cache_directory = versioned_cache_directory ,
431- subdirs = subdirs )
432- }
433-
0 commit comments