@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
3535# ' meta <- get_metadata() |> head(2)
3636# ' sce <- get_SingleCellExperiment(meta)
3737# '
38- # ' @importFrom dplyr pull filter as_tibble
39- # ' @importFrom tidySingleCellExperiment inner_join
38+ # ' @importFrom dplyr pull filter as_tibble inner_join collect
39+ # ' @importFrom tibble column_to_rownames
4040# ' @importFrom purrr reduce map map_int imap keep
4141# ' @importFrom BiocGenerics cbind
4242# ' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
4949# ' @importFrom cli cli_alert_success cli_alert_info
5050# ' @importFrom rlang .data
5151# ' @importFrom stats setNames
52+ # ' @importFrom S4Vectors DataFrame
5253# '
5354# ' @export
5455# '
@@ -64,7 +65,7 @@ get_SingleCellExperiment <- function(
6465 assays %in% names(assay_map ) | >
6566 all() | >
6667 assert_that(
67- msg = ' assays must be a character vector containing "counts" and/or
68+ msg = ' assays must be a character vector containing "counts" and/or
6869 "cpm"'
6970 )
7071 (! anyDuplicated(assays )) | > assert_that()
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
7879 # # We have to convert to an in-memory table here, or some of the dplyr
7980 # # operations will fail when passed a database connection
8081 cli_alert_info(" Realising metadata." )
81- raw_data <- as_tibble (data )
82+ raw_data <- collect (data )
8283 inherits(raw_data , " tbl" ) | > assert_that()
8384 has_name(raw_data , c(" .cell" , " file_id_db" )) | > assert_that()
8485
8586 cache_directory | > dir.create(showWarnings = FALSE )
8687
87- files_to_read <-
88- raw_data | >
89- pull(.data $ file_id_db ) | >
88+ cells_of_interest <- raw_data | >
89+ pull(.data $ .cell ) | >
9090 unique() | >
9191 as.character()
9292
@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
9595 # The repository is optional. If not provided we load only from the cache
9696 if (! is.null(repository )) {
9797 cli_alert_info(" Synchronising files" )
98+ files_to_read <-
99+ raw_data | >
100+ pull(.data $ file_id_db ) | >
101+ unique() | >
102+ as.character()
98103 parsed_repo <- parse_url(repository )
99104 (parsed_repo $ scheme %in% c(" http" , " https" )) | > assert_that()
100105 sync_assay_files(
@@ -104,67 +109,96 @@ get_SingleCellExperiment <- function(
104109 subdirs = subdirs
105110 )
106111 }
107- files_to_read <-
108- raw_data | >
109- pull(.data $ file_id_db ) | >
110- unique() | >
111- as.character()
112112
113- subdirs | >
113+ cli_alert_info(" Reading files." )
114+ sces <- subdirs | >
114115 imap(function (current_subdir , current_assay ) {
115- # Load each file
116- sces <-
117- files_to_read | >
118- map(function (.x ) {
119- sce_path <- file.path(
120- cache_directory ,
121- current_subdir ,
122- .x
123- )
116+ # Build up an SCE for each assay
117+ dir_prefix <- file.path(
118+ cache_directory ,
119+ current_subdir
120+ )
121+
122+ raw_data | >
123+ dplyr :: group_by(file_id_db ) | >
124+ # Load each file and attach metadata
125+ dplyr :: summarise(sces = list (group_to_sce(
126+ dplyr :: cur_group_id(),
127+ dplyr :: cur_data_all(),
128+ dir_prefix ,
129+ features
130+ ))) | >
131+ dplyr :: pull(sces ) | >
132+ # Combine each sce by column, since each sce has a different set
133+ # of cells
134+ do.call(cbind , args = _)
135+ })
124136
125- file.exists(sce_path ) | >
126- assert_that(
127- msg = " Your cache does not contain a file you
128- attempted to query. Please provide the repository
137+ cli_alert_info(" Compiling Single Cell Experiment." )
138+ # Combine all the assays
139+ sce <- sces [[1 ]]
140+ SummarizedExperiment :: assays(sce ) <- map(sces , function (sce ) {
141+ SummarizedExperiment :: assays(sce )[[1 ]]
142+ })
143+
144+ sce
145+ }
146+
147+ # ' Converts a data frame into a single SCE
148+ # '
149+ # ' @param prefix Prefix to be added to the column names
150+ # ' @param df The data frame to be converted
151+ # ' @param dir_prefix The path to the single cell experiment, minus the final segment
152+ # ' @param features The list of genes/rows of interest
153+ # '
154+ # ' @return A SingleCellExperiment object
155+ # ' @importFrom dplyr mutate
156+ # ' @importFrom HDF5Array loadHDF5SummarizedExperiment
157+ # ' @importFrom SummarizedExperiment colData<-
158+ # ' @importFrom tibble column_to_rownames
159+ # ' @importClassesFrom SingleCellExperiment SingleCellExperiment
160+ # '
161+ group_to_sce <- function (i , df , dir_prefix , features ) {
162+ sce_path <- df $ file_id_db | >
163+ head(1 ) | >
164+ file.path(
165+ dir_prefix ,
166+ suffix = _
167+ )
168+
169+ file.exists(sce_path ) | >
170+ assert_that(
171+ msg = " Your cache does not contain a file you
172+ attempted to query. Please provide the repository
129173 parameter so that files can be synchronised from the
130174 internet"
131- )
132-
133- sce <- loadHDF5SummarizedExperiment(sce_path )
175+ )
134176
135- if (! is.null(features )) {
136- # Optionally subset the genes
137- sce <- sce [
138- rownames(sce ) | > intersect(features )
139- ]
140- }
177+ sce <- loadHDF5SummarizedExperiment(sce_path )
178+ # The cells we select here are those that are both available in the SCE
179+ # object, and requested for this particular file
180+ cells <- colnames(sce ) | > intersect(df $ .cell )
181+ # We need to make the cell names globally unique, which we can guarantee
182+ # by adding a suffix that is derived from file_id_db, which is the grouping
183+ # variable
184+ new_cellnames <- paste0(cells , " _" , i )
185+ new_coldata <- df | >
186+ mutate(original_cell_id = .cell , .cell = new_cellnames ) | >
187+ column_to_rownames(" .cell" ) | >
188+ as(" DataFrame" )
141189
142- sce
143- }, .progress = list (name = " Reading files" )) | >
144- # Drop files with one cell, which causes the DFrame objects to
145- # combine must have the same column names
146- keep(~ ncol(. ) > 1 ) | >
147- # Combine each sce by column, since each sce has a different set
148- # of cells
149- do.call(cbind , args = _) | >
150- # We only need the assay, since we ultimately need to combine
151- # them We need to use :: here since we already have an assays
152- # argument
153- SummarizedExperiment :: assays() | >
154- setNames(current_assay )
190+ features | >
191+ is.null() | >
192+ {
193+ `if`
194+ }(
195+ sce [, cells ], {
196+ # Optionally subset the genes
197+ genes <- rownames(sce ) | > intersect(features )
198+ sce [genes , cells ]
155199 }) | >
156- aside(cli_alert_info(" Compiling Single Cell Experiment." )) | >
157- # Combine the assays into one list
158- reduce(c ) | >
159- SingleCellExperiment(assays = _) | >
160- aside(cli_alert_info(" Attaching metadata." )) | >
161- # Join back to metadata, which will become coldata annotations
162- inner_join(
163- # Needed because cell IDs are not unique outside the file_id or
164- # file_id_db
165- filter(raw_data , .data $ file_id_db %in% files_to_read ),
166- by = " .cell"
167- )
200+ `colnames<-`(new_cellnames ) | >
201+ `colData<-`(value = new_coldata )
168202}
169203
170204# ' Synchronises one or more remote assays with a local copy
@@ -181,17 +215,19 @@ get_SingleCellExperiment <- function(
181215# '
182216# ' @return A character vector of files that have been downloaded
183217# ' @importFrom purrr pmap_chr transpose
184- # ' @importFrom httr modify_url GET write_disk stop_for_status
218+ # ' @importFrom httr modify_url GET write_disk stop_for_status parse_url
185219# ' @importFrom dplyr tibble transmute filter full_join
186220# ' @importFrom glue glue
187221# ' @importFrom assertthat assert_that
188222# ' @importFrom cli cli_alert_success cli_alert_info cli_abort
189223# ' @noRd
190224# '
191- sync_assay_files <- function (url = httr :: parse_url(REMOTE_URL ),
192- cache_dir ,
193- subdirs ,
194- files ) {
225+ sync_assay_files <- function (
226+ url = parse_url(REMOTE_URL ),
227+ cache_dir ,
228+ subdirs ,
229+ files
230+ ) {
195231 # Find every combination of file name, sample id, and assay, since each
196232 # will be a separate file we need to download
197233 expand.grid(
0 commit comments