@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
3535# ' meta <- get_metadata() |> head(2)
3636# ' sce <- get_SingleCellExperiment(meta)
3737# '
38- # ' @importFrom dplyr pull filter as_tibble
39- # ' @importFrom tidySingleCellExperiment inner_join
38+ # ' @importFrom dplyr pull filter as_tibble inner_join collect
39+ # ' @importFrom tibble column_to_rownames
4040# ' @importFrom purrr reduce map map_int imap keep
4141# ' @importFrom BiocGenerics cbind
4242# ' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
4949# ' @importFrom cli cli_alert_success cli_alert_info
5050# ' @importFrom rlang .data
5151# ' @importFrom stats setNames
52+ # ' @importFrom S4Vectors DataFrame
5253# '
5354# ' @export
5455# '
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
7879 # # We have to convert to an in-memory table here, or some of the dplyr
7980 # # operations will fail when passed a database connection
8081 cli_alert_info(" Realising metadata." )
81- raw_data <- as_tibble (data )
82+ raw_data <- collect (data )
8283 inherits(raw_data , " tbl" ) | > assert_that()
8384 has_name(raw_data , c(" .cell" , " file_id_db" )) | > assert_that()
8485
8586 cache_directory | > dir.create(showWarnings = FALSE )
8687
87- files_to_read <-
88- raw_data | >
89- pull(.data $ file_id_db ) | >
88+ cells_of_interest <- raw_data | >
89+ pull(.data $ .cell ) | >
9090 unique() | >
9191 as.character()
9292
@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
9595 # The repository is optional. If not provided we load only from the cache
9696 if (! is.null(repository )) {
9797 cli_alert_info(" Synchronising files" )
98+ files_to_read <-
99+ raw_data | >
100+ pull(.data $ file_id_db ) | >
101+ unique() | >
102+ as.character()
98103 parsed_repo <- parse_url(repository )
99104 (parsed_repo $ scheme %in% c(" http" , " https" )) | > assert_that()
100105 sync_assay_files(
@@ -104,46 +109,28 @@ get_SingleCellExperiment <- function(
104109 subdirs = subdirs
105110 )
106111 }
107- files_to_read <-
108- raw_data | >
109- pull(.data $ file_id_db ) | >
110- unique() | >
111- as.character()
112112
113- subdirs | >
113+ cli_alert_info(" Reading files." )
114+ sce <- subdirs | >
114115 imap(function (current_subdir , current_assay ) {
116+ dir_prefix = file.path(
117+ cache_directory ,
118+ current_subdir
119+ )
120+
115121 # Load each file
116- sces <-
117- files_to_read | >
118- map(function (.x ) {
119- sce_path <- file.path(
120- cache_directory ,
121- current_subdir ,
122- .x
123- )
124-
125- file.exists(sce_path ) | >
126- assert_that(
127- msg = " Your cache does not contain a file you
128- attempted to query. Please provide the repository
129- parameter so that files can be synchronised from the
130- internet"
131- )
132-
133- sce <- loadHDF5SummarizedExperiment(sce_path )
134-
135- if (! is.null(features )) {
136- # Optionally subset the genes
137- sce <- sce [
138- rownames(sce ) | > intersect(features )
139- ]
140- }
141-
142- sce
143- }, .progress = list (name = " Reading files" )) | >
122+ sces <- raw_data | >
123+ dplyr :: group_by(file_id_db ) | >
124+ dplyr :: summarise(sces = list (group_to_sce(
125+ dplyr :: cur_group_id(),
126+ dplyr :: cur_data_all(),
127+ dir_prefix ,
128+ features
129+ ))) | >
130+ dplyr :: pull(sces ) | >
144131 # Drop files with one cell, which causes the DFrame objects to
145132 # combine must have the same column names
146- keep(~ ncol(. ) > 1 ) | >
133+ # keep(~ ncol(.) > 1) |>
147134 # Combine each sce by column, since each sce has a different set
148135 # of cells
149136 do.call(cbind , args = _) | >
@@ -156,15 +143,79 @@ get_SingleCellExperiment <- function(
156143 aside(cli_alert_info(" Compiling Single Cell Experiment." )) | >
157144 # Combine the assays into one list
158145 reduce(c ) | >
159- SingleCellExperiment(assays = _) | >
160- aside(cli_alert_info(" Attaching metadata." )) | >
161- # Join back to metadata, which will become coldata annotations
162- inner_join(
146+ SingleCellExperiment(assays = _)
147+
148+ cli_alert_info(" Attaching metadata." )
149+
150+ colData(sce ) <- raw_data | >
163151 # Needed because cell IDs are not unique outside the file_id or
164152 # file_id_db
165- filter(raw_data , .data $ file_id_db %in% files_to_read ),
166- by = " .cell"
153+ filter(.data $ file_id_db %in% files_to_read ) | >
154+ inner_join(
155+ colData(sce ) | > as_tibble(rownames = " .cell" ),
156+ by = " .cell"
157+ ) | >
158+ column_to_rownames(" .cell" ) | >
159+ as(" DataFrame" )
160+
161+ sce
162+ }
163+
164+ # ' Converts a data frame into a single SCE
165+ # '
166+ # ' @param prefix Prefix to be added to the column names
167+ # ' @param df The data frame to be converted
168+ # ' @param dir_prefix The path to the single cell experiment, minus the final segment
169+ # ' @param features The list of genes/rows of interest
170+ # '
171+ # ' @return A SingleCellExperiment object
172+ # ' @importFrom dplyr mutate
173+ # ' @importFrom HDF5Array loadHDF5SummarizedExperiment
174+ # ' @importFrom SummarizedExperiment colData<-
175+ # ' @importFrom tibble column_to_rownames
176+ # ' @importClassesFrom SingleCellExperiment SingleCellExperiment
177+ # '
178+ group_to_sce = function (i , df , dir_prefix , features ){
179+ sce_path <- df $ file_id_db | >
180+ head(1 ) | >
181+ file.path(
182+ dir_prefix ,
183+ suffix = _
167184 )
185+
186+ file.exists(sce_path ) | >
187+ assert_that(
188+ msg = " Your cache does not contain a file you
189+ attempted to query. Please provide the repository
190+ parameter so that files can be synchronised from the
191+ internet"
192+ )
193+
194+ sce <- loadHDF5SummarizedExperiment(sce_path )
195+ # The cells we select here are those that are both available in the SCE
196+ # object, and requested for this particular file
197+ cells = colnames(sce ) | > intersect(df $ .cell )
198+ # We need to make the cell names globally unique, which we can guarantee
199+ # by adding a suffix that is derived from file_id_db, which is the grouping
200+ # variable
201+ new_cellnames = paste0(cells , " _" , i )
202+ new_coldata = df | >
203+ mutate(original_cell_id = .cell , .cell = new_cellnames ) | >
204+ column_to_rownames(" .cell" ) | >
205+ as(" DataFrame" )
206+
207+ features | >
208+ is.null() | >
209+ {`if` } (
210+ sce [, cells ],
211+ {
212+ # Optionally subset the genes
213+ genes = rownames(sce ) | > intersect(features )
214+ sce [genes , cells ]
215+ }
216+ ) | >
217+ `colnames<-`(new_cellnames ) | >
218+ `colData<-`(new_coldata )
168219}
169220
170221# ' Synchronises one or more remote assays with a local copy
@@ -181,15 +232,15 @@ get_SingleCellExperiment <- function(
181232# '
182233# ' @return A character vector of files that have been downloaded
183234# ' @importFrom purrr pmap_chr transpose
184- # ' @importFrom httr modify_url GET write_disk stop_for_status
235+ # ' @importFrom httr modify_url GET write_disk stop_for_status parse_url
185236# ' @importFrom dplyr tibble transmute filter full_join
186237# ' @importFrom glue glue
187238# ' @importFrom assertthat assert_that
188239# ' @importFrom cli cli_alert_success cli_alert_info cli_abort
189240# ' @noRd
190241# '
191242sync_assay_files <- function (
192- url = httr :: parse_url(REMOTE_URL ),
243+ url = parse_url(REMOTE_URL ),
193244 cache_dir ,
194245 subdirs ,
195246 files
0 commit comments