stemangiola
diff --git a/‎DESCRIPTION‎
Lines changed: 0 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 7 additions & 1 deletion b/‎NAMESPACE‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎R/query.R‎
Lines changed: 101 additions & 65 deletions b/‎R/query.R‎
Lines changed: 101 additions & 65 deletions
@@ -18,7 +18,6 @@ Imports:
     dplyr,
     SummarizedExperiment,
     SingleCellExperiment,
-    tidySingleCellExperiment,
     purrr (>= 1.0.0),
     BiocGenerics,
     glue,
 
@@ -4,26 +4,32 @@ S3method(as.sparse,DelayedMatrix)
 export(get_SingleCellExperiment)
 export(get_metadata)
 export(get_seurat)
+importClassesFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(BiocGenerics,cbind)
 importFrom(DBI,dbConnect)
 importFrom(HDF5Array,HDF5RealizationSink)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(RSQLite,SQLITE_RO)
 importFrom(RSQLite,SQLite)
+importFrom(S4Vectors,DataFrame)
 importFrom(Seurat,as.Seurat)
 importFrom(SeuratObject,as.sparse)
 importFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(SingleCellExperiment,simplifyToSCE)
 importFrom(SummarizedExperiment,"assayNames<-")
+importFrom(SummarizedExperiment,"colData<-")
 importFrom(SummarizedExperiment,colData)
 importFrom(assertthat,assert_that)
 importFrom(assertthat,has_name)
 importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(dplyr,as_tibble)
+importFrom(dplyr,collect)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
+importFrom(dplyr,inner_join)
+importFrom(dplyr,mutate)
 importFrom(dplyr,pull)
 importFrom(dplyr,tbl)
 importFrom(dplyr,tibble)
@@ -46,4 +52,4 @@ importFrom(purrr,transpose)
 importFrom(rappdirs,user_cache_dir)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
-importFrom(tidySingleCellExperiment,inner_join)
+importFrom(tibble,column_to_rownames)
@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
 #' meta <- get_metadata() |> head(2)
 #' sce <- get_SingleCellExperiment(meta)
 #'
-#' @importFrom dplyr pull filter as_tibble
-#' @importFrom tidySingleCellExperiment inner_join
+#' @importFrom dplyr pull filter as_tibble inner_join collect
+#' @importFrom tibble column_to_rownames
 #' @importFrom purrr reduce map map_int imap keep
 #' @importFrom BiocGenerics cbind
 #' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
 #' @importFrom cli cli_alert_success cli_alert_info
 #' @importFrom rlang .data
 #' @importFrom stats setNames
+#' @importFrom S4Vectors DataFrame
 #'
 #' @export
 #'
@@ -64,7 +65,7 @@ get_SingleCellExperiment <- function(
     assays %in% names(assay_map) |>
         all() |>
         assert_that(
-          msg = 'assays must be a character vector containing "counts" and/or
+            msg = 'assays must be a character vector containing "counts" and/or
           "cpm"'
         )
     (!anyDuplicated(assays)) |> assert_that()
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
     ## We have to convert to an in-memory table here, or some of the dplyr
     ## operations will fail when passed a database connection
     cli_alert_info("Realising metadata.")
-    raw_data <- as_tibble(data)
+    raw_data <- collect(data)
     inherits(raw_data, "tbl") |> assert_that()
     has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
 
     cache_directory |> dir.create(showWarnings = FALSE)
 
-    files_to_read <-
-        raw_data |>
-        pull(.data$file_id_db) |>
+    cells_of_interest <- raw_data |>
+        pull(.data$.cell) |>
         unique() |>
         as.character()
 
@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
     # The repository is optional. If not provided we load only from the cache
     if (!is.null(repository)) {
         cli_alert_info("Synchronising files")
+        files_to_read <-
+            raw_data |>
+            pull(.data$file_id_db) |>
+            unique() |>
+            as.character()
         parsed_repo <- parse_url(repository)
         (parsed_repo$scheme %in% c("http", "https")) |> assert_that()
         sync_assay_files(
@@ -104,67 +109,96 @@ get_SingleCellExperiment <- function(
             subdirs = subdirs
         )
     }
-    files_to_read <-
-        raw_data |>
-        pull(.data$file_id_db) |>
-        unique() |>
-        as.character()
 
-    subdirs |>
+    cli_alert_info("Reading files.")
+    sces <- subdirs |>
         imap(function(current_subdir, current_assay) {
-            # Load each file
-            sces <-
-                files_to_read |>
-                map(function(.x) {
-                    sce_path <- file.path(
-                        cache_directory,
-                        current_subdir,
-                        .x
-                    )
+            # Build up an SCE for each assay
+            dir_prefix <- file.path(
+                cache_directory,
+                current_subdir
+            )
+
+            raw_data |>
+                dplyr::group_by(file_id_db) |>
+                # Load each file and attach metadata
+                dplyr::summarise(sces = list(group_to_sce(
+                    dplyr::cur_group_id(),
+                    dplyr::cur_data_all(),
+                    dir_prefix,
+                    features
+                ))) |>
+                dplyr::pull(sces) |>
+                # Combine each sce by column, since each sce has a different set
+                # of cells
+                do.call(cbind, args = _)
+        })
 
-                    file.exists(sce_path) |>
-                        assert_that(
-                            msg = "Your cache does not contain a file you
-                            attempted to query. Please provide the repository 
+    cli_alert_info("Compiling Single Cell Experiment.")
+    # Combine all the assays
+    sce <- sces[[1]]
+    SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
+        SummarizedExperiment::assays(sce)[[1]]
+    })
+
+    sce
+}
+
+#' Loads the SCE for one file and attaches a data frame as its colData
+#'
+#' @param i Group index (`dplyr::cur_group_id()`), appended as a cell name suffix
+#' @param df The data frame to be converted
+#' @param dir_prefix The path to the single cell experiment, minus the final segment
+#' @param features The list of genes/rows of interest
+#'
+#' @return A SingleCellExperiment object
+#' @importFrom dplyr mutate
+#' @importFrom HDF5Array loadHDF5SummarizedExperiment
+#' @importFrom SummarizedExperiment colData<-
+#' @importFrom tibble column_to_rownames
+#' @importClassesFrom SingleCellExperiment SingleCellExperiment
+#'
+group_to_sce <- function(i, df, dir_prefix, features) {
+    sce_path <- df$file_id_db |>
+        head(1) |>
+        file.path(
+            dir_prefix,
+            suffix = _
+        )
+
+    file.exists(sce_path) |>
+        assert_that(
+            msg = "Your cache does not contain a file you
+                            attempted to query. Please provide the repository
                             parameter so that files can be synchronised from the
                             internet"
-                        )
-
-                    sce <- loadHDF5SummarizedExperiment(sce_path)
+        )
 
-                    if (!is.null(features)) {
-                        # Optionally subset the genes
-                        sce <- sce[
-                            rownames(sce) |> intersect(features)
-                        ]
-                    }
+    sce <- loadHDF5SummarizedExperiment(sce_path)
+    # Cells both in the SCE and requested for this file, in SCE column order.
+    # NOTE(review): mutate() below assumes df rows match these 1:1 -- confirm
+    cells <- colnames(sce) |> intersect(df$.cell)
+    # We need to make the cell names globally unique, which we can guarantee
+    # by adding a suffix that is derived from file_id_db, which is the grouping
+    # variable
+    new_cellnames <- paste0(cells, "_", i)
+    new_coldata <- df |>
+        mutate(original_cell_id = .cell, .cell = new_cellnames) |>
+        column_to_rownames(".cell") |>
+        as("DataFrame")
 
-                    sce
-                }, .progress = list(name = "Reading files")) |>
-                # Drop files with one cell, which causes the DFrame objects to
-                # combine must have the same column names
-                keep(~ ncol(.) > 1) |>
-                # Combine each sce by column, since each sce has a different set
-                # of cells
-                do.call(cbind, args = _) |>
-                # We only need the assay, since we ultimately need to combine
-                # them We need to use :: here since we already have an assays
-                # argument
-                SummarizedExperiment::assays() |>
-                setNames(current_assay)
+    features |>
+        is.null() |>
+        {
+            `if`
+        }(
+            sce[, cells], {
+                # Optionally subset the genes
+                genes <- rownames(sce) |> intersect(features)
+                sce[genes, cells]
         }) |>
-        aside(cli_alert_info("Compiling Single Cell Experiment.")) |>
-        # Combine the assays into one list
-        reduce(c) |>
-        SingleCellExperiment(assays = _) |>
-        aside(cli_alert_info("Attaching metadata.")) |>
-        # Join back to metadata, which will become coldata annotations
-        inner_join(
-            # Needed because cell IDs are not unique outside the file_id or
-            # file_id_db
-            filter(raw_data, .data$file_id_db %in% files_to_read),
-            by = ".cell"
-        )
+        `colnames<-`(new_cellnames) |>
+        `colData<-`(value = new_coldata)
 }
 
 #' Synchronises one or more remote assays with a local copy
@@ -181,17 +215,19 @@ get_SingleCellExperiment <- function(
 #'
 #' @return A character vector of files that have been downloaded
 #' @importFrom purrr pmap_chr transpose
-#' @importFrom httr modify_url GET write_disk stop_for_status
+#' @importFrom httr modify_url GET write_disk stop_for_status parse_url
 #' @importFrom dplyr tibble transmute filter full_join
 #' @importFrom glue glue
 #' @importFrom assertthat assert_that
 #' @importFrom cli cli_alert_success cli_alert_info cli_abort
 #' @noRd
 #'
-sync_assay_files <- function(url = httr::parse_url(REMOTE_URL),
-                             cache_dir,
-                             subdirs,
-                             files) {
+sync_assay_files <- function(
+    url = parse_url(REMOTE_URL),
+    cache_dir,
+    subdirs,
+    files
+) {
     # Find every combination of file name, sample id, and assay, since each
     # will be a separate file we need to download
     expand.grid(