stemangiola
diff --git a/‎DESCRIPTION‎
Lines changed: 0 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 7 additions & 1 deletion b/‎NAMESPACE‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎R/query.R‎
Lines changed: 100 additions & 49 deletions b/‎R/query.R‎
Lines changed: 100 additions & 49 deletions
@@ -18,7 +18,6 @@ Imports:
     dplyr,
     SummarizedExperiment,
     SingleCellExperiment,
-    tidySingleCellExperiment,
     purrr (>= 1.0.0),
     BiocGenerics,
     glue,
 
@@ -4,26 +4,32 @@ S3method(as.sparse,DelayedMatrix)
 export(get_SingleCellExperiment)
 export(get_metadata)
 export(get_seurat)
+importClassesFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(BiocGenerics,cbind)
 importFrom(DBI,dbConnect)
 importFrom(HDF5Array,HDF5RealizationSink)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(RSQLite,SQLITE_RO)
 importFrom(RSQLite,SQLite)
+importFrom(S4Vectors,DataFrame)
 importFrom(Seurat,as.Seurat)
 importFrom(SeuratObject,as.sparse)
 importFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(SingleCellExperiment,simplifyToSCE)
 importFrom(SummarizedExperiment,"assayNames<-")
+importFrom(SummarizedExperiment,"colData<-")
 importFrom(SummarizedExperiment,colData)
 importFrom(assertthat,assert_that)
 importFrom(assertthat,has_name)
 importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(dplyr,as_tibble)
+importFrom(dplyr,collect)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
+importFrom(dplyr,inner_join)
+importFrom(dplyr,mutate)
 importFrom(dplyr,pull)
 importFrom(dplyr,tbl)
 importFrom(dplyr,tibble)
@@ -46,4 +52,4 @@ importFrom(purrr,transpose)
 importFrom(rappdirs,user_cache_dir)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
-importFrom(tidySingleCellExperiment,inner_join)
+importFrom(tibble,column_to_rownames)
@@ -35,8 +35,8 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
 #' meta <- get_metadata() |> head(2)
 #' sce <- get_SingleCellExperiment(meta)
 #'
-#' @importFrom dplyr pull filter as_tibble
-#' @importFrom tidySingleCellExperiment inner_join
+#' @importFrom dplyr pull filter as_tibble inner_join collect
+#' @importFrom tibble column_to_rownames
 #' @importFrom purrr reduce map map_int imap keep
 #' @importFrom BiocGenerics cbind
 #' @importFrom glue glue
@@ -49,6 +49,7 @@ REMOTE_URL <- "https://harmonised-human-atlas.s3.amazonaws.com/"
 #' @importFrom cli cli_alert_success cli_alert_info
 #' @importFrom rlang .data
 #' @importFrom stats setNames
+#' @importFrom S4Vectors DataFrame
 #'
 #' @export
 #'
@@ -78,15 +79,14 @@ get_SingleCellExperiment <- function(
     ## We have to convert to an in-memory table here, or some of the dplyr
     ## operations will fail when passed a database connection
     cli_alert_info("Realising metadata.")
-    raw_data <- as_tibble(data)
+    raw_data <- collect(data)
     inherits(raw_data, "tbl") |> assert_that()
     has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
 
     cache_directory |> dir.create(showWarnings = FALSE)
 
-    files_to_read <-
-        raw_data |>
-        pull(.data$file_id_db) |>
+    cells_of_interest <- raw_data |>
+        pull(.data$.cell) |>
         unique() |>
         as.character()
 
@@ -95,6 +95,11 @@ get_SingleCellExperiment <- function(
     # The repository is optional. If not provided we load only from the cache
     if (!is.null(repository)) {
         cli_alert_info("Synchronising files")
+        files_to_read <-
+            raw_data |>
+            pull(.data$file_id_db) |>
+            unique() |>
+            as.character()
         parsed_repo <- parse_url(repository)
         (parsed_repo$scheme %in% c("http", "https")) |> assert_that()
         sync_assay_files(
@@ -104,46 +109,28 @@ get_SingleCellExperiment <- function(
             subdirs = subdirs
         )
     }
-    files_to_read <-
-        raw_data |>
-        pull(.data$file_id_db) |>
-        unique() |>
-        as.character()
 
-    subdirs |>
+    cli_alert_info("Reading files.")
+    sce <- subdirs |>
         imap(function(current_subdir, current_assay) {
+            dir_prefix = file.path(
+                cache_directory,
+                current_subdir
+            )
+            
             # Load each file
-            sces <-
-                files_to_read |>
-                map(function(.x) {
-                    sce_path <- file.path(
-                        cache_directory,
-                        current_subdir,
-                        .x
-                    )
-
-                    file.exists(sce_path) |>
-                        assert_that(
-                            msg = "Your cache does not contain a file you
-                            attempted to query. Please provide the repository 
-                            parameter so that files can be synchronised from the
-                            internet"
-                        )
-
-                    sce <- loadHDF5SummarizedExperiment(sce_path)
-
-                    if (!is.null(features)) {
-                        # Optionally subset the genes
-                        sce <- sce[
-                            rownames(sce) |> intersect(features)
-                        ]
-                    }
-
-                    sce
-                }, .progress = list(name = "Reading files")) |>
+            sces <- raw_data |>
+                dplyr::group_by(file_id_db) |>
+                dplyr::summarise(sces = list(group_to_sce(
+                    dplyr::cur_group_id(),
+                    dplyr::cur_data_all(),
+                    dir_prefix,
+                    features
+                ))) |>
+                dplyr::pull(sces) |>
                 # Drop files with one cell, which causes the DFrame objects to
                 # combine must have the same column names
-                keep(~ ncol(.) > 1) |>
+                # keep(~ ncol(.) > 1) |>
                 # Combine each sce by column, since each sce has a different set
                 # of cells
                 do.call(cbind, args = _) |>
@@ -156,15 +143,79 @@ get_SingleCellExperiment <- function(
         aside(cli_alert_info("Compiling Single Cell Experiment.")) |>
         # Combine the assays into one list
         reduce(c) |>
-        SingleCellExperiment(assays = _) |>
-        aside(cli_alert_info("Attaching metadata.")) |>
-        # Join back to metadata, which will become coldata annotations
-        inner_join(
+        SingleCellExperiment(assays = _)
+    
+        cli_alert_info("Attaching metadata.")
+
+        colData(sce) <- raw_data |>
             # Needed because cell IDs are not unique outside the file_id or
             # file_id_db
-            filter(raw_data, .data$file_id_db %in% files_to_read),
-            by = ".cell"
+            filter(.data$file_id_db %in% files_to_read) |>
+            inner_join(
+                colData(sce) |> as_tibble(rownames = ".cell"),
+                by = ".cell" 
+            ) |>
+            column_to_rownames(".cell") |>
+            as("DataFrame")
+        
+        sce
+}
+
+#' Converts a data frame into a single SCE
+#'
+#' @param prefix Prefix to be added to the column names
+#' @param df The data frame to be converted
+#' @param dir_prefix The path to the single cell experiment, minus the final segment
+#' @param features The list of genes/rows of interest
+#'
+#' @return A SingleCellExperiment object
+#' @importFrom dplyr mutate 
+#' @importFrom HDF5Array loadHDF5SummarizedExperiment
+#' @importFrom SummarizedExperiment colData<-
+#' @importFrom tibble column_to_rownames
+#' @importClassesFrom SingleCellExperiment SingleCellExperiment 
+#'
+group_to_sce = function(i, df, dir_prefix, features){
+    sce_path <- df$file_id_db |> 
+        head(1) |>
+        file.path(
+            dir_prefix,
+            suffix = _
         )
+    
+    file.exists(sce_path) |>
+        assert_that(
+            msg = "Your cache does not contain a file you
+                            attempted to query. Please provide the repository 
+                            parameter so that files can be synchronised from the
+                            internet"
+        )
+    
+    sce <- loadHDF5SummarizedExperiment(sce_path)
+    # The cells we select here are those that are both available in the SCE
+    # object, and requested for this particular file
+    cells = colnames(sce) |> intersect(df$.cell)
+    # We need to make the cell names globally unique, which we can guarantee
+    # by adding a suffix that is derived from file_id_db, which is the grouping
+    # variable
+    new_cellnames = paste0(cells, "_", i)
+    new_coldata = df |>
+        mutate(original_cell_id = .cell, .cell = new_cellnames) |>
+        column_to_rownames(".cell") |>
+        as("DataFrame")
+    
+    features |> 
+        is.null() |>
+        {`if`} (
+            sce[, cells],
+            {
+                # Optionally subset the genes
+                genes = rownames(sce) |> intersect(features)
+                sce[genes, cells]
+            }
+        ) |>
+        `colnames<-`(new_cellnames) |>
+        `colData<-`(new_coldata)
 }
 
 #' Synchronises one or more remote assays with a local copy
@@ -181,15 +232,15 @@ get_SingleCellExperiment <- function(
 #'
 #' @return A character vector of files that have been downloaded
 #' @importFrom purrr pmap_chr transpose
-#' @importFrom httr modify_url GET write_disk stop_for_status
+#' @importFrom httr modify_url GET write_disk stop_for_status parse_url
 #' @importFrom dplyr tibble transmute filter full_join
 #' @importFrom glue glue
 #' @importFrom assertthat assert_that
 #' @importFrom cli cli_alert_success cli_alert_info cli_abort
 #' @noRd
 #'
 sync_assay_files <- function(
-    url = httr::parse_url(REMOTE_URL),
+    url = parse_url(REMOTE_URL),
     cache_dir,
     subdirs,
     files