# Functions that relate to downloading count data into SingleCellExperiments

# We need to load utils now so it can be used at the top level
#' @include utils.R
# This is a hack to force Seurat packages to be loaded, and also to
# satisfy R CMD check. We don't need to attach them at all.
#' @importFrom Seurat as.SingleCellExperiment
NULL

# Maps user-provided assay names to their corresponding paths in the repository
assay_map <- c(
    counts = "original",
    cpm = "cpm"
)
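# For example, assay_map[c("counts", "cpm")] evaluates to
# c(counts = "original", cpm = "cpm"): the repository subdirectories that the
# download code below reads from.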

#' Base URL pointing to the count data
COUNTS_URL <- single_line_str(
    "https://swift.rc.nectar.org.au/v1/
    AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
)
#' Current version of the counts. This will be incremented when a newer
#' version is released
COUNTS_VERSION <- "0.2"
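# Under the layout assumed by sync_assay_files() below, an individual file
# lives at a URL of the form:
#   {COUNTS_URL}/{assay subdirectory}/{file_id_db}/assays.h5 (or se.rds)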

#' Gets a SingleCellExperiment from curated metadata
#'
#' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
#' returns a [`SingleCellExperiment::SingleCellExperiment-class`] object
#' corresponding to the samples in that data frame.
#'
#' @param data A data frame containing, at minimum, a `sample_` column, which
#' corresponds to a single cell sample ID. This can be obtained from the
#' [get_metadata()] function.
#' @param assays A character vector whose elements must be "counts", "cpm", or
#' both, representing the corresponding assay(s) you want to request. By
#' default only the count assay is downloaded. If you are interested in
#' comparing a limited number of genes, the "cpm" assay is more appropriate.
#' @param repository A character vector of length one. If provided, it should be
#' an HTTP URL pointing to the location where the single cell data is stored.
#' @param cache_directory An optional character vector of length one. If
#' provided, it should indicate a local file path where any remotely accessed
#' files should be copied.
#' @param features An optional character vector of features (i.e. genes) to
#' return the counts for. By default counts for all features will be returned.
#' @returns A SingleCellExperiment object, with one assay for each value in the
#' `assays` argument
#' @examples
#' meta <- get_metadata() |> head(2)
#' sce <- get_SingleCellExperiment(meta)
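#' # A sketch of requesting CPM values for specific genes; the gene symbols
#' # below are illustrative and assumed to be present in the atlas:
#' # sce_cpm <- get_SingleCellExperiment(
#' #     meta, assays = "cpm", features = c("PUM1", "NFKB1")
#' # )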
#'
#' @importFrom dplyr pull filter as_tibble inner_join collect
#' @importFrom tibble column_to_rownames
#' @importFrom purrr reduce map map_int imap keep
#' @importFrom BiocGenerics cbind
#' @importFrom glue glue
#' @importFrom HDF5Array loadHDF5SummarizedExperiment HDF5RealizationSink
#' @importFrom SingleCellExperiment SingleCellExperiment combineCols
#' @importFrom SummarizedExperiment colData assayNames<-
#' @importFrom httr parse_url
#' @importFrom assertthat assert_that has_name
#' @importFrom cli cli_alert_success cli_alert_info
#' @importFrom rlang .data
#' @importFrom stats setNames
#' @importFrom S4Vectors DataFrame
#' @export
get_SingleCellExperiment <- function(
    data,
    assays = "counts",
    cache_directory = get_default_cache_dir(),
    repository = COUNTS_URL,
    features = NULL
) {
    # Parameter validation
    assays %in% names(assay_map) |>
        all() |>
        assert_that(
            msg = single_line_str(
                'assays must be a character vector containing "counts" and/or
                "cpm"'
            )
        )
    (!anyDuplicated(assays)) |> assert_that()
    inherits(cache_directory, "character") |> assert_that()
    (is.null(repository) || is.character(repository)) |> assert_that()
    (is.null(features) || is.character(features)) |> assert_that()

    # Data parameter validation (last, because it's slower)
    ## Evaluate the promise now so that we get a sensible error message
    data
    ## We have to convert to an in-memory table here, or some of the dplyr
    ## operations will fail when passed a database connection
    cli_alert_info("Realising metadata.")
    raw_data <- collect(data)
    inherits(raw_data, "tbl") |> assert_that()
    has_name(raw_data, c("cell_", "file_id_db")) |> assert_that()

    versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
    versioned_cache_directory |> dir.create(
        showWarnings = FALSE,
        recursive = TRUE
    )

    subdirs <- assay_map[assays]

    # The repository is optional. If not provided, we load only from the cache
    if (!is.null(repository)) {
        cli_alert_info("Synchronising files")
        parsed_repo <- parse_url(repository)
        parsed_repo$scheme |>
            `%in%`(c("http", "https")) |>
            assert_that()

        files_to_read <-
            raw_data |>
            pull(.data$file_id_db) |>
            unique() |>
            as.character() |>
            sync_assay_files(
                url = parsed_repo,
                cache_dir = versioned_cache_directory,
                files = _,
                subdirs = subdirs
            )
    }

    cli_alert_info("Reading files.")
    sces <- subdirs |>
        imap(function(current_subdir, current_assay) {
            # Build up an SCE for each assay
            dir_prefix <- file.path(
                versioned_cache_directory,
                current_subdir
            )

            raw_data |>
                dplyr::group_by(.data$file_id_db) |>
                # Load each file and attach metadata
                dplyr::summarise(sces = list(group_to_sce(
                    dplyr::cur_group_id(),
                    dplyr::cur_data_all(),
                    dir_prefix,
                    features
                ))) |>
                dplyr::pull(sces) |>
                # Combine each sce by column, since each sce has a different
                # set of cells
                do.call(cbind, args = _)
        })

    cli_alert_info("Compiling Single Cell Experiment.")
    # Combine all the assays
    sce <- sces[[1]]
    SummarizedExperiment::assays(sce) <- map(sces, function(each_sce) {
        SummarizedExperiment::assays(each_sce)[[1]]
    })

    sce
}

#' Converts a data frame into a single SCE
#' @param i Suffix to be added to the column names, to make them unique
#' @param df The data frame to be converted
#' @param dir_prefix The path to the single cell experiment, minus the final
#' segment
#' @param features The list of genes/rows of interest
#' @return A SingleCellExperiment object
#' @importFrom dplyr mutate filter
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
#' @importFrom SummarizedExperiment colData<-
#' @importFrom tibble column_to_rownames
#' @importFrom utils head
#' @importFrom cli cli_alert_warning cli_abort
#' @importFrom glue glue
#' @noRd
group_to_sce <- function(i, df, dir_prefix, features) {
    # `suffix = _` exists only to satisfy the native pipe's requirement that
    # the placeholder be a named argument; file.path() ignores argument names
    sce_path <- df$file_id_db |>
        head(1) |>
        file.path(
            dir_prefix,
            suffix = _
        )

    file.exists(sce_path) |>
        assert_that(
            msg = single_line_str(
                "Your cache does not contain a file you attempted to query.
                Please provide the repository parameter so that files can be
                synchronised from the internet."
            )
        )

    sce <- loadHDF5SummarizedExperiment(sce_path)
    # The cells we select here are those that are both available in the SCE
    # object, and requested for this particular file
    cells <- colnames(sce) |> intersect(df$cell_)

    if (length(cells) < nrow(df)) {
        single_line_str(
            "Some cells were filtered out because of extremely low counts. The
            number of cells in the SingleCellExperiment will be less than the
            number of cells you have selected from the metadata."
        ) |> cli_alert_warning()
        df <- filter(df, .data$cell_ %in% cells)
    } else if (length(cells) > nrow(df)) {
        cli_abort("This should never happen")
    }

    # Fix for https://github.com/tidyverse/dplyr/issues/6746
    force(i)

    new_coldata <- df |>
        # We need to make the cell names globally unique, which we can
        # guarantee by adding a suffix that is derived from file_id_db, which
        # is the grouping variable
        mutate(original_cell_id = .data$cell_, cell_ = glue("{cell_}_{i}")) |>
        column_to_rownames("cell_") |>
        as("DataFrame")

    `if`(
        is.null(features),
        sce[, new_coldata$original_cell_id],
        {
            # Optionally subset the genes
            genes <- rownames(sce) |> intersect(features)
            sce[genes, new_coldata$original_cell_id]
        }
    ) |>
        `colnames<-`(new_coldata$cell_) |>
        `colData<-`(value = new_coldata)
}
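
# A minimal sketch of the suffixing scheme used in group_to_sce(): with
# i = 2 and hypothetical barcodes cells <- c("AAAC-1", "AAAG-1"),
# glue("{cells}_{i}") yields "AAAC-1_2" and "AAAG-1_2", so identical barcodes
# drawn from different files can never collide.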

#' Synchronises one or more remote assays with a local copy
#' @param url A character vector of length one. The base HTTP URL from which to
#' obtain the files.
#' @param cache_dir A character vector of length one. The local file path to
#' synchronise files to.
#' @param subdirs A character vector of subdirectories within the root URL to
#' sync. These correspond to assays.
#' @param files A character vector containing one or more file_id_db entries
#' @returns A character vector of file paths to the newly downloaded files
#' @importFrom purrr pmap_chr map_chr
#' @importFrom httr modify_url
#' @importFrom dplyr transmute filter
#' @noRd
sync_assay_files <- function(
    url = parse_url(COUNTS_URL),
    cache_dir,
    subdirs,
    files
) {
    # Find every combination of file name, sample id, and assay, since each
    # will be a separate file we need to download
    files <- expand.grid(
        filename = c("assays.h5", "se.rds"),
        sample_id = files,
        subdir = subdirs,
        stringsAsFactors = FALSE
    ) |>
        transmute(
            # Path to the file of interest, relative to the root path. We use
            # "/" since URLs use it as the separator regardless of OS
            full_url = paste0(
                url$path,
                "/",
                .data$subdir,
                "/",
                .data$sample_id,
                "/",
                .data$filename
            ) |> map_chr(~ modify_url(url, path = .)),

            # Path to save the file on local disk (and its parent directory).
            # We use file.path() since the file separator differs between OSs
            output_dir = file.path(
                cache_dir,
                .data$subdir,
                .data$sample_id
            ),
            output_file = file.path(
                .data$output_dir,
                .data$filename
            )
        ) |>
        filter(
            # Don't re-download files that already exist locally.
            # TODO: use some kind of hashing to check whether the remote file
            # has changed, and proceed with the download if it has. However,
            # this is low importance, as the repository is not likely to
            # change often
            !file.exists(.data$output_file)
        )

    report_file_sizes(files$full_url)

    pmap_chr(files, function(full_url, output_dir, output_file) {
        sync_remote_file(full_url, output_file)
        output_file
    }, .progress = list(name = "Downloading files"))
}
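
# A hedged usage sketch of this internal helper, mirroring the call made from
# get_SingleCellExperiment(); the file_id_db value here is hypothetical:
# sync_assay_files(
#     url = parse_url(COUNTS_URL),
#     cache_dir = file.path(get_default_cache_dir(), COUNTS_VERSION),
#     subdirs = assay_map["counts"],
#     files = "some_file_id_db"
# )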