Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 35fbc4b

Browse files
committed
Bioc improvements
1 parent 31692fa commit 35fbc4b

18 files changed

+556
-42
lines changed

R/counts.R

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
# Functions that relate to downloading count data into SingleCellExperiments
2+
3+
# We need to load utils now so it can be used at the top level
4+
#' @include utils.R
5+
# This is a hack to force Seurat packages to be loaded, and also to
6+
# satisfy R CMD check. We don't need to attach them at all.
7+
#' @importFrom Seurat as.SingleCellExperiment
8+
NULL
9+
10+
# Maps user provided assay names to their corresponding paths in the repository
11+
assay_map <- c(
12+
counts = "original",
13+
cpm = "cpm"
14+
)
15+
16+
#' Base URL pointing to the count data
17+
COUNTS_URL <- single_line_str(
18+
"https://swift.rc.nectar.org.au/v1/
19+
AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
20+
)
21+
#' Current version of the counts. This will be incremented when a newer
22+
#' version is released
23+
COUNTS_VERSION <- "0.2"
24+
25+
#' Gets a SingleCellExperiment from curated metadata
26+
#'
27+
#' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
28+
#' returns a [`SingleCellExperiment::SingleCellExperiment-class`] object
29+
#' corresponding to the samples in that data frame
30+
#'
31+
#' @param data A data frame containing, at minimum, a `sample_` column, which
32+
#' corresponds to a single cell sample ID. This can be obtained from the
33+
#' [get_metadata()] function.
34+
#' @param assays A character vector whose elements must be either "counts"
35+
#' and/or "cpm", representing the corresponding assay(s) you want to request.
36+
#' By default only the count assay is downloaded. If you are interested in
37+
#' comparing a limited amount of genes, the "cpm" assay is more appropriate.
38+
#' @param repository A character vector of length one. If provided, it should be
39+
#' an HTTP URL pointing to the location where the single cell data is stored.
40+
#' @param cache_directory An optional character vector of length one. If
41+
#' provided, it should indicate a local file path where any remotely accessed
42+
#' files should be copied.
43+
#' @param features An optional character vector of features (ie genes) to return
44+
#' the counts for. By default counts for all features will be returned.
45+
#' @returns A SingleCellExperiment object, with one assay for each value in the
46+
#' assays argument
47+
#' @examples
48+
#' meta <- get_metadata() |> head(2)
49+
#' sce <- get_SingleCellExperiment(meta)
50+
#'
51+
#' @importFrom dplyr pull filter as_tibble inner_join collect
52+
#' @importFrom tibble column_to_rownames
53+
#' @importFrom purrr reduce map map_int imap keep
54+
#' @importFrom BiocGenerics cbind
55+
#' @importFrom glue glue
56+
#' @importFrom HDF5Array loadHDF5SummarizedExperiment HDF5RealizationSink
57+
#' loadHDF5SummarizedExperiment
58+
#' @importFrom SingleCellExperiment SingleCellExperiment combineCols
59+
#' @importFrom SummarizedExperiment colData assayNames<-
60+
#' @importFrom httr parse_url
61+
#' @importFrom assertthat assert_that has_name
62+
#' @importFrom cli cli_alert_success cli_alert_info
63+
#' @importFrom rlang .data
64+
#' @importFrom stats setNames
65+
#' @importFrom S4Vectors DataFrame
66+
#' @export
67+
get_SingleCellExperiment <- function(
68+
data,
69+
assays = "counts",
70+
cache_directory = get_default_cache_dir(),
71+
repository = COUNTS_URL,
72+
features = NULL
73+
) {
74+
# Parameter validation
75+
assays %in% names(assay_map) |>
76+
all() |>
77+
assert_that(
78+
msg = 'assays must be a character vector containing "counts" and/or
79+
"cpm"'
80+
)
81+
(!anyDuplicated(assays)) |> assert_that()
82+
inherits(cache_directory, "character") |> assert_that()
83+
is.null(repository) || is.character(repository) |> assert_that()
84+
is.null(features) || is.character(features) |> assert_that()
85+
86+
# Data parameter validation (last, because it's slower)
87+
## Evaluate the promise now so that we get a sensible error message
88+
data
89+
## We have to convert to an in-memory table here, or some of the dplyr
90+
## operations will fail when passed a database connection
91+
cli_alert_info("Realising metadata.")
92+
raw_data <- collect(data)
93+
inherits(raw_data, "tbl") |> assert_that()
94+
has_name(raw_data, c("cell_", "file_id_db")) |> assert_that()
95+
96+
versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
97+
versioned_cache_directory |> dir.create(
98+
showWarnings = FALSE,
99+
recursive = TRUE
100+
)
101+
102+
subdirs <- assay_map[assays]
103+
104+
# The repository is optional. If not provided we load only from the cache
105+
if (!is.null(repository)) {
106+
cli_alert_info("Synchronising files")
107+
parsed_repo <- parse_url(repository)
108+
parsed_repo$scheme |>
109+
`%in%`(c("http", "https")) |>
110+
assert_that()
111+
112+
files_to_read <-
113+
raw_data |>
114+
pull(.data$file_id_db) |>
115+
unique() |>
116+
as.character() |>
117+
sync_assay_files(
118+
url = parsed_repo,
119+
cache_dir = versioned_cache_directory,
120+
files = _,
121+
subdirs = subdirs
122+
)
123+
}
124+
125+
cli_alert_info("Reading files.")
126+
sces <- subdirs |>
127+
imap(function(current_subdir, current_assay) {
128+
# Build up an SCE for each assay
129+
dir_prefix <- file.path(
130+
versioned_cache_directory,
131+
current_subdir
132+
)
133+
134+
raw_data |>
135+
dplyr::group_by(.data$file_id_db) |>
136+
# Load each file and attach metadata
137+
dplyr::summarise(sces = list(group_to_sce(
138+
dplyr::cur_group_id(),
139+
dplyr::cur_data_all(),
140+
dir_prefix,
141+
features
142+
))) |>
143+
dplyr::pull(sces) |>
144+
# Combine each sce by column, since each sce has a different set
145+
# of cells
146+
do.call(cbind, args = _)
147+
})
148+
149+
cli_alert_info("Compiling Single Cell Experiment.")
150+
# Combine all the assays
151+
sce <- sces[[1]]
152+
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
153+
SummarizedExperiment::assays(sce)[[1]]
154+
})
155+
156+
sce
157+
}
158+
159+
#' Converts a data frame into a single SCE
160+
#' @param i Suffix to be added to the column names, to make them unique
161+
#' @param df The data frame to be converted
162+
#' @param dir_prefix The path to the single cell experiment, minus the final
163+
#' segment
164+
#' @param features The list of genes/rows of interest
165+
#' @return A SingleCellExperiment object
166+
#' @importFrom dplyr mutate filter
167+
#' @importFrom HDF5Array loadHDF5SummarizedExperiment
168+
#' @importFrom SummarizedExperiment colData<-
169+
#' @importFrom tibble column_to_rownames
170+
#' @importFrom utils head
171+
#' @importFrom cli cli_alert_warning cli_abort
172+
#' @importFrom glue glue
173+
#' @noRd
174+
group_to_sce <- function(i, df, dir_prefix, features) {
175+
sce_path <- df$file_id_db |>
176+
head(1) |>
177+
file.path(
178+
dir_prefix,
179+
suffix = _
180+
)
181+
182+
file.exists(sce_path) |>
183+
assert_that(
184+
msg = "Your cache does not contain a file you
185+
attempted to query. Please provide the repository
186+
parameter so that files can be synchronised from the
187+
internet"
188+
)
189+
190+
sce <- loadHDF5SummarizedExperiment(sce_path)
191+
# The cells we select here are those that are both available in the SCE
192+
# object, and requested for this particular file
193+
cells <- colnames(sce) |> intersect(df$cell_)
194+
195+
if (length(cells) < nrow(df)){
196+
str_replace_all(
197+
"Some cells were filtered out because of extremely low counts. The
198+
number of cells in the SingleCellExperiment will be less than the
199+
number of cells you have selected from the metadata."
200+
)
201+
df <- filter(df, .data$cell_ %in% cells)
202+
}
203+
else if (length(cells) > nrow(df)){
204+
cli_abort("This should never happen")
205+
}
206+
207+
# Fix for https://github.com/tidyverse/dplyr/issues/6746
208+
force(i)
209+
210+
new_coldata <- df |>
211+
# We need to make the cell names globally unique, which we can guarantee
212+
# by adding a suffix that is derived from file_id_db, which is the
213+
# grouping variable
214+
mutate(original_cell_id = .data$cell_, cell_ = glue("{cell_}_{i}")) |>
215+
column_to_rownames("cell_") |>
216+
as("DataFrame")
217+
218+
`if`(
219+
is.null(features),
220+
sce[, new_coldata$original_cell_id],
221+
{
222+
# Optionally subset the genes
223+
genes <- rownames(sce) |> intersect(features)
224+
sce[genes, new_coldata$original_cell_id]
225+
}
226+
) |>
227+
`colnames<-`(new_coldata$cell_) |>
228+
`colData<-`(value = new_coldata)
229+
}
230+
231+
#' Synchronises one or more remote assays with a local copy
232+
#' @param url A character vector of length one. The base HTTP URL from which to
233+
#' obtain the files.
234+
#' @param cache_dir A character vector of length one. The local filepath to
235+
#' synchronise files to.
236+
#' @param subdirs A character vector of subdirectories within the root URL to
237+
#' sync. These correspond to assays.
238+
#' @param files A character vector containing one or more file_id_db entries
239+
#' @returns A character vector consisting of file paths to all the newly
240+
#' downloaded files
241+
#' @return A character vector of files that have been downloaded
242+
#' @importFrom purrr pmap_chr map_chr
243+
#' @importFrom httr modify_url
244+
#' @importFrom dplyr transmute filter
245+
#' @noRd
246+
#'
247+
sync_assay_files <- function(
248+
url = parse_url(REMOTE_URL),
249+
cache_dir,
250+
subdirs,
251+
files
252+
) {
253+
# Find every combination of file name, sample id, and assay, since each
254+
# will be a separate file we need to download
255+
files <- expand.grid(
256+
filename = c("assays.h5", "se.rds"),
257+
sample_id = files,
258+
subdir = subdirs,
259+
stringsAsFactors = FALSE
260+
) |>
261+
transmute(
262+
# Path to the file of interest from the root path. We use "/"
263+
# since URLs must use these regardless of OS
264+
full_url = paste0(
265+
url$path,
266+
"/",
267+
.data$subdir,
268+
"/",
269+
.data$sample_id,
270+
"/",
271+
.data$filename
272+
) |> map_chr(~ modify_url(url, path = .)),
273+
274+
# Path to save the file on local disk (and its parent directory)
275+
# We use file.path since the file separator will differ on other OSs
276+
output_dir = file.path(
277+
cache_dir,
278+
.data$subdir,
279+
.data$sample_id
280+
),
281+
output_file = file.path(
282+
.data$output_dir,
283+
.data$filename
284+
)
285+
) |>
286+
filter(
287+
# Don't bother downloading files that don't exist TODO: use some
288+
# kind of hashing to check if the remote file has changed, and
289+
# proceed with the download if it has. However this is low
290+
# importance as the repository is not likely to change often
291+
!file.exists(.data$output_file)
292+
)
293+
294+
report_file_sizes(files$full_url)
295+
296+
pmap_chr(files, function(full_url, output_dir, output_file) {
297+
sync_remote_file(full_url, output_file)
298+
output_file
299+
}, .progress = list(name = "Downloading files"))
300+
}

R/seurat.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Functions that relate to the Seurat conversion
2+
3+
#' @importFrom assertthat assert_that
4+
#' @importFrom methods as
5+
#' @importFrom SeuratObject as.sparse
6+
#' @exportS3Method
7+
as.sparse.DelayedMatrix <- function(x) {
8+
# This is glue to ensure the SCE -> Seurat conversion works properly with
9+
# DelayedArray types
10+
as(x, "dgCMatrix")
11+
}
12+
13+
#' Given a data frame of HCA metadata, returns a Seurat object corresponding to
14+
#' the samples in that data frame
15+
#'
16+
#' @inheritDotParams get_SingleCellExperiment
17+
#' @importFrom SeuratObject as.Seurat
18+
#' @export
19+
#' @return A Seurat object containing the same data as a call to
20+
#' get_SingleCellExperiment.
21+
#' @examples
22+
#' meta <- get_metadata() |> head(2)
23+
#' seurat <- get_seurat(meta)
24+
#'
25+
get_seurat <- function(...) {
26+
get_SingleCellExperiment(...) |> as.Seurat(data = NULL)
27+
}

man/COUNTS_URL.Rd

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/COUNTS_VERSION.Rd

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/UNHARMONISED_URL.Rd

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/cache.Rd

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)