
Commit cfeaed2

Merge pull request #109 from stemangiola/bioc-review
Bioc review changes
2 parents: 2d76f58 + 0deb46b

23 files changed: +682 -486 lines

.github/workflows/check-bioc.yml

Lines changed: 9 additions & 15 deletions
@@ -48,12 +48,12 @@ jobs:
     ## Environment variables unique to this job.

     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         config:
-          - { os: ubuntu-latest, r: '4.2', bioc: '3.15', cont: "bioconductor/bioconductor_docker:RELEASE_3_15", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: '4.2', bioc: '3.15'}
-          - { os: windows-latest, r: '4.2', bioc: '3.15'}
+          - { os: ubuntu-latest, r: '4.2', bioc: '3.16', cont: "bioconductor/bioconductor_docker:RELEASE_3_16", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.2', bioc: '3.16'}
+          - { os: windows-latest, r: '4.2', bioc: '3.16'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
@@ -95,26 +95,20 @@ jobs:
           install.packages('remotes')
           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
         shell: Rscript {0}
-
-      - name: Cache assay data
-        uses: actions/cache@v3
-        with:
-          key: ${{ runner.os }}-hca-harmonised
-          path: ~/.cache/hca_harmonised

       - name: Cache R packages
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'"
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2
+          key: "${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_${{ matrix.config.bioc }}r-{{ matrix.config.r }}"

       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v2
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2
+          key: "${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_${{ matrix.config.bioc }}r-{{ matrix.config.r }}"

       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -172,15 +166,15 @@ jobs:

           ## Pass #1 at installing dependencies
           message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
-          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE)
+          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
         continue-on-error: true
         shell: Rscript {0}

       - name: Install dependencies pass 2
         run: |
           ## Pass #2 at installing dependencies
           message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
-          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE)
+          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)

           ## For running the checks
           message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
@@ -219,7 +213,7 @@ jobs:
           _R_CHECK_CRAN_INCOMING_: false
         run: |
           rcmdcheck::rcmdcheck(
-              args = c("--no-build-vignettes", "--no-manual", "--timings"),
+              args = c("--no-manual", "--timings"),
               build_args = c("--no-manual", "--no-resave-data"),
               error_on = "warning",
               check_dir = "check"

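For reference, the updated check step can be approximated locally with the sketch below. This is only a sketch: the working directory is assumed to be the package root, and `path = "."` is not part of the workflow itself. Note that vignettes are now built during the check (since `--no-build-vignettes` was dropped from `args`), while `build_vignettes = FALSE` keeps vignette building out of the dependency-installation passes.

# Sketch of a roughly equivalent local check. Assumptions: run from the
# package root, with rcmdcheck installed.
rcmdcheck::rcmdcheck(
    path = ".",
    args = c("--no-manual", "--timings"),
    build_args = c("--no-manual", "--no-resave-data"),
    error_on = "warning",
    check_dir = "check"
)
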
DESCRIPTION

Lines changed: 12 additions & 4 deletions
@@ -7,13 +7,15 @@ Authors@R: c(
         "Stefano",
         "Mangiola",
         email = "[email protected]",
-        role = c("aut", "cre", "rev")
+        role = c("aut", "cre", "rev"),
+        comment = c(ORCID = "0000-0001-7474-836X")
     ),
     person(
         "Michael",
         "Milton",
         email = "[email protected]",
-        role = c("aut", "rev")
+        role = c("aut", "rev"),
+        comment = c(ORCID = "0000-0002-8965-2595")
     ),
     person(
         "Martin",
@@ -50,7 +52,7 @@ Authors@R: c(
         role = c( "fnd")
     ),
     person(
-        "Victoria Cancer Agnency",
+        "Victoria Cancer Agency",
         "ECRF21036",
         role = c( "fnd")
     ),
@@ -99,7 +101,12 @@ Suggests:
     testthat,
     basilisk,
     arrow,
-    reticulate
+    reticulate,
+    spelling,
+    forcats,
+    ggplot2,
+    tidySingleCellExperiment,
+    rprojroot
 Biarch: true
 biocViews:
     AssayDomain,
@@ -128,3 +135,4 @@ Collate:
     'seurat.R'
     'unharmonised.R'
     'zzz.R'
+Language: en-US

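The new `spelling` entry in Suggests, together with the `Language: en-US` field, enables package-level spell checking. A minimal sketch, assuming it is run from the package root:

# Sketch only: check spelling across Rd files, DESCRIPTION, and vignettes.
# Running from the package root is an assumption.
spelling::spell_check_package(".")
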
NAMESPACE

Lines changed: 3 additions & 0 deletions
@@ -1,9 +1,12 @@
 # Generated by roxygen2: do not edit by hand

 S3method(as.sparse,DelayedMatrix)
+export(DATABASE_URL)
+export(SAMPLE_DATABASE_URL)
 export(get_SingleCellExperiment)
 export(get_metadata)
 export(get_seurat)
+export(get_single_cell_experiment)
 export(get_unharmonised_metadata)
 importFrom(BiocGenerics,cbind)
 importFrom(DBI,dbConnect)

R/counts.R

Lines changed: 33 additions & 15 deletions
@@ -22,6 +22,18 @@ COUNTS_URL <- single_line_str(
 #' version is released
 COUNTS_VERSION <- "0.2.1"

+#' @inherit get_single_cell_experiment
+#' @inheritDotParams get_single_cell_experiment
+#' @importFrom cli cli_alert_warning
+#' @export
+get_SingleCellExperiment <- function(...){
+    single_line_str("This function name is deprecated.
+                    Please use `get_single_cell_experiment()` instead") |>
+        cli_alert_warning()
+
+    get_single_cell_experiment(...)
+}
+
 #' Gets a SingleCellExperiment from curated metadata
 #'
 #' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
@@ -46,7 +58,7 @@ COUNTS_VERSION <- "0.2.1"
 #'   assays argument
 #' @examples
 #' meta <- get_metadata() |> head(2)
-#' sce <- get_SingleCellExperiment(meta)
+#' sce <- get_single_cell_experiment(meta)
 #'
 #' @importFrom dplyr pull filter as_tibble inner_join collect
 #' @importFrom tibble column_to_rownames
@@ -64,34 +76,39 @@ COUNTS_VERSION <- "0.2.1"
 #' @importFrom stats setNames
 #' @importFrom S4Vectors DataFrame
 #' @export
-get_SingleCellExperiment <- function(
+get_single_cell_experiment <- function(
     data,
     assays = "counts",
     cache_directory = get_default_cache_dir(),
     repository = COUNTS_URL,
     features = NULL
 ) {
     # Parameter validation
+
     assays %in% names(assay_map) |>
         all() |>
         assert_that(
             msg = 'assays must be a character vector containing "counts" and/or
-            "cpm"'
+                "cpm"'
         )
-    (!anyDuplicated(assays)) |> assert_that()
-    inherits(cache_directory, "character") |> assert_that()
-    is.null(repository) || is.character(repository) |> assert_that()
-    is.null(features) || is.character(features) |> assert_that()
+    assert_that(
+        !anyDuplicated(assays),
+        inherits(cache_directory, "character"),
+        is.null(repository) || is.character(repository),
+        is.null(features) || is.character(features)
+    )

     # Data parameter validation (last, because it's slower)
     ## Evaluate the promise now so that we get a sensible error message
-    data
+    force(data)
     ## We have to convert to an in-memory table here, or some of the dplyr
     ## operations will fail when passed a database connection
     cli_alert_info("Realising metadata.")
     raw_data <- collect(data)
-    inherits(raw_data, "tbl") |> assert_that()
-    has_name(raw_data, c("cell_", "file_id_db")) |> assert_that()
+    assert_that(
+        inherits(raw_data, "tbl"),
+        has_name(raw_data, c("cell_", "file_id_db"))
+    )

     versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
     versioned_cache_directory |> dir.create(
@@ -182,10 +199,10 @@ group_to_sce <- function(i, df, dir_prefix, features) {

     file.exists(sce_path) |>
         assert_that(
-            msg = "Your cache does not contain a file you
+            msg = "Your cache does not contain a file {sce_path} you
                 attempted to query. Please provide the repository
                 parameter so that files can be synchronised from the
-                internet"
+                internet" |> glue()
         )

     sce <- loadHDF5SummarizedExperiment(sce_path)
@@ -194,11 +211,12 @@ group_to_sce <- function(i, df, dir_prefix, features) {
     cells <- colnames(sce) |> intersect(df$cell_)

     if (length(cells) < nrow(df)){
-        str_replace_all(
-            "Some cells were filtered out because of extremely low counts. The
+        single_line_str(
+            "Some cells were filtered out while loading {head(df$file_id_db, 1)}
+            because of extremely low counts. The
             number of cells in the SingleCellExperiment will be less than the
             number of cells you have selected from the metadata."
-        )
+        ) |> cli_alert_warning()
         df <- filter(df, .data$cell_ %in% cells)
     }
     else if (length(cells) > nrow(df)){

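A usage sketch of the renamed accessor, assuming the package is attached and network access is available (it mirrors the updated roxygen example; the `assays` argument shown is just the default):

# The renamed function is the supported entry point.
meta <- get_metadata() |> head(2)
sce <- get_single_cell_experiment(meta, assays = "counts")

# The old name is kept as a thin wrapper: it emits a cli warning and then
# forwards its arguments to get_single_cell_experiment().
sce_legacy <- get_SingleCellExperiment(meta)
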
R/dev.R

Lines changed: 51 additions & 1 deletion
@@ -145,7 +145,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 #'   "/vast/projects/cellxgene_curated/splitted_DB2_anndata_scaled_0.2.1"
 #' )
 #' }
-dir_to_anndata = function(src, dest){
+dir_to_anndata <- function(src, dest){
     dir.create(dest, showWarnings = FALSE)
     # This is a quick utility script to convert the SCE files into AnnData format for use in Pythonlist.files("/vast/projects/RCP/human_cell_atlas/splitted_DB2_data", full.names = FALSE) |> purrr::walk(function(dir){
     basilisk::basiliskRun(fun = function(sce) {
@@ -184,3 +184,53 @@ dir_to_anndata = function(src, dest){
     }, .progress = "Converting files")
 }, env = zellkonverter::zellkonverterAnnDataEnv())
 }
+
+#' Makes a "downsampled" metadata file that only contains the minimal data
+#' needed to run the vignette.
+#' @param output Character scalar. Path to the output file.
+#' @return NULL
+#' @keywords internal
+downsample_metadata <- function(output = "sample_meta.parquet"){
+    metadata <- get_metadata()
+
+    # Make a table of rows per dataset
+    dataset_sizes <- metadata |>
+        dplyr::group_by(.data$file_id_db) |>
+        summarise(n = dplyr::n()) |>
+        dplyr::collect()
+
+    # Find a minimal set of file_id_dbs we need
+    minimal_file_ids <- rlang::exprs(
+        # Used by the vignette
+        .data$ethnicity == "African" &
+            stringr::str_like(.data$assay, "%10x%") &
+            .data$tissue == "lung parenchyma" &
+            stringr::str_like(.data$cell_type, "%CD4%"),
+        .data$cell_type_harmonised == "nk",
+        .data$cell_type_harmonised == "cd14 mono",
+        .data$tissue == "kidney blood vessel",
+        # Used by tests
+        .data$file_id_db == "3214d8f8986c1e33a85be5322f2db4a9",
+        .data$cell_ == "868417_1"
+    ) |>
+        purrr::map(function(filter){
+            all_ids <- metadata |>
+                dplyr::filter(!!filter) |>
+                dplyr::group_by(.data$file_id_db) |>
+                dplyr::pull(.data$file_id_db) |> unique()
+
+            dataset_sizes |>
+                dplyr::filter(.data$file_id_db %in% all_ids) |>
+                dplyr::slice_min(n=50, order_by = .data$n) |>
+                dplyr::pull(.data$file_id_db)
+        }) |>
+        purrr::reduce(union)
+
+    metadata |>
+        dplyr::filter(.data$file_id_db %in% minimal_file_ids) |>
+        dplyr::arrange(.data$file_id_db, .data$sample_) |>
+        dplyr::collect() |>
+        arrow::write_parquet(output)
+
+    NULL
+}

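`downsample_metadata()` is tagged `@keywords internal`, so the sketch below assumes it is brought into scope with `devtools::load_all()` rather than a normal `library()` call; the output path shown is simply the function's default.

# Sketch only: regenerate the downsampled metadata used by the vignette and
# tests (assumes devtools is available and the full metadata can be fetched).
devtools::load_all()
downsample_metadata(output = "sample_meta.parquet")

# Inspect the result with arrow (listed in Suggests).
arrow::read_parquet("sample_meta.parquet") |> dplyr::glimpse()
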
R/metadata.R

Lines changed: 28 additions & 9 deletions
@@ -8,23 +8,38 @@ cache <- rlang::env(
     metadata_table = rlang::env()
 )

+#' URL pointing to the full metadata file
+#' @export
+#' @examples
+#' get_metadata(remote_url = DATABASE_URL)
 DATABASE_URL <- single_line_str(
     "https://object-store.rc.nectar.org.au/v1/
     AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
 )

+#' URL pointing to the sample metadata file, which is smaller and for test,
+#' demonstration, and vignette purposes only
+#' @export
+#' @examples
+#' get_metadata(remote_url = SAMPLE_DATABASE_URL)
+SAMPLE_DATABASE_URL <- single_line_str(
+    "https://object-store.rc.nectar.org.au/v1/
+    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/
+    sample_metadata.0.2.3.parquet"
+)
+
 #' Gets the Curated Atlas metadata as a data frame.
 #'
 #' Downloads a parquet database of the Human Cell Atlas metadata to a local
 #' cache, and then opens it as a data frame. It can then be filtered and passed
-#' into [get_SingleCellExperiment()] to obtain a
+#' into [get_single_cell_experiment()] to obtain a
 #' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
 #' @param remote_url Optional character vector of length 1. An HTTP URL pointing
 #'   to the location of the parquet database.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
-#'   metadata.parquet
+#'   `metadata.parquet`
 #' @param use_cache Optional logical scalar. If `TRUE` (the default), and this
 #'   function has been called before with the same parameters, then a cached
 #'   reference to the table will be returned. If `FALSE`, a new connection will
@@ -116,7 +131,7 @@ DATABASE_URL <- single_line_str(
 #'
 #' Error in `db_query_fields.DBIConnection()`: ! Can't query fields. Caused by
 #' error: ! Parser Error: syntax error at or near "/" LINE 2: FROM
-#' /Users/bob/Library/Cach...
+#' /Users/bob/Library/Caches...
 #'
 #' The solution is to choose a different cache, for example
 #'
@@ -134,13 +149,17 @@ get_metadata <- function(
         cached_connection
     }
     else {
-        report_file_sizes(remote_url)
         db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
-        sync_remote_file(
-            remote_url,
-            db_path,
-            progress(type = "down", con = stderr())
-        )
+
+        if (!file.exists(db_path)){
+            report_file_sizes(remote_url)
+            sync_remote_file(
+                remote_url,
+                db_path,
+                progress(type = "down", con = stderr())
+            )
+        }
+
         table <- duckdb() |>
             dbConnect(drv = _, read_only = TRUE) |>
             tbl(db_path)

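A sketch of the new sample database in use, assuming the package is attached; `tempdir()` as the cache directory is an assumption, not the package default. The second call illustrates the new `file.exists()` guard: with the parquet file already on disk, no download is attempted even when the in-memory cache is bypassed.

# First call: downloads the sample metadata parquet into the cache directory.
meta <- get_metadata(
    remote_url = SAMPLE_DATABASE_URL,
    cache_directory = tempdir()
)

# Second call with use_cache = FALSE: reconnects to the same on-disk file but
# skips the download, because the file already exists.
meta2 <- get_metadata(
    remote_url = SAMPLE_DATABASE_URL,
    cache_directory = tempdir(),
    use_cache = FALSE
)
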
R/seurat.R

Lines changed: 3 additions & 3 deletions
@@ -13,15 +13,15 @@ as.sparse.DelayedMatrix <- function(x) {
 #' Given a data frame of HCA metadata, returns a Seurat object corresponding to
 #' the samples in that data frame
 #'
-#' @inheritDotParams get_SingleCellExperiment
+#' @inheritDotParams get_single_cell_experiment
 #' @importFrom SeuratObject as.Seurat
 #' @export
 #' @return A Seurat object containing the same data as a call to
-#'   get_SingleCellExperiment.
+#'   [get_single_cell_experiment()]
 #' @examples
 #' meta <- get_metadata() |> head(2)
 #' seurat <- get_seurat(meta)
 #'
 get_seurat <- function(...) {
-    get_SingleCellExperiment(...) |> as.Seurat(data = NULL)
+    get_single_cell_experiment(...) |> as.Seurat(data = NULL)
 }

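As with the SingleCellExperiment accessor, a short usage sketch (package assumed attached; arguments are forwarded unchanged to `get_single_cell_experiment()`):

# get_seurat() passes ... through to get_single_cell_experiment(), then
# converts the result with as.Seurat().
meta <- get_metadata() |> head(2)
seurat <- get_seurat(meta, assays = "counts")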