
Commit 8a94a69

Merge pull request #103 from stemangiola/fix-bioc

Bioc improvements

2 parents 212be10 + 4322f15

28 files changed: +539 additions, -221 deletions
.github/workflows/check-bioc.yml

Lines changed: 2 additions & 1 deletion
@@ -243,7 +243,8 @@ jobs:
             dir('check', 'tar.gz$', full.names = TRUE),
             `quit-with-status` = TRUE,
             `no-check-R-ver` = TRUE,
-            `no-check-bioc-help` = TRUE
+            `no-check-bioc-help` = TRUE,
+            `new-package` = TRUE
           )
         shell: Rscript {0}
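The new `new-package` option is handed to the Bioconductor checker in this workflow step. Assuming the enclosing call (not shown in the hunk) is the usual BiocCheck::BiocCheck() invocation, which accepts these options through `...`, the step would look roughly like this sketch:

# Sketch of the R step this hunk edits; the enclosing BiocCheck::BiocCheck()
# call is assumed, only the arguments shown in the diff are from the commit.
BiocCheck::BiocCheck(
    dir('check', 'tar.gz$', full.names = TRUE),  # the built .tar.gz to check
    `quit-with-status` = TRUE,                   # non-zero exit status on ERROR, for CI
    `no-check-R-ver` = TRUE,
    `no-check-bioc-help` = TRUE,
    `new-package` = TRUE                         # added by this commit
)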

DESCRIPTION

Lines changed: 11 additions & 3 deletions
@@ -1,7 +1,7 @@
 Type: Package
 Package: CuratedAtlasQueryR
 Title: Queries the Human Cell Atlas
-Version: 0.4.3
+Version: 0.99.1
 Authors@R: c(
     person(
         "Stefano",
@@ -90,10 +90,10 @@ Imports:
     tibble,
     utils,
     dbplyr (>= 2.3.0),
-    duckdb
+    duckdb,
+    stringr
 Suggests:
     here,
-    stringr,
     tidyseurat,
     zellkonverter,
     scMerge,
@@ -130,3 +130,11 @@ URL: https://github.com/stemangiola/CuratedAtlasQueryR
 BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)
+Collate:
+    'utils.R'
+    'counts.R'
+    'dev.R'
+    'metadata.R'
+    'seurat.R'
+    'unharmonised.R'
+    'zzz.R'
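The new Collate field is the load order roxygen2 emits once a source file declares a dependency with @include; R/counts.R (below) adds exactly such a directive so that utils.R is sourced before the top-level COUNTS_URL definition. A minimal sketch of the pattern:

# In R/counts.R: declare that utils.R must be sourced first, because
# single_line_str() from utils.R is called at the top level of counts.R.
#' @include utils.R
NULL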

NAMESPACE

Lines changed: 4 additions & 2 deletions
@@ -5,14 +5,13 @@ export(get_SingleCellExperiment)
 export(get_metadata)
 export(get_seurat)
 export(get_unharmonised_metadata)
-import(Seurat)
-import(dbplyr)
 importFrom(BiocGenerics,cbind)
 importFrom(DBI,dbConnect)
 importFrom(DBI,dbDisconnect)
 importFrom(HDF5Array,HDF5RealizationSink)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(S4Vectors,DataFrame)
+importFrom(Seurat,as.SingleCellExperiment)
 importFrom(SeuratObject,as.Seurat)
 importFrom(SeuratObject,as.sparse)
 importFrom(SingleCellExperiment,SingleCellExperiment)
@@ -26,6 +25,7 @@ importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(cli,cli_alert_warning)
+importFrom(cli,hash_sha256)
 importFrom(dbplyr,remote_con)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
@@ -59,6 +59,8 @@ importFrom(purrr,set_names)
 importFrom(purrr,walk)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
+importFrom(stringr,str_remove_all)
+importFrom(stringr,str_replace_all)
 importFrom(tibble,column_to_rownames)
 importFrom(tools,R_user_dir)
 importFrom(utils,head)

R/query.R renamed to R/counts.R

Lines changed: 36 additions & 87 deletions
@@ -1,7 +1,10 @@
-# These are hacks to force the above packages to be loaded, and also to
+# Functions that relate to downloading count data into SingleCellExperiments
+
+# We need to load utils now so it can be used at the top level
+#' @include utils.R
+# This is a hack to force Seurat packages to be loaded, and also to
 # satisfy R CMD check. We don't need to attach them at all.
-#' @import dbplyr
-#' @import Seurat
+#' @importFrom Seurat as.SingleCellExperiment
 NULL
 
 # Maps user provided assay names to their corresponding paths in the repository
@@ -10,22 +13,28 @@ assay_map <- c(
     cpm = "cpm"
 )
 
-REMOTE_URL <- "https://swift.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
+#' Base URL pointing to the count data
+COUNTS_URL <- single_line_str(
+    "https://swift.rc.nectar.org.au/v1/
+    AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
+)
+#' Current version of the counts. This will be incremented when a newer
+#' version is released
 COUNTS_VERSION <- "0.2"
 
 #' Gets a SingleCellExperiment from curated metadata
 #'
 #' Given a data frame of Curated Atlas metadata obtained from [get_metadata()],
-#' returns a [`SingleCellExperiment::SingleCellExperiment-class`] object corresponding to the samples in that
-#' data frame
+#' returns a [`SingleCellExperiment::SingleCellExperiment-class`] object
+#' corresponding to the samples in that data frame
 #'
 #' @param data A data frame containing, at minimum, a `sample_` column, which
 #'   corresponds to a single cell sample ID. This can be obtained from the
 #'   [get_metadata()] function.
 #' @param assays A character vector whose elements must be either "counts"
-#'   and/or "cpm", representing the corresponding assay(s) you want to request.
-#'   By default only the count assay is downloaded. If you are interested in comparing a limited amount of genes,
-#'   the "cpm" assay is more appropriate.
+#'   and/or "cpm", representing the corresponding assay(s) you want to request.
+#'   By default only the count assay is downloaded. If you are interested in
+#'   comparing a limited amount of genes, the "cpm" assay is more appropriate.
 #' @param repository A character vector of length one. If provided, it should be
 #'   an HTTP URL pointing to the location where the single cell data is stored.
 #' @param cache_directory An optional character vector of length one. If
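COUNTS_URL is now built with single_line_str(), a helper that lives in R/utils.R and is not part of this diff. The importFrom(stringr, str_remove_all) directive added to the NAMESPACE above suggests it simply strips the line breaks and indentation introduced by wrapping a long string literal. A plausible sketch, not the package's actual definition:

# Hypothetical reconstruction of single_line_str() from R/utils.R (not shown
# in this commit): collapse a wrapped string literal back onto a single line.
single_line_str <- function(text) {
    stringr::str_remove_all(text, "\n[[:space:]]*")
}

# With a definition like this, COUNTS_URL evaluates to the same one-line URL
# that the removed REMOTE_URL constant held.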
@@ -54,15 +63,12 @@ COUNTS_VERSION <- "0.2"
 #' @importFrom rlang .data
 #' @importFrom stats setNames
 #' @importFrom S4Vectors DataFrame
-#'
 #' @export
-#'
-#'
 get_SingleCellExperiment <- function(
     data,
     assays = "counts",
     cache_directory = get_default_cache_dir(),
-    repository = REMOTE_URL,
+    repository = COUNTS_URL,
     features = NULL
 ) {
     # Parameter validation
@@ -88,7 +94,10 @@ get_SingleCellExperiment <- function(
     has_name(raw_data, c("cell_", "file_id_db")) |> assert_that()
 
     versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
-    versioned_cache_directory |> dir.create(showWarnings = FALSE, recursive = TRUE)
+    versioned_cache_directory |> dir.create(
+        showWarnings = FALSE,
+        recursive = TRUE
+    )
 
     subdirs <- assay_map[assays]
 
@@ -150,7 +159,8 @@ get_SingleCellExperiment <- function(
 #' Converts a data frame into a single SCE
 #' @param i Suffix to be added to the column names, to make them unique
 #' @param df The data frame to be converted
-#' @param dir_prefix The path to the single cell experiment, minus the final segment
+#' @param dir_prefix The path to the single cell experiment, minus the final
+#'   segment
 #' @param features The list of genes/rows of interest
 #' @return A SingleCellExperiment object
 #' @importFrom dplyr mutate filter
@@ -160,6 +170,7 @@ get_SingleCellExperiment <- function(
 #' @importFrom utils head
 #' @importFrom cli cli_alert_warning cli_abort
 #' @importFrom glue glue
+#' @importFrom stringr str_replace_all
 #' @noRd
 group_to_sce <- function(i, df, dir_prefix, features) {
     sce_path <- df$file_id_db |>
@@ -183,8 +194,12 @@ group_to_sce <- function(i, df, dir_prefix, features) {
     cells <- colnames(sce) |> intersect(df$cell_)
 
     if (length(cells) < nrow(df)){
-        cli_alert_warning("Some cells were filtered out because of extremely low counts. The number of cells in the SingleCellExperiment will be less than the number of cells you have selected from the metadata.")
-        df = filter(df, .data$cell_ %in% cells)
+        str_replace_all(
+            "Some cells were filtered out because of extremely low counts. The
+            number of cells in the SingleCellExperiment will be less than the
+            number of cells you have selected from the metadata."
+        )
+        df <- filter(df, .data$cell_ %in% cells)
     }
     else if (length(cells) > nrow(df)){
         cli_abort("This should never happen")
@@ -195,8 +210,8 @@ group_to_sce <- function(i, df, dir_prefix, features) {
 
     new_coldata <- df |>
         # We need to make the cell names globally unique, which we can guarantee
-        # by adding a suffix that is derived from file_id_db, which is the grouping
-        # variable
+        # by adding a suffix that is derived from file_id_db, which is the
+        # grouping variable
         mutate(original_cell_id = .data$cell_, cell_ = glue("{cell_}_{i}")) |>
         column_to_rownames("cell_") |>
         as("DataFrame")
@@ -231,14 +246,14 @@ group_to_sce <- function(i, df, dir_prefix, features) {
 #' @noRd
 #'
 sync_assay_files <- function(
-    url = parse_url(REMOTE_URL),
+    url = parse_url(COUNTS_URL),
     cache_dir,
     subdirs,
     files
 ) {
     # Find every combination of file name, sample id, and assay, since each
     # will be a separate file we need to download
-    files = expand.grid(
+    files <- expand.grid(
         filename = c("assays.h5", "se.rds"),
         sample_id = files,
         subdir = subdirs,
@@ -284,69 +299,3 @@ sync_assay_files <- function(
         output_file
     }, .progress = list(name = "Downloading files"))
 }
-
-#' Synchronises a single remote file with a local path
-#' @importFrom httr write_disk GET stop_for_status
-#' @importFrom cli cli_abort cli_alert_info
-#' @noRd
-sync_remote_file <- function(full_url, output_file, ...) {
-    if (!file.exists(output_file)) {
-        output_dir <- dirname(output_file)
-        dir.create(output_dir,
-            recursive = TRUE,
-            showWarnings = FALSE
-        )
-        cli_alert_info("Downloading {full_url} to {output_file}")
-
-        tryCatch(
-            GET(full_url, write_disk(output_file), ...) |> stop_for_status(),
-            error = function(e) {
-                # Clean up if we had an error
-                file.remove(output_file)
-                cli_abort("File {full_url} could not be downloaded. {e}")
-            }
-        )
-    }
-}
-
-#' Returns the default cache directory
-#'
-#' @return A length one character vector.
-#' @importFrom tools R_user_dir
-#' @importFrom utils packageName
-#' @noRd
-#'
-get_default_cache_dir <- function() {
-    packageName() |>
-        R_user_dir(
-            "cache"
-        ) |>
-        normalizePath() |>
-        suppressWarnings()
-}
-
-#' @importFrom assertthat assert_that
-#' @importFrom methods as
-#' @importFrom SeuratObject as.sparse
-#' @exportS3Method
-as.sparse.DelayedMatrix <- function(x) {
-    # This is glue to ensure the SCE -> Seurat conversion works properly with
-    # DelayedArray types
-    as(x, "dgCMatrix")
-}
-
-#' Given a data frame of HCA metadata, returns a Seurat object corresponding to
-#' the samples in that data frame
-#'
-#' @inheritDotParams get_SingleCellExperiment
-#' @importFrom SeuratObject as.Seurat
-#' @export
-#' @return A Seurat object containing the same data as a call to
-#'   get_SingleCellExperiment.
-#' @examples
-#' meta <- get_metadata() |> head(2)
-#' seurat <- get_seurat(meta)
-#'
-get_seurat <- function(...) {
-    get_SingleCellExperiment(...) |> as.Seurat(data = NULL)
-}
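The helpers removed here (sync_remote_file(), get_default_cache_dir(), as.sparse.DelayedMatrix(), get_seurat()) are not dropped from the package: the new Collate field lists utils.R and seurat.R, so they have presumably been moved into those files. User-facing behaviour should be unchanged, and the removed roxygen example still applies:

# Usage as documented in the removed roxygen example; assumes the
# CuratedAtlasQueryR package is installed and network access is available.
library(CuratedAtlasQueryR)
meta <- get_metadata() |> head(2)   # take the first two rows of metadata
seurat <- get_seurat(meta)          # download the counts and convert to Seurat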

R/dev.R

Lines changed: 38 additions & 10 deletions
@@ -13,14 +13,24 @@
 #'   `-openrc.sh` file instead of providing it here.
 #' @param credential_id The OpenStack application credential secret as a
 #'   character scalar
-#' @return NULL
+#' @return `NULL`, invisibly
 #' @keywords internal
-upload_swift = function(source, container, name = basename(source), credential_id = NULL, credential_secret = NULL){
+upload_swift <- function(
+    source,
+    container,
+    name = basename(source),
+    credential_id = NULL,
+    credential_secret = NULL
+) {
     # Create the basilisk environment
     swift_env <- basilisk::BasiliskEnvironment(
         envname="swift-nectar-upload",
         pkgname=packageName(),
-        packages=c("python-swiftclient==4.2.0", "python-keystoneclient==5.1.0", "python==3.10.9")
+        packages=c(
+            "python-swiftclient==4.2.0",
+            "python-keystoneclient==5.1.0",
+            "python==3.10.9"
+        )
     )
     proc <- basilisk::basiliskStart(swift_env)
 
@@ -38,7 +48,7 @@ upload_swift = function(source, container, name = basename(source), credential_i
     else {
         auth <- character()
     }
-    args = c(
+    args <- c(
         "-m",
         "swiftclient.shell",
         "--os-auth-url",
@@ -67,12 +77,20 @@ upload_swift = function(source, container, name = basename(source), credential_i
 #' @inheritDotParams upload_swift
 #' @examples
 #' \dontrun{
-#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
-#' update_database(metadata, "0.2.3", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
+#' metadata = CuratedAtlasQueryR::get_metadata() |>
+#'     head(10) |>
+#'     dplyr::collect()
+#' update_database(
+#'     metadata,
+#'     "0.2.3",
+#'     credential_id = "ABCDEFGHIJK",
+#'     credential_secret = "ABCD1234EFGH-5678IJK"
+#' )
 #' # Prints "metadata.0.2.3.parquet" if successful
 #' }
 #' @keywords internal
-update_database = function(metadata, version, ...){
+#' @inherit upload_swift return
+update_database <- function(metadata, version, ...){
     # These are optional dev packages
     rlang::check_installed(c("arrow", "glue", "basilisk"))
 
@@ -89,12 +107,22 @@ update_database = function(metadata, version, ...){
 #'   files, one for each dataset, e.g.
 #'   /vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2
 #' @inheritDotParams upload_swift
+#' @inherit upload_swift return
 #' @keywords internal
 #' @examples
 #' \dontrun{
-#' update_unharmonised("/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
+#' update_unharmonised(
+#'     "/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2",
+#'     credential_id = "ABCDEFGHIJK",
+#'     credential_secret = "ABCD1234EFGH-5678IJK"
+#' )
 #' }
-update_unharmonised = function(unharmonised_parquet_dir, ...){
+update_unharmonised <- function(unharmonised_parquet_dir, ...){
     # name="/" forces it have no prefix, ie be at the top level in the bucket
-    upload_swift(unharmonised_parquet_dir, container="unharmonised_metadata", name="/", ...)
+    upload_swift(
+        unharmonised_parquet_dir,
+        container="unharmonised_metadata",
+        name="/",
+        ...
+    )
 }
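The reformatted upload_swift() still drives python-swiftclient through a basilisk-managed environment: BasiliskEnvironment() pins the Python packages, basiliskStart() provisions the environment, and the args vector assembled above ("-m", "swiftclient.shell", ...) is what ultimately reaches the Python interpreter. The invocation itself is outside these hunks; a plausible sketch of how the remaining pieces could fit together:

# Sketch only: the exact runner used by the package is not shown in this diff,
# so calling system2() on the environment's python is an assumption.
proc <- basilisk::basiliskStart(swift_env)
basilisk::basiliskRun(proc, function(args) {
    # Roughly: python -m swiftclient.shell upload <container> <source> ...
    system2(reticulate::py_exe(), args)
}, args = args)
basilisk::basiliskStop(proc)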
