Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 0e0fc5f

Browse files
committed
Add support for unharmonised metadata
1 parent d9554b9 commit 0e0fc5f

File tree

4 files changed

+110
-24
lines changed

4 files changed

+110
-24
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ S3method(as.sparse,DelayedMatrix)
44
export(get_SingleCellExperiment)
55
export(get_metadata)
66
export(get_seurat)
7+
export(get_unharmonised_metadata)
78
import(Seurat)
89
import(dbplyr)
910
importFrom(BiocGenerics,cbind)

R/dev.R

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,21 @@
11
# Utility scripts for development purposes, that are not exported to users
22

3-
#' Update the metadata database in nectar using a newly created data frame
4-
#' @param metadata The data frame to upload
5-
#' @param version The version for the new metadata as a character scalar, e.g.
6-
#' "0.2.3"
3+
#' Upload a file to the Nectar object store
4+
#' @param source A character scalar indicating the local path to the file to
5+
#' upload
6+
#' @param container A character scalar indicating the name of the container to
7+
#' upload to
8+
#' @param name An optional character scalar indicating the name the file should
9+
#' have after being uploaded. Defaults to being the basename of the source
10+
#' file.
711
#' @param credential_id The OpenStack application credential ID as a character
812
#' scalar. This is optional because you can alternatively source a
913
#' `-openrc.sh` file instead of providing it here.
1014
#' @param credential_secret The OpenStack application credential secret as a
1115
#' character scalar
12-
#' @noRd
13-
#' @example
14-
#' \dontrun{
15-
#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
16-
#' update_database(metadata, "0.2.3", "rfypdlunhrfopdnkrs", "3q5lw3qntafptdfsrdh-wa4p8h")
17-
#' # Prints "metadata.0.2.3.parquet" if successful
18-
#' }
19-
update_database = function(metadata, version, credential_id = NULL, credential_secret = NULL){
20-
# These are optional dev packages
21-
rlang::check_installed(c("arrow", "glue", "basilisk"))
22-
23-
# Create parquet
24-
dir <- tempdir()
25-
parquet_name <- glue::glue("metadata.{version}.parquet")
26-
parquet_path <- file.path(dir, parquet_name)
27-
arrow::write_parquet(metadata, sink=parquet_path)
28-
16+
#' @return NULL
17+
#' @keywords internal
18+
upload_swift = function(source, container, name = basename(source), credential_id = NULL, credential_secret = NULL){
2919
# Create the basilisk environment
3020
swift_env <- basilisk::BasiliskEnvironment(
3121
envname="swift-nectar-upload",
@@ -57,13 +47,54 @@ update_database = function(metadata, version, credential_id = NULL, credential_s
5747
"06d6e008e3e642da99d806ba3ea629c5",
5848
auth,
5949
"upload",
60-
"metadata",
61-
parquet_path,
50+
container,
51+
source,
6252
"--object-name",
63-
parquet_name
53+
name
6454
)
6555

6656
# Perform the upload
6757
system2(reticulate::py_exe(), args=args)
6858
basilisk::basiliskStop(proc)
59+
60+
invisible(NULL)
61+
}
62+
63+
#' Update the metadata database in nectar using a newly created data frame
#' @param metadata The data frame to upload
#' @param version The version for the new metadata as a character scalar, e.g.
#'   "0.2.3"
#' @inheritDotParams upload_swift
#' @return `NULL`, invisibly. Called for its side effect of uploading the
#'   versioned parquet file to the object store.
#' @examples
#' \dontrun{
#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
#' update_database(metadata, "0.2.3", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
#' # Prints "metadata.0.2.3.parquet" if successful
#' }
#' @keywords internal
update_database = function(metadata, version, ...){
    # These are optional dev packages, so they are checked here rather than
    # being declared as hard dependencies of the package
    rlang::check_installed(c("arrow", "glue", "basilisk"))

    # Serialise the metadata into a versioned parquet file in a temp directory
    dir <- tempdir()
    parquet_name <- glue::glue("metadata.{version}.parquet")
    parquet_path <- file.path(dir, parquet_name)
    arrow::write_parquet(metadata, sink=parquet_path)

    # Upload into the "metadata" container, keeping the versioned file name
    upload_swift(parquet_path, container="metadata", name=parquet_name, ...)
}
86+
87+
#' Update the unharmonised parquet files
#' @param unharmonised_parquet_dir The path to a directory containing parquet
#'   files, one for each dataset, e.g.
#'   /vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2
#' @inheritDotParams upload_swift
#' @keywords internal
#' @examples
#' \dontrun{
#' update_unharmonised("/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
#' }
update_unharmonised = function(unharmonised_parquet_dir, ...){
    # Passing name="/" forces the upload to have no prefix, i.e. the files
    # end up at the top level of the bucket
    upload_swift(
        unharmonised_parquet_dir,
        container = "unharmonised_metadata",
        name = "/",
        ...
    )
}

R/query.R

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,3 +448,41 @@ get_metadata <- function(
448448
dbConnect(drv = _, read_only = TRUE) |>
449449
tbl(db_path)
450450
}
451+
452+
#' Returns unharmonised metadata for selected datasets.
#'
#' Various metadata fields are *not* common between datasets, so it does not make
#' sense for these to live in the main metadata table. This function is a
#' utility that allows easy fetching of this data if necessary.
#'
#' @param dataset_ids A character vector, where each entry is a dataset ID
#'   obtained from the `$dataset_id` column of the table returned from
#'   [get_metadata()]
#' @param remote_url A character scalar: the base URL of the object store
#'   container that holds one parquet file of unharmonised metadata per dataset
#' @param cache_directory A character scalar: the local directory under which
#'   the downloaded parquet files will be cached
#' @return A named list with one entry per element of `dataset_ids`: the name
#'   is the dataset ID and the value is a lazy `tbl` backed by the
#'   corresponding parquet file
#' @export
#' @examples
#' \dontrun{
#' dataset_id = "838ea006-2369-4e2c-b426-b2a744a2b02b"
#' harmonised_meta = get_metadata() |>
#'     dplyr::filter(dataset_id == "838ea006-2369-4e2c-b426-b2a744a2b02b") |>
#'     dplyr::collect()
#' unharmonised_meta = get_unharmonised_metadata(dataset_id)
#' unharmonised_tbl = unharmonised_meta[[dataset_id]]
#' dplyr::inner_join(harmonised_meta, unharmonised_tbl, by=c("dataset_id", "cell_"))
#' }
get_unharmonised_metadata = function(
    dataset_ids,
    remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
    cache_directory = get_default_cache_dir()
){
    unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")
    # NOTE: the connection is deliberately left open, because the returned
    # lazy tbls remain backed by it and evaluating them later requires it
    duck = duckdb() |> dbConnect(drv = _, read_only = TRUE)
    dataset_ids |>
        purrr::set_names() |>
        purrr::map(function(dataset_id){
            # Each dataset's unharmonised metadata lives in "<dataset_id>.parquet"
            file_name = glue::glue("{dataset_id}.parquet")
            local_path = file.path(unharmonised_root, file_name)
            # Download the remote parquet file into the local cache if needed
            glue::glue("{remote_url}/{file_name}") |>
                sync_remote_file(
                    local_path,
                    progress(type = "down", con = stderr())
                )
            # Expose the cached parquet file as a lazy duckdb-backed tbl
            tbl(duck, local_path)
        })
}

tests/testthat/test-query.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,19 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {
156156
assay_2[, colnames(assay_1)]
157157
)
158158
})
159+
160+
test_that("get_unharmonised_metadata works with one ID", {
    dataset_id = "838ea006-2369-4e2c-b426-b2a744a2b02b"
    unharmonised_meta = get_unharmonised_metadata(dataset_id)
    unharmonised_tbl = unharmonised_meta[[dataset_id]]

    # A bare list is not an S3 object, so expect_s3_class() would fail on it;
    # check the base type instead
    expect_type(unharmonised_meta, "list")
    expect_named(unharmonised_meta, dataset_id)
    expect_s3_class(unharmonised_tbl, "tbl")
})
168+
169+
test_that("get_unharmonised_metadata works with multiple IDs", {
    # Dataset IDs must not carry a ".parquet" suffix: the function appends
    # it itself when constructing the remote file name
    dataset_ids = c(
        "838ea006-2369-4e2c-b426-b2a744a2b02b",
        "83b9cb97-9ee4-404d-8cdf-ccede8235356"
    )
    unharmonised_meta = get_unharmonised_metadata(dataset_ids)

    expect_equal(names(unharmonised_meta), dataset_ids)
})

0 commit comments

Comments
 (0)