Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 941d3f6

Browse files
authored
Merge pull request #97 from stemangiola/fix-95
Add support for unharmonised metadata
2 parents d9554b9 + afcbfff commit 941d3f6

File tree

10 files changed

+316
-24
lines changed

10 files changed

+316
-24
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ S3method(as.sparse,DelayedMatrix)
44
export(get_SingleCellExperiment)
55
export(get_metadata)
66
export(get_seurat)
7+
export(get_unharmonised_metadata)
78
import(Seurat)
89
import(dbplyr)
910
importFrom(BiocGenerics,cbind)
@@ -49,6 +50,7 @@ importFrom(purrr,map)
4950
importFrom(purrr,map_int)
5051
importFrom(purrr,pmap_chr)
5152
importFrom(purrr,reduce)
53+
importFrom(purrr,set_names)
5254
importFrom(purrr,transpose)
5355
importFrom(rlang,.data)
5456
importFrom(stats,setNames)

R/dev.R

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,21 @@
11
# Utility scripts for development purposes, that are not exported to users
22

3-
#' Update the metadata database in nectar using a newly created data frame
4-
#' @param metadata The data frame to upload
5-
#' @param version The version for the new metadata as a character scalar, e.g.
6-
#' "0.2.3"
3+
#' Upload a file to the Nectar object store
4+
#' @param source A character scalar indicating the local path to the file to
5+
#' upload
6+
#' @param container A character scalar indicating the name of the container to
7+
#' upload to
8+
#' @param name An optional character scalar indicating the name the file should
9+
#' have after being uploaded. Defaults to being the basename of the source
10+
#' file.
711
#' @param credential_id The OpenStack application credential ID as a character
812
#' scalar. This is optional because you can alternatively source a
913
#' `-openrc.sh` file instead of providing it here.
1014
#' @param credential_secret The OpenStack application credential secret as a
1115
#' character scalar
12-
#' @noRd
13-
#' @example
14-
#' \dontrun{
15-
#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
16-
#' update_database(metadata, "0.2.3", "rfypdlunhrfopdnkrs", "3q5lw3qntafptdfsrdh-wa4p8h")
17-
#' # Prints "metadata.0.2.3.parquet" if successful
18-
#' }
19-
update_database = function(metadata, version, credential_id = NULL, credential_secret = NULL){
20-
# These are optional dev packages
21-
rlang::check_installed(c("arrow", "glue", "basilisk"))
22-
23-
# Create parquet
24-
dir <- tempdir()
25-
parquet_name <- glue::glue("metadata.{version}.parquet")
26-
parquet_path <- file.path(dir, parquet_name)
27-
arrow::write_parquet(metadata, sink=parquet_path)
28-
16+
#' @return NULL
17+
#' @keywords internal
18+
upload_swift = function(source, container, name = basename(source), credential_id = NULL, credential_secret = NULL){
2919
# Create the basilisk environment
3020
swift_env <- basilisk::BasiliskEnvironment(
3121
envname="swift-nectar-upload",
@@ -57,13 +47,54 @@ update_database = function(metadata, version, credential_id = NULL, credential_s
5747
"06d6e008e3e642da99d806ba3ea629c5",
5848
auth,
5949
"upload",
60-
"metadata",
61-
parquet_path,
50+
container,
51+
source,
6252
"--object-name",
63-
parquet_name
53+
name
6454
)
6555

6656
# Perform the upload
6757
system2(reticulate::py_exe(), args=args)
6858
basilisk::basiliskStop(proc)
59+
60+
invisible(NULL)
61+
}
62+
63+
#' Update the metadata database in nectar using a newly created data frame
64+
#' @param metadata The data frame to upload
65+
#' @param version The version for the new metadata as a character scalar, e.g.
66+
#' "0.2.3"
67+
#' @inheritDotParams upload_swift
68+
#' @examples
69+
#' \dontrun{
70+
#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
71+
#' update_database(metadata, "0.2.3", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
72+
#' # Prints "metadata.0.2.3.parquet" if successful
73+
#' }
74+
#' @keywords internal
75+
update_database <- function(metadata, version, ...){
  # arrow, glue and basilisk are optional, development-only dependencies,
  # so fail early with an install prompt if any of them is missing
  rlang::check_installed(c("arrow", "glue", "basilisk"))

  # The object name embeds the version, e.g. "metadata.0.2.3.parquet"
  parquet_name <- glue::glue("metadata.{version}.parquet")
  parquet_path <- file.path(tempdir(), parquet_name)

  # Remove the temporary parquet once the upload has finished (or failed), so
  # repeated calls do not accumulate large files in the session temp directory
  on.exit(unlink(parquet_path), add = TRUE)

  # Serialise the metadata data frame to parquet before uploading it
  arrow::write_parquet(metadata, sink = parquet_path)

  # Anything passed via `...` (e.g. credential_id, credential_secret) is
  # forwarded unchanged to upload_swift()
  upload_swift(parquet_path, container = "metadata", name = parquet_name, ...)
}
86+
87+
#' Update the unharmonised parquet files
88+
#' @param unharmonised_parquet_dir The path to a directory containing parquet
89+
#' files, one for each dataset, e.g.
90+
#' /vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2
91+
#' @inheritDotParams upload_swift
92+
#' @keywords internal
93+
#' @examples
94+
#' \dontrun{
95+
#' update_unharmonised("/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
96+
#' }
97+
update_unharmonised <- function(unharmonised_parquet_dir, ...){
  # Passing name = "/" gives the uploaded objects no prefix, i.e. the parquet
  # files are placed at the top level of the container. Arguments in `...`
  # are forwarded unchanged to upload_swift().
  upload_swift(
    unharmonised_parquet_dir,
    container = "unharmonised_metadata",
    name = "/",
    ...
  )
}

R/query.R

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,3 +448,52 @@ get_metadata <- function(
448448
dbConnect(drv = _, read_only = TRUE) |>
449449
tbl(db_path)
450450
}
451+
452+
#' Returns unharmonised metadata for selected datasets.
453+
#'
454+
#' Various metadata fields are *not* common between datasets, so it does not
455+
#' make sense for these to live in the main metadata table. This function is a
456+
#' utility that allows easy fetching of this data if necessary.
457+
#'
458+
#' @param dataset_ids A character vector, where each entry is a dataset ID
459+
#' obtained from the `$file_id` column of the table returned from
460+
#' [get_metadata()]
461+
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
462+
#' to the root URL under which all the unharmonised dataset files are located.
463+
#' @param cache_directory Optional character vector of length 1. A file path on
464+
#' your local system to a directory (not a file) that will be used to store
465+
#' the unharmonised metadata files.
466+
#' @importFrom purrr map set_names
467+
#' @importFrom glue glue
468+
#' @importFrom DBI dbConnect
469+
#' @importFrom duckdb duckdb
470+
#' @importFrom dplyr tbl
471+
#' @return A named list, where each name is a dataset file ID, and each value is
472+
#' a "lazy data frame", ie a `tbl`.
473+
#' @export
474+
#' @examples
475+
#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
476+
#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
477+
#' unharmonised_meta = get_unharmonised_metadata(dataset)
478+
#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
479+
#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
480+
get_unharmonised_metadata <- function(
  dataset_ids,
  remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
  cache_directory = get_default_cache_dir()
){
  # Local directory under which the per-dataset parquet files are stored
  cache_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")

  # One read-only DuckDB connection is shared by every returned table. It is
  # intentionally left open because the returned tables are lazy and still
  # need it when they are eventually collected.
  connection <- dbConnect(drv = duckdb(), read_only = TRUE)

  # Sync the parquet file for a single dataset into the cache directory and
  # return a lazy table pointing at the local copy
  fetch_dataset <- function(dataset_id){
    parquet_file <- glue::glue("{dataset_id}.parquet")
    cached_path <- file.path(cache_root, parquet_file)
    source_url <- glue("{remote_url}/{parquet_file}")

    sync_remote_file(
      source_url,
      cached_path,
      progress(type = "down", con = stderr())
    )

    tbl(connection, cached_path)
  }

  # set_names() names each ID by itself, so the result is a list keyed by
  # dataset ID
  map(set_names(dataset_ids), fetch_dataset)
}

README.Rmd

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,30 @@ metadata |>
275275
knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
276276
```
277277

278+
## Obtain Unharmonised Metadata
279+
280+
Various metadata fields are *not* common between datasets, so it does not
281+
make sense for these to live in the main metadata table. However, we can
282+
obtain them using the `get_unharmonised_metadata()` function.
283+
284+
Note how this table has additional columns that are not in the normal metadata:
285+
286+
```{r}
287+
dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
288+
unharmonised_meta = get_unharmonised_metadata(dataset)
289+
unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
290+
unharmonised_tbl
291+
```
292+
293+
If we have metadata from the normal metadata table that is from a single dataset,
294+
we can even join this additional metadata into one big data frame:
295+
```{r}
296+
harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
297+
dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
298+
```
299+
300+
301+
278302
# Cell metadata
279303

280304
Dataset-specific columns (definitions available at cellxgene.cziscience.com)

man/get_unharmonised_metadata.Rd

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/update_database.Rd

Lines changed: 39 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/update_unharmonised.Rd

Lines changed: 36 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/upload_swift.Rd

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-query.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,19 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {
156156
assay_2[, colnames(assay_1)]
157157
)
158158
})
159+
160+
test_that("get_unharmonised_metadata works with one ID", {
  # A single dataset ID should yield a list holding one lazy table under that ID
  id <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
  result <- get_unharmonised_metadata(id)
  dataset_tbl <- result[[id]]

  expect_type(result, "list")
  expect_s3_class(dataset_tbl, "tbl")
})
168+
169+
test_that("get_unharmonised_metadata works with multiple IDs", {
  # The result must be named by the requested IDs, in the requested order
  ids <- c(
    "838ea006-2369-4e2c-b426-b2a744a2b02b",
    "83b9cb97-9ee4-404d-8cdf-ccede8235356"
  )
  result <- get_unharmonised_metadata(ids)

  expect_equal(names(result), ids)
})

0 commit comments

Comments
 (0)