Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 34e64fc

Browse files
committed
WIP implementation of the original suggested API
1 parent 98748c9 commit 34e64fc

File tree

5 files changed

+135
-71
lines changed

5 files changed

+135
-71
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,11 @@ importFrom(dplyr,as_tibble)
2929
importFrom(dplyr,collect)
3030
importFrom(dplyr,filter)
3131
importFrom(dplyr,full_join)
32+
importFrom(dplyr,group_by)
3233
importFrom(dplyr,inner_join)
3334
importFrom(dplyr,mutate)
3435
importFrom(dplyr,pull)
36+
importFrom(dplyr,summarise)
3537
importFrom(dplyr,tbl)
3638
importFrom(dplyr,tibble)
3739
importFrom(dplyr,transmute)

R/query.R

Lines changed: 1 addition & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -447,53 +447,4 @@ get_metadata <- function(
447447
duckdb() |>
448448
dbConnect(drv = _, read_only = TRUE) |>
449449
tbl(db_path)
450-
}
451-
452-
#' Returns unharmonised metadata for selected datasets.
453-
#'
454-
#' Various metadata fields are *not* common between datasets, so it does not
455-
#' make sense for these to live in the main metadata table. This function is a
456-
#' utility that allows easy fetching of this data if necessary.
457-
#'
458-
#' @param dataset_ids A character vector, where each entry is a dataset ID
459-
#' obtained from the `$file_id` column of the table returned from
460-
#' [get_metadata()]
461-
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
462-
#' to the root URL under which all the unharmonised dataset files are located.
463-
#' @param cache_directory Optional character vector of length 1. A file path on
464-
#' your local system to a directory (not a file) that will be used to store
465-
#' the unharmonised metadata files.
466-
#' @importFrom purrr map set_names
467-
#' @importFrom glue glue
468-
#' @importFrom DBI dbConnect
469-
#' @importFrom duckdb duckdb
470-
#' @importFrom dplyr tbl
471-
#' @return A named list, where each name is a dataset file ID, and each value is
472-
#' a "lazy data frame", ie a `tbl`.
473-
#' @export
474-
#' @examples
475-
#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
476-
#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
477-
#' unharmonised_meta = get_unharmonised_metadata(dataset)
478-
#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
479-
#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
480-
get_unharmonised_metadata = function(
481-
dataset_ids,
482-
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
483-
cache_directory = get_default_cache_dir()
484-
){
485-
unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")
486-
duck = duckdb() |> dbConnect(drv = _, read_only = TRUE)
487-
dataset_ids |>
488-
set_names() |>
489-
map(function(dataset_id){
490-
file_name = glue::glue("{dataset_id}.parquet")
491-
local_path = file.path(unharmonised_root, file_name)
492-
glue("{remote_url}/{file_name}") |>
493-
sync_remote_file(
494-
local_path,
495-
progress(type = "down", con = stderr())
496-
)
497-
tbl(duck, local_path)
498-
})
499-
}
450+
}

R/unharmonised.R

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#' Returns unharmonised metadata for a single dataset.
#'
#' Various metadata fields are *not* common between datasets, so it does not
#' make sense for these to live in the main metadata table. This function is a
#' utility that allows easy fetching of this data if necessary.
#'
#' The dataset's parquet file is synchronised from `remote_url` into
#' `cache_directory` and then opened through `conn`.
#'
#' @param dataset_id A character vector of length 1. A dataset ID obtained
#'   from the `$file_id` column of the table returned from [get_metadata()]
#' @param cells An optional character vector of cell identifiers, matched
#'   against the `cell_` column of the unharmonised table. If `NULL` (the
#'   default), no filtering is applied and every cell in the dataset is
#'   returned.
#' @param conn A DBI connection used to open the parquet file. Defaults to a
#'   fresh read-only DuckDB connection.
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
#'   to the root URL under which all the unharmonised dataset files are located.
#' @param cache_directory Optional character vector of length 1. A file path on
#'   your local system to a directory (not a file) that will be used to store
#'   the unharmonised metadata files.
#' @importFrom purrr map set_names
#' @importFrom glue glue
#' @importFrom DBI dbConnect
#' @importFrom duckdb duckdb
#' @importFrom dplyr tbl filter
#' @return A "lazy data frame", ie a `tbl`, containing the unharmonised
#'   metadata of the requested dataset (restricted to `cells` when given).
#' @examples
#' \dontrun{
#' # Not run: this function carries no `@export` tag
#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
#' unharmonised_tbl = get_unharmonised_dataset(dataset) |> dplyr::collect()
#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
#' }
get_unharmonised_dataset <- function(
    dataset_id,
    cells = NULL,
    conn = duckdb() |> dbConnect(drv = _, read_only = TRUE),
    remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
    cache_directory = get_default_cache_dir()
){
    unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")
    file_name <- glue("{dataset_id}.parquet")
    local_path <- file.path(unharmonised_root, file_name)

    # Make sure the parquet file is present in the local cache before opening it
    glue("{remote_url}/{file_name}") |>
        sync_remote_file(
            local_path,
            progress(type = "down", con = stderr())
        )

    unharmonised <- tbl(conn, local_path)

    # `cells = NULL` means "no restriction"; filtering with `%in% NULL` would
    # instead discard every row
    if (is.null(cells)) {
        return(unharmonised)
    }
    filter(unharmonised, cell_ %in% cells)
}
46+
47+
#' Returns unharmonised metadata for a metadata query
#' @inherit get_unharmonised_dataset description
#' @param metadata A lazy data frame obtained from [get_metadata()], filtered
#'   down to some cells of interest
#' @inheritDotParams get_unharmonised_dataset -dataset_id -cells
#' @return A tibble with two columns:
#'  * `file_id`: the same `file_id` as the main metadata table obtained from [get_metadata()]
#'  * `unharmonised`: a list column with one element per `file_id`. Each
#'    element is the lazy data frame returned by [get_unharmonised_dataset()]
#'    for that dataset, restricted to the cells present in the input
#'    `metadata`.
#' @export
#' @importFrom dplyr group_by summarise filter collect
#' @examples
#' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
#' unharmonised <- get_unharmonised_metadata(harmonised)
get_unharmonised_metadata <- function(metadata, ...){
    args <- list(...)

    # Default to the connection backing `metadata`, resolved once rather than
    # per group. A caller-supplied `conn` takes precedence: passing it twice to
    # do.call() would fail with "formal argument matched by multiple actual
    # arguments".
    if (!"conn" %in% names(args)) {
        args$conn <- metadata$src$con
    }

    metadata |>
        # Only these two columns are used below, so avoid collecting the rest
        dplyr::select(file_id, cell_) |>
        collect() |>
        group_by(file_id) |>
        summarise(
            # One call per dataset: the dataset ID is constant within a group,
            # and `cell_` holds the cells requested from that dataset
            unharmonised = list(dataset_id = file_id[[1]], cells = cell_) |>
                c(args) |>
                do.call(get_unharmonised_dataset, args = _) |>
                list()
        )
}

man/get_unharmonised_dataset.Rd

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_unharmonised_metadata.Rd

Lines changed: 18 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)