#' Returns unharmonised metadata for a single dataset.
#'
#' Various metadata fields are *not* common between datasets, so it does not
#' make sense for these to live in the main metadata table. This function is a
#' utility that allows easy fetching of this data if necessary.
#'
#' The dataset's parquet file is synchronised from `remote_url` into
#' `cache_directory` via `sync_remote_file()`, and the local copy is then
#' opened lazily through `conn`.
#'
#' @param dataset_id A character vector of length 1: a dataset ID obtained
#'   from the `$file_id` column of the table returned from [get_metadata()].
#' @param cells Optional character vector of cell IDs, matched against the
#'   `cell_` column of the unharmonised table. If `NULL` (the default), no
#'   filtering is applied and every cell in the dataset is returned.
#' @param conn A database connection used to open the parquet file. Defaults
#'   to a new read-only duckdb connection.
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
#'   to the root URL under which all the unharmonised dataset files are located.
#' @param cache_directory Optional character vector of length 1. A file path on
#'   your local system to a directory (not a file) that will be used to store
#'   the unharmonised metadata files.
#' @importFrom purrr map set_names
#' @importFrom glue glue
#' @importFrom DBI dbConnect
#' @importFrom duckdb duckdb
#' @importFrom dplyr tbl filter
#' @return A "lazy data frame", i.e. a `tbl`, backed by the locally cached
#'   parquet file for `dataset_id`. Use [dplyr::collect()] to load it into
#'   memory.
#' @examples
#' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
#' harmonised_meta <- get_metadata() |>
#'     dplyr::filter(file_id == dataset) |>
#'     dplyr::collect()
#' unharmonised_tbl <- get_unharmonised_dataset(dataset) |>
#'     dplyr::collect()
#' dplyr::left_join(
#'     harmonised_meta,
#'     unharmonised_tbl,
#'     by = c("file_id", "cell_")
#' )
get_unharmonised_dataset <- function(
    dataset_id,
    cells = NULL,
    conn = dbConnect(drv = duckdb(), read_only = TRUE),
    remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
    cache_directory = get_default_cache_dir()
) {
    # Cached files are namespaced by COUNTS_VERSION (defined elsewhere in the
    # package) and kept in an "unharmonised" sub-directory of the cache.
    unharmonised_root <- file.path(
        cache_directory,
        COUNTS_VERSION,
        "unharmonised"
    )
    file_name <- glue("{dataset_id}.parquet")
    local_path <- file.path(unharmonised_root, file_name)

    # Synchronise the remote parquet file to `local_path`, reporting download
    # progress on stderr.
    glue("{remote_url}/{file_name}") |>
        sync_remote_file(
            local_path,
            progress(type = "down", con = stderr())
        )

    unharmonised <- tbl(conn, local_path)

    # `cells = NULL` means "no restriction". Filtering on `%in% NULL` would
    # instead discard every row, which is never what the default should do.
    if (is.null(cells)) {
        return(unharmonised)
    }

    # `!!` forces `cells` to be taken from this function's environment, even
    # if the parquet file happens to contain a column of the same name.
    filter(unharmonised, cell_ %in% !!cells)
}
46+
#' Returns unharmonised metadata for a metadata query
#' @inherit get_unharmonised_dataset description
#' @param metadata A lazy data frame obtained from [get_metadata()], filtered
#'   down to some cells of interest
#' @inheritDotParams get_unharmonised_dataset -dataset_id -cells -conn
#' @return A tibble with two columns:
#'  * `file_id`: the same `file_id` as the main metadata table obtained from
#'    [get_metadata()]
#'  * `unharmonised`: a list column with one entry per `file_id`. Each entry
#'    is a "lazy data frame" (a `tbl`) of unharmonised metadata for that
#'    dataset, restricted to the cells present in the input `metadata`. Use
#'    [dplyr::collect()] on an entry to load it into memory.
#' @export
#' @importFrom dplyr group_by summarise filter collect
#' @examples
#' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
#' unharmonised <- get_unharmonised_metadata(harmonised)
get_unharmonised_metadata <- function(metadata, ...) {
    args <- list(...)

    # These three arguments are always derived from `metadata` below, so
    # accepting them through `...` would make do.call() fail with an opaque
    # "matched by multiple actual arguments" error. Fail early and clearly.
    reserved <- intersect(names(args), c("dataset_id", "cells", "conn"))
    if (length(reserved) > 0) {
        stop(
            "The following arguments are set automatically from `metadata` ",
            "and cannot be passed via `...`: ",
            paste(reserved, collapse = ", "),
            call. = FALSE
        )
    }

    # Reuse the connection backing `metadata` for every per-dataset query.
    conn <- metadata$src$con

    metadata |>
        # Only these two columns are needed, so avoid pulling the whole
        # (potentially very wide) metadata table into memory.
        dplyr::select(file_id, cell_) |>
        collect() |>
        group_by(file_id) |>
        summarise(
            # One lazy table per dataset, wrapped in list() so that it is
            # stored as a single list-column entry for the group.
            unharmonised = list(do.call(
                get_unharmonised_dataset,
                c(
                    list(
                        dataset_id = file_id[[1]],
                        cells = cell_,
                        conn = conn
                    ),
                    args
                )
            ))
        )
}
0 commit comments