#' Environment that we use to cache the DuckDB connections
#'
#' `metadata_table` is a nested environment; `get_metadata()` stores the
#' tables it has already opened in it, keyed by a hash of the parquet file
#' list.
#' @noRd
cache <- rlang::env(
    metadata_table = rlang::env()
)
1111
#' Returns the URLs for all metadata files
#' @param databases A character vector specifying the names of the metadata
#'   files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
#' @export
#' @return A character vector of URLs to parquet files to download, one per
#'   element of `databases`
#' @examples
#' get_database_url("metadata.0.2.3.parquet")
get_database_url <- function(
    databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
) {
    # Vectorised over `databases`. `recycle0 = TRUE` makes a zero-length
    # input yield a zero-length result instead of a bare base URL.
    paste0(
        "https://object-store.rc.nectar.org.au/v1/",
        "AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/",
        databases,
        recycle0 = TRUE
    )
}
2122
#' URL pointing to the sample metadata file, which is smaller and for test,
#' demonstration, and vignette purposes only
#' @export
#' @return A character scalar consisting of the URL
#' @examples
#' get_metadata(remote_url = SAMPLE_DATABASE_URL, cache_directory = tempdir())
SAMPLE_DATABASE_URL <- single_line_str(
    # NOTE(review): single_line_str() is defined elsewhere; presumably it
    # joins this multi-line literal into a single-line URL -- confirm.
    "https://object-store.rc.nectar.org.au/v1/
    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/
    sample_metadata.0.2.3.parquet"
)
@@ -38,8 +39,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
3839# ' into [get_single_cell_experiment()] to obtain a
3940# ' [`SingleCellExperiment::SingleCellExperiment-class`]
4041# '
#' @param remote_url Optional character vector of any length. HTTP URL(s)
#'   pointing to the name and location of the parquet database(s).
4344# ' @param cache_directory Optional character vector of length 1. A file path on
4445# ' your local system to a directory (not a file) that will be used to store
4546# ' `metadata.parquet`
@@ -68,6 +69,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
6869# ' @importFrom httr progress
6970# ' @importFrom cli cli_alert_info hash_sha256
7071# ' @importFrom glue glue
72+ # ' @importFrom purrr walk
7173# '
7274# ' @details
7375# '
@@ -142,32 +144,38 @@ SAMPLE_DATABASE_URL <- single_line_str(
142144# ' get_metadata(cache_directory = path.expand('~'))
143145# '
get_metadata <- function(
    remote_url = get_database_url(),
    cache_directory = get_default_cache_dir(),
    use_cache = TRUE
) {
    # Synchronize remote files: every URL whose file is not yet present in
    # the cache directory is downloaded there under the URL's base name
    walk(remote_url, function(url) {
        # Calculate the file path from the URL
        path <- file.path(cache_directory, basename(url))
        if (!file.exists(path)) {
            report_file_sizes(url)
            sync_remote_file(
                url,
                path,
                progress(type = "down", con = stderr())
            )
        }
    })

    # The dot is escaped so that only a literal ".parquet" suffix matches.
    # NOTE(review): this picks up every parquet file found in
    # `cache_directory`, not only the ones named by `remote_url` -- confirm
    # that this is intended.
    all_parquet <- file.path(
        cache_directory,
        dir(cache_directory, pattern = "\\.parquet$")
    )

    # We try to avoid re-reading a set of parquet files
    # that is identical to a previous set by hashing the file list
    hash <- all_parquet |>
        paste0(collapse = "") |>
        hash_sha256()
    cached_connection <- cache$metadata_table[[hash]]

    if (!is.null(cached_connection) && isTRUE(use_cache)) {
        # If the file list is identical, just re-use the database table
        cached_connection
    } else {
        # Otherwise open a fresh read-only DuckDB connection over the files
        # and remember the resulting table under the file-list hash
        table <- duckdb() |>
            dbConnect(drv = _, read_only = TRUE) |>
            read_parquet(path = all_parquet)
        cache$metadata_table[[hash]] <- table
        table
    }
}
181+
0 commit comments