Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit cc066dd

Browse files
authored
Merge pull request #100 from stemangiola/cache-connection
Cache connection
2 parents a58b669 + 237c8be commit cc066dd

File tree

5 files changed

+133
-99
lines changed

5 files changed

+133
-99
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import(Seurat)
99
import(dbplyr)
1010
importFrom(BiocGenerics,cbind)
1111
importFrom(DBI,dbConnect)
12+
importFrom(DBI,dbDisconnect)
1213
importFrom(HDF5Array,HDF5RealizationSink)
1314
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
1415
importFrom(S4Vectors,DataFrame)
@@ -55,6 +56,7 @@ importFrom(purrr,pmap_chr)
5556
importFrom(purrr,reduce)
5657
importFrom(purrr,set_names)
5758
importFrom(purrr,transpose)
59+
importFrom(purrr,walk)
5860
importFrom(rlang,.data)
5961
importFrom(stats,setNames)
6062
importFrom(tibble,column_to_rownames)

R/metadata.R

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#' Environment that we use to cache the DuckDB connections
2+
cache <- rlang::env(
  # Nested environment; get_metadata() stores each opened table in here under
  # a hash of its remote URL and cache directory.
  metadata_table = rlang::env()
)
5+
6+
7+
#' Gets the Curated Atlas metadata as a data frame.
8+
#'
9+
#' Downloads a parquet database of the Human Cell Atlas metadata to a local
10+
#' cache, and then opens it as a data frame. It can then be filtered and
11+
#' passed into [get_SingleCellExperiment()]
12+
#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`]
13+
#'
14+
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
15+
#' to the location of the parquet database.
16+
#' @param cache_directory Optional character vector of length 1. A file path on
17+
#' your local system to a directory (not a file) that will be used to store
18+
#' metadata.parquet
19+
#' @param use_cache Optional logical scalar. If `TRUE` (the default), and this
20+
#' function has been called before with the same parameters, then a cached
21+
#' reference to the table will be returned. If `FALSE`, a new connection will
22+
#' be created no matter what.
23+
#' @return A lazy data.frame subclass containing the metadata. You can interact
24+
#' with this object using most standard dplyr functions. For string matching,
25+
#' it is recommended that you use `stringr::str_like` to filter character
26+
#' columns, as `stringr::str_match` will not work.
27+
#' @export
28+
#' @examples
29+
#' library(dplyr)
30+
#' filtered_metadata <- get_metadata() |>
31+
#' filter(
32+
#' ethnicity == "African" &
33+
#' assay %LIKE% "%10x%" &
34+
#' tissue == "lung parenchyma" &
35+
#' cell_type %LIKE% "%CD4%"
36+
#' )
37+
#'
38+
#' @importFrom DBI dbConnect
39+
#' @importFrom duckdb duckdb
40+
#' @importFrom dplyr tbl
41+
#' @importFrom httr progress
42+
#' @importFrom cli cli_alert_info
43+
#'
44+
#' @details
45+
#'
46+
#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
47+
#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
48+
#'
49+
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
50+
#'
51+
#' Our representation harmonises the metadata at dataset, sample and cell levels in a single coherent database table.
52+
#'
53+
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
54+
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
55+
#'
56+
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
57+
#'
58+
#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
59+
#'
60+
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
61+
#'
62+
#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
63+
#'
64+
#' Through harmonisation and curation we introduced custom columns that are not present in the original CELLxGENE metadata:
65+
#'
66+
#' - `tissue_harmonised`: a coarser tissue name for better filtering
67+
#' - `age_days`: the number of days corresponding to the age
68+
#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
69+
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
70+
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
71+
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
72+
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
73+
#' - `sample_id_db`: Sample subdivision for internal use
74+
#' - `file_id_db`: File subdivision for internal use
75+
#' - `sample_`: Sample ID
76+
#' - `.sample_name`: How samples were defined
77+
#'
78+
#'
79+
#' **Possible cache path issues**
80+
#'
81+
#' If your default R cache path includes non-standard characters (e.g. a dash because of your user or organisation name), the following error can occur:
82+
#'
83+
#' Error in `db_query_fields.DBIConnection()`:
84+
#' ! Can't query fields.
85+
#' Caused by error:
86+
#' ! Parser Error: syntax error at or near "/"
87+
#' LINE 2: FROM /Users/bob/Library/Cach...
88+
#'
89+
#' The solution is to choose a different cache, for example
90+
#'
91+
#' get_metadata(cache_directory = path.expand('~'))
92+
#'
93+
get_metadata <- function(
  remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet",
  cache_directory = get_default_cache_dir(),
  use_cache = TRUE
) {
  # The in-memory cache is keyed on the (URL, cache directory) pair, so calls
  # with different parameters never share a table.
  hash <- c(remote_url, cache_directory) |>
    paste0(collapse = "") |>
    cli::hash_sha256()
  cached_connection <- cache$metadata_table[[hash]]

  # Cache hit: hand back the table created by an earlier call.
  if (!is.null(cached_connection) && isTRUE(use_cache)) {
    return(cached_connection)
  }

  # Name the local copy after the remote file instead of a fixed literal, so
  # that a custom `remote_url` does not share (or get shadowed by) the file
  # downloaded for another URL. For the default URL this is still
  # "metadata.0.2.3.parquet".
  db_path <- file.path(cache_directory, basename(remote_url))
  sync_remote_file(
    remote_url,
    db_path,
    progress(type = "down", con = stderr())
  )

  # Open a read-only DuckDB connection and expose the parquet file as a lazy
  # table.
  table <- duckdb() |>
    dbConnect(drv = _, read_only = TRUE) |>
    tbl(db_path)

  # The new table is stored even when `use_cache = FALSE`, replacing any
  # earlier entry for the same parameters.
  cache$metadata_table[[hash]] <- table
  table
}

R/query.R

Lines changed: 0 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -351,100 +351,3 @@ as.sparse.DelayedMatrix <- function(x) {
351351
get_seurat <- function(...) {
  # Forward every argument unchanged to get_SingleCellExperiment(), then
  # convert the result with as.Seurat(), passing data = NULL.
  as.Seurat(get_SingleCellExperiment(...), data = NULL)
}
354-
355-
#' Gets the Curated Atlas metadata as a data frame.
356-
#'
357-
#' Downloads a parquet database of the Human Cell Atlas metadata to a local
358-
#' cache, and then opens it as a data frame. It can then be filtered and
359-
#' passed into [get_SingleCellExperiment()]
360-
#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`]
361-
#'
362-
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
363-
#' to the location of the parquet database.
364-
#' @param cache_directory Optional character vector of length 1. A file path on
365-
#' your local system to a directory (not a file) that will be used to store
366-
#' metadata.parquet
367-
#' @return A lazy data.frame subclass containing the metadata. You can interact
368-
#' with this object using most standard dplyr functions. For string matching,
369-
#' it is recommended that you use `stringr::str_like` to filter character
370-
#' columns, as `stringr::str_match` will not work.
371-
#' @export
372-
#' @examples
373-
#' library(dplyr)
374-
#' filtered_metadata <- get_metadata() |>
375-
#' filter(
376-
#' ethnicity == "African" &
377-
#' assay %LIKE% "%10x%" &
378-
#' tissue == "lung parenchyma" &
379-
#' cell_type %LIKE% "%CD4%"
380-
#' )
381-
#'
382-
#' @importFrom DBI dbConnect
383-
#' @importFrom duckdb duckdb
384-
#' @importFrom dplyr tbl
385-
#' @importFrom httr progress
386-
#' @importFrom cli cli_alert_info
387-
#'
388-
#' @details
389-
#'
390-
#' The metadata was collected from the Bioconductor package `cellxgenedp`. it's vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
391-
#' The data for which the column `organism_name` included "Homo sapiens" was collected collected from `cellxgenedp`.
392-
#'
393-
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp`to the CELLxGENE portal.
394-
#'
395-
#' Our representation, harmonises the metadata at dataset, sample and cell levels, in a unique coherent database table.
396-
#'
397-
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
398-
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
399-
#'
400-
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
401-
#'
402-
#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
403-
#'
404-
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
405-
#'
406-
#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
407-
#'
408-
#' Through harmonisation and curation we introduced custom column, not present in the original CELLxGENE metadata
409-
#'
410-
#' - `tissue_harmonised`: a coarser tissue name for better filtering
411-
#' - `age_days`: the number of days corresponding to the age
412-
#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
413-
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is 3 out of four and so on.
414-
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
415-
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
416-
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
417-
#' - `sample_id_db`: Sample subdivision for internal use
418-
#' - `file_id_db`: File subdivision for internal use
419-
#' - `sample_`: Sample ID
420-
#' - `.sample_name`: How samples were defined
421-
#'
422-
#'
423-
#' **Possible cache path issues**
424-
#'
425-
#' If your default R cache path includes non-standard characters (e.g. dash because of your user or organisation name), the following error can manifest
426-
#'
427-
#' Error in `db_query_fields.DBIConnection()`:
428-
#' ! Can't query fields.
429-
#' Caused by error:
430-
#' ! Parser Error: syntax error at or near "/"
431-
#' LINE 2: FROM /Users/bob/Library/Cach...
432-
#'
433-
#' The solution is to choose a different cache, for example
434-
#'
435-
#' get_metadata(cache_directory = path.expand('~'))
436-
#'
437-
get_metadata <- function(
438-
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet",
439-
cache_directory = get_default_cache_dir()
440-
) {
441-
db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
442-
sync_remote_file(
443-
remote_url,
444-
db_path,
445-
progress(type = "down", con = stderr())
446-
)
447-
duckdb() |>
448-
dbConnect(drv = _, read_only = TRUE) |>
449-
tbl(db_path)
450-
}

man/get_metadata.Rd

Lines changed: 8 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-query.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,3 +183,10 @@ test_that("get_unharmonised_metadata() returns the appropriate data", {
183183
nrow(unharmonised)
184184
)
185185
})
186+
187+
test_that("get_metadata() is cached", {
  # Two calls with the same (default) arguments should return the very same
  # object rather than a freshly built table.
  first_call <- get_metadata()
  second_call <- get_metadata()

  expect_true(identical(first_call, second_call))
})

0 commit comments

Comments
 (0)