|
#' In-session cache for DuckDB-backed tables
#'
#' The nested `metadata_table` environment stores the lazy tables created by
#' `get_metadata()`, keyed by a hash of the arguments they were built from.
cache <- rlang::env(metadata_table = rlang::env())
| 5 | + |
| 6 | + |
| 7 | +#' Gets the Curated Atlas metadata as a data frame. |
| 8 | +#' |
| 9 | +#' Downloads a parquet database of the Human Cell Atlas metadata to a local |
| 10 | +#' cache, and then opens it as a data frame. It can then be filtered and |
| 11 | +#' passed into [get_SingleCellExperiment()] |
| 12 | +#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`] |
| 13 | +#' |
| 14 | +#' @param remote_url Optional character vector of length 1. An HTTP URL pointing |
| 15 | +#' to the location of the parquet database. |
| 16 | +#' @param cache_directory Optional character vector of length 1. A file path on |
| 17 | +#' your local system to a directory (not a file) that will be used to store |
| 18 | +#' metadata.parquet |
| 19 | +#' @param use_cache Optional logical scalar. If `TRUE` (the default), and this |
| 20 | +#' function has been called before with the same parameters, then a cached |
| 21 | +#' reference to the table will be returned. If `FALSE`, a new connection will |
| 22 | +#' be created no matter what. |
| 23 | +#' @return A lazy data.frame subclass containing the metadata. You can interact |
| 24 | +#' with this object using most standard dplyr functions. For string matching, |
| 25 | +#' it is recommended that you use `stringr::str_like` to filter character |
| 26 | +#' columns, as `stringr::str_match` will not work. |
| 27 | +#' @export |
| 28 | +#' @examples |
| 29 | +#' library(dplyr) |
| 30 | +#' filtered_metadata <- get_metadata() |> |
| 31 | +#' filter( |
| 32 | +#' ethnicity == "African" & |
| 33 | +#' assay %LIKE% "%10x%" & |
| 34 | +#' tissue == "lung parenchyma" & |
| 35 | +#' cell_type %LIKE% "%CD4%" |
| 36 | +#' ) |
| 37 | +#' |
| 38 | +#' @importFrom DBI dbConnect |
| 39 | +#' @importFrom duckdb duckdb |
| 40 | +#' @importFrom dplyr tbl |
| 41 | +#' @importFrom httr progress |
| 42 | +#' @importFrom cli cli_alert_info |
| 43 | +#' |
| 44 | +#' @details |
| 45 | +#' |
| 46 | +#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata. |
| 47 | +#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`. |
| 48 | +#' |
| 49 | +#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal. |
| 50 | +#' |
| 51 | +#' Our representation harmonises the metadata at dataset, sample and cell levels, in a single, coherent database table. |
| 52 | +#' |
| 53 | +#' Dataset-specific columns (definitions available at cellxgene.cziscience.com) |
| 54 | +#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization` |
| 55 | +#' |
| 56 | +#' Sample-specific columns (definitions available at cellxgene.cziscience.com) |
| 57 | +#' |
| 58 | +#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x` |
| 59 | +#' |
| 60 | +#' Cell-specific columns (definitions available at cellxgene.cziscience.com) |
| 61 | +#' |
| 62 | +#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler` |
| 63 | +#' |
| 64 | +#' Through harmonisation and curation we introduced custom columns that are not present in the original CELLxGENE metadata: |
| 65 | +#' |
| 66 | +#' - `tissue_harmonised`: a coarser tissue name for better filtering |
| 67 | +#' - `age_days`: the number of days corresponding to the age |
| 68 | +#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR |
| 69 | +#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on. |
| 70 | +#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation |
| 71 | +#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference |
| 72 | +#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference |
| 73 | +#' - `sample_id_db`: Sample subdivision for internal use |
| 74 | +#' - `file_id_db`: File subdivision for internal use |
| 75 | +#' - `sample_`: Sample ID |
| 76 | +#' - `.sample_name`: How samples were defined |
| 77 | +#' |
| 78 | +#' |
| 79 | +#' **Possible cache path issues** |
| 80 | +#' |
| 81 | +#' If your default R cache path includes non-standard characters (e.g. a dash, because of your user or organisation name), the following error can manifest: |
| 82 | +#' |
| 83 | +#' Error in `db_query_fields.DBIConnection()`: |
| 84 | +#' ! Can't query fields. |
| 85 | +#' Caused by error: |
| 86 | +#' ! Parser Error: syntax error at or near "/" |
| 87 | +#' LINE 2: FROM /Users/bob/Library/Cach... |
| 88 | +#' |
| 89 | +#' The solution is to choose a different cache, for example |
| 90 | +#' |
| 91 | +#' get_metadata(cache_directory = path.expand('~')) |
| 92 | +#' |
get_metadata <- function(
    remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet",
    cache_directory = get_default_cache_dir(),
    use_cache = TRUE
) {
    # Fail early on malformed arguments: both values are hashed into the cache
    # key and used to build the local file path below, so each must be a
    # single, non-missing string.
    stopifnot(
        is.character(remote_url),
        length(remote_url) == 1L,
        !is.na(remote_url),
        is.character(cache_directory),
        length(cache_directory) == 1L,
        !is.na(cache_directory)
    )

    # Key into the in-session cache: one entry per (URL, cache directory) pair.
    hash <- c(remote_url, cache_directory) |>
        paste0(collapse = "") |>
        cli::hash_sha256()
    cached_connection <- cache$metadata_table[[hash]]
    if (!is.null(cached_connection) && isTRUE(use_cache)) {
        return(cached_connection)
    }

    # Name the local copy after the remote file, so that a non-default
    # `remote_url` is not stored under (or confused with) the default file.
    # For the default URL this is still "metadata.0.2.3.parquet".
    db_path <- file.path(cache_directory, basename(remote_url))
    sync_remote_file(
        remote_url,
        db_path,
        progress(type = "down", con = stderr())
    )

    connection <- dbConnect(duckdb(), read_only = TRUE)

    # Pass the path to DuckDB as a quoted SQL string literal inside
    # read_parquet() rather than as a bare table name. An unquoted path in the
    # FROM clause is what triggers the 'Parser Error: syntax error at or near
    # "/"' described in this function's documentation.
    parquet_query <- paste0(
        "SELECT * FROM read_parquet(",
        DBI::dbQuoteString(connection, db_path),
        ")"
    )
    table <- tbl(connection, dplyr::sql(parquet_query))

    # Remember the table even when `use_cache = FALSE`, so that later cached
    # calls return the most recently created connection.
    cache$metadata_table[[hash]] <- table
    table
}
0 commit comments