Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 4fee95a

Browse files
committed
BiocCheck
1 parent 35fbc4b commit 4fee95a

File tree

9 files changed

+232
-438
lines changed

9 files changed

+232
-438
lines changed

.github/workflows/check-bioc.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,8 @@ jobs:
243243
dir('check', 'tar.gz$', full.names = TRUE),
244244
`quit-with-status` = TRUE,
245245
`no-check-R-ver` = TRUE,
246-
`no-check-bioc-help` = TRUE
246+
`no-check-bioc-help` = TRUE,
247+
`new-package` = TRUE
247248
)
248249
shell: Rscript {0}
249250

DESCRIPTION

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ Imports:
9090
tibble,
9191
utils,
9292
dbplyr (>= 2.3.0),
93-
duckdb
93+
duckdb,
94+
stringr
9495
Suggests:
9596
here,
9697
stringr,
@@ -130,3 +131,12 @@ URL: https://github.com/stemangiola/CuratedAtlasQueryR
130131
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
131132
VignetteBuilder: knitr
132133
Roxygen: list(markdown = TRUE)
134+
Collate:
135+
'utils.R'
136+
'counts.R'
137+
'dev.R'
138+
'metadata.R'
139+
'seurat.R'
140+
'unharm.R'
141+
'unharmonised.R'
142+
'zzz.R'

NAMESPACE

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,13 @@ export(get_SingleCellExperiment)
55
export(get_metadata)
66
export(get_seurat)
77
export(get_unharmonised_metadata)
8-
import(Seurat)
9-
import(dbplyr)
108
importFrom(BiocGenerics,cbind)
119
importFrom(DBI,dbConnect)
1210
importFrom(DBI,dbDisconnect)
1311
importFrom(HDF5Array,HDF5RealizationSink)
1412
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
1513
importFrom(S4Vectors,DataFrame)
14+
importFrom(Seurat,as.SingleCellExperiment)
1615
importFrom(SeuratObject,as.Seurat)
1716
importFrom(SeuratObject,as.sparse)
1817
importFrom(SingleCellExperiment,SingleCellExperiment)
@@ -26,6 +25,7 @@ importFrom(cli,cli_abort)
2625
importFrom(cli,cli_alert_info)
2726
importFrom(cli,cli_alert_success)
2827
importFrom(cli,cli_alert_warning)
28+
importFrom(cli,hash_sha256)
2929
importFrom(dbplyr,remote_con)
3030
importFrom(dplyr,as_tibble)
3131
importFrom(dplyr,collect)
@@ -59,6 +59,7 @@ importFrom(purrr,set_names)
5959
importFrom(purrr,walk)
6060
importFrom(rlang,.data)
6161
importFrom(stats,setNames)
62+
importFrom(stringr,str_remove_all)
6263
importFrom(tibble,column_to_rownames)
6364
importFrom(tools,R_user_dir)
6465
importFrom(utils,head)

R/dev.R

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,22 @@
1515
#' character scalar
1616
#' @return NULL
1717
#' @keywords internal
18-
upload_swift = function(source, container, name = basename(source), credential_id = NULL, credential_secret = NULL){
18+
upload_swift <- function(
19+
source,
20+
container,
21+
name = basename(source),
22+
credential_id = NULL,
23+
credential_secret = NULL
24+
) {
1925
# Create the basilisk environment
2026
swift_env <- basilisk::BasiliskEnvironment(
2127
envname="swift-nectar-upload",
2228
pkgname=packageName(),
23-
packages=c("python-swiftclient==4.2.0", "python-keystoneclient==5.1.0", "python==3.10.9")
29+
packages=c(
30+
"python-swiftclient==4.2.0",
31+
"python-keystoneclient==5.1.0",
32+
"python==3.10.9"
33+
)
2434
)
2535
proc <- basilisk::basiliskStart(swift_env)
2636

@@ -38,7 +48,7 @@ upload_swift = function(source, container, name = basename(source), credential_i
3848
else {
3949
auth <- character()
4050
}
41-
args = c(
51+
args <- c(
4252
"-m",
4353
"swiftclient.shell",
4454
"--os-auth-url",
@@ -67,12 +77,19 @@ upload_swift = function(source, container, name = basename(source), credential_i
6777
#' @inheritDotParams upload_swift
6878
#' @examples
6979
#' \dontrun{
70-
#' metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
71-
#' update_database(metadata, "0.2.3", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
80+
#' metadata = CuratedAtlasQueryR::get_metadata() |>
81+
#' head(10) |>
82+
#' dplyr::collect()
83+
#' update_database(
84+
#' metadata,
85+
#' "0.2.3",
86+
#' credential_id = "ABCDEFGHIJK",
87+
#' credential_secret = "ABCD1234EFGH-5678IJK"
88+
#' )
7289
#' # Prints "metadata.0.2.3.parquet" if successful
7390
#' }
7491
#' @keywords internal
75-
update_database = function(metadata, version, ...){
92+
update_database <- function(metadata, version, ...){
7693
# These are optional dev packages
7794
rlang::check_installed(c("arrow", "glue", "basilisk"))
7895

@@ -89,12 +106,22 @@ update_database = function(metadata, version, ...){
89106
#' files, one for each dataset, e.g.
90107
#' /vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2
91108
#' @inheritDotParams upload_swift
109+
#' @inherit upload_swift return
92110
#' @keywords internal
93111
#' @examples
94112
#' \dontrun{
95-
#' update_unharmonised("/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
113+
#' update_unharmonised(
114+
#' "/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2",
115+
#' credential_id = "ABCDEFGHIJK",
116+
#' credential_secret = "ABCD1234EFGH-5678IJK"
117+
#' )
96118
#' }
97-
update_unharmonised = function(unharmonised_parquet_dir, ...){
119+
update_unharmonised <- function(unharmonised_parquet_dir, ...){
98120
# name="/" forces it to have no prefix, i.e. be at the top level in the bucket
99-
upload_swift(unharmonised_parquet_dir, container="unharmonised_metadata", name="/", ...)
121+
upload_swift(
122+
unharmonised_parquet_dir,
123+
container="unharmonised_metadata",
124+
name="/",
125+
...
126+
)
100127
}

R/metadata.R

Lines changed: 82 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,34 @@
1+
# Functions that relate to the harmonised metadata database
2+
3+
#' @include utils.R
4+
NULL
5+
16
#' Environment that we use to cache the DuckDB connections
2-
cache = rlang::env(
7+
cache <- rlang::env(
38
metadata_table = rlang::env()
49
)
510

11+
DATABASE_URL = single_line_str(
12+
"https://object-store.rc.nectar.org.au/v1/
13+
AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
14+
)
615

716
#' Gets the Curated Atlas metadata as a data frame.
8-
#'
9-
#' Downloads a parquet database of the Human Cell Atlas metadata to a local
10-
#' cache, and then opens it as a data frame. It can then be filtered and
11-
#' passed into [get_SingleCellExperiment()]
12-
#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`]
17+
#'
18+
#' Downloads a parquet database of the Human Cell Atlas metadata to a local
19+
#' cache, and then opens it as a data frame. It can then be filtered and passed
20+
#' into [get_SingleCellExperiment()] to obtain a
21+
#' [`SingleCellExperiment::SingleCellExperiment-class`]
1322
#'
1423
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
1524
#' to the location of the parquet database.
1625
#' @param cache_directory Optional character vector of length 1. A file path on
1726
#' your local system to a directory (not a file) that will be used to store
1827
#' metadata.parquet
1928
#' @param use_cache Optional logical scalar. If `TRUE` (the default), and this
20-
#' function has been called before with the same parameters, then a cached
21-
#' reference to the table will be returned. If `FALSE`, a new connection will
22-
#' be created no matter what.
29+
#' function has been called before with the same parameters, then a cached
30+
#' reference to the table will be returned. If `FALSE`, a new connection will
31+
#' be created no matter what.
2332
#' @return A lazy data.frame subclass containing the metadata. You can interact
2433
#' with this object using most standard dplyr functions. For string matching,
2534
#' it is recommended that you use `stringr::str_like` to filter character
@@ -39,63 +48,87 @@ cache = rlang::env(
3948
#' @importFrom duckdb duckdb
4049
#' @importFrom dplyr tbl
4150
#' @importFrom httr progress
42-
#' @importFrom cli cli_alert_info
43-
#'
44-
#' @details
45-
#'
46-
#' The metadata was collected from the Bioconductor package `cellxgenedp`. it's vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
47-
#' The data for which the column `organism_name` included "Homo sapiens" was collected collected from `cellxgenedp`.
48-
#'
49-
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp`to the CELLxGENE portal.
50-
#'
51-
#' Our representation, harmonises the metadata at dataset, sample and cell levels, in a unique coherent database table.
52-
#'
51+
#' @importFrom cli cli_alert_info hash_sha256
52+
#'
53+
#' @details
54+
#'
55+
#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its
56+
#' vignette `using_cellxgenedp` provides an overview of the columns in the
57+
#' metadata. The data for which the column `organism_name` included "Homo
58+
#' sapiens" was collected from `cellxgenedp`.
59+
#'
60+
#' The columns `dataset_id` and `file_id` link the datasets explorable through
61+
#' `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
62+
#'
63+
#' Our representation harmonises the metadata at dataset, sample and cell
64+
#' levels, in a unique coherent database table.
65+
#'
5366
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
54-
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
55-
#'
67+
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`,
68+
#' `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`,
69+
#' `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`,
70+
#' `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`,
71+
#' `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`,
72+
#' `user_submitted`, `x_normalization`
73+
#'
5674
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
57-
#'
58-
#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
59-
#'
75+
#'
76+
#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`,
77+
#' `development_stage`, `development_stage_ontology_term_id`, `ethnicity`,
78+
#' `ethnicity_ontology_term_id`, `experiment___`, `organism`,
79+
#' `organism_ontology_term_id`, `sample_placeholder`, `sex`,
80+
#' `sex_ontology_term_id`, `tissue`, `tissue_harmonised`,
81+
#' `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`,
82+
#' `is_primary_data.x`
83+
#'
6084
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
61-
#'
62-
#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
63-
#'
64-
#' Through harmonisation and curation we introduced custom column, not present in the original CELLxGENE metadata
65-
#'
85+
#'
86+
#' `cell_`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`,
87+
#' `confidence_class`, `cell_annotation_azimuth_l2`,
88+
#' `cell_annotation_blueprint_singler`
89+
#'
90+
#' Through harmonisation and curation we introduced custom columns, not present
91+
#' in the original CELLxGENE metadata
92+
#'
6693
#' - `tissue_harmonised`: a coarser tissue name for better filtering
6794
#' - `age_days`: the number of days corresponding to the age
68-
#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
69-
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is 3 out of four and so on.
95+
#' - `cell_type_harmonised`: the consensus call identity (for immune cells)
96+
#' using the original and three novel annotations using Seurat Azimuth and
97+
#' SingleR
98+
#' - `confidence_class`: an ordinal class of how confident
99+
#' `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four and
100+
#' so on.
70101
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
71-
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
72-
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
102+
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using
103+
#' Blueprint reference
104+
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco
105+
#' reference
73106
#' - `sample_id_db`: Sample subdivision for internal use
74107
#' - `file_id_db`: File subdivision for internal use
75108
#' - `sample_`: Sample ID
76109
#' - `.sample_name`: How samples were defined
77-
#'
78-
#'
110+
#'
111+
#'
79112
#' **Possible cache path issues**
80-
#'
81-
#' If your default R cache path includes non-standard characters (e.g. dash because of your user or organisation name), the following error can manifest
82-
#'
83-
#' Error in `db_query_fields.DBIConnection()`:
84-
#' ! Can't query fields.
85-
#' Caused by error:
86-
#' ! Parser Error: syntax error at or near "/"
87-
#' LINE 2: FROM /Users/bob/Library/Cach...
88-
#'
113+
#'
114+
#' If your default R cache path includes non-standard characters (e.g. dash
115+
#' because of your user or organisation name), the following error can manifest
116+
#'
117+
#' Error in `db_query_fields.DBIConnection()`: ! Can't query fields. Caused by
118+
#' error: ! Parser Error: syntax error at or near "/" LINE 2: FROM
119+
#' /Users/bob/Library/Cach...
120+
#'
89121
#' The solution is to choose a different cache, for example
90-
#'
122+
#'
91123
#' get_metadata(cache_directory = path.expand('~'))
92124
#'
93125
get_metadata <- function(
94-
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet",
126+
remote_url = DATABASE_URL,
95127
cache_directory = get_default_cache_dir(),
96128
use_cache = TRUE
97129
) {
98-
hash = c(remote_url, cache_directory) |> paste0(collapse="") |> cli::hash_sha256()
130+
hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
131+
hash_sha256()
99132
cached_connection = cache$metadata_table[[hash]]
100133
if (!is.null(cached_connection) && isTRUE(use_cache)) {
101134
cached_connection
@@ -108,7 +141,7 @@ get_metadata <- function(
108141
db_path,
109142
progress(type = "down", con = stderr())
110143
)
111-
table = duckdb() |>
144+
table <- duckdb() |>
112145
dbConnect(drv = _, read_only = TRUE) |>
113146
tbl(db_path)
114147
cache$metadata_table[[hash]] = table

0 commit comments

Comments
 (0)