stemangiola
diff --git a/‎.github/workflows/check-bioc.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/check-bioc.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎DESCRIPTION‎
Lines changed: 11 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 3 additions & 2 deletions b/‎NAMESPACE‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎R/dev.R‎
Lines changed: 36 additions & 9 deletions b/‎R/dev.R‎
Lines changed: 36 additions & 9 deletions
diff --git a/‎R/metadata.R‎
Lines changed: 82 additions & 49 deletions b/‎R/metadata.R‎
Lines changed: 82 additions & 49 deletions
@@ -243,7 +243,8 @@ jobs:
               dir('check', 'tar.gz$', full.names = TRUE),
               `quit-with-status` = TRUE,
               `no-check-R-ver` = TRUE,
-              `no-check-bioc-help` = TRUE
+              `no-check-bioc-help` = TRUE,
+              `new-package` = TRUE
           )
         shell: Rscript {0}
 
 
@@ -90,7 +90,8 @@ Imports:
     tibble,
     utils,
     dbplyr (>= 2.3.0),
-    duckdb
+    duckdb,
+    stringr
 Suggests:
     here,
     stringr,
@@ -130,3 +131,12 @@ URL: https://github.com/stemangiola/CuratedAtlasQueryR
 BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)
+Collate: 
+    'utils.R'
+    'counts.R'
+    'dev.R'
+    'metadata.R'
+    'seurat.R'
+    'unharm.R'
+    'unharmonised.R'
+    'zzz.R'
@@ -5,14 +5,13 @@ export(get_SingleCellExperiment)
 export(get_metadata)
 export(get_seurat)
 export(get_unharmonised_metadata)
-import(Seurat)
-import(dbplyr)
 importFrom(BiocGenerics,cbind)
 importFrom(DBI,dbConnect)
 importFrom(DBI,dbDisconnect)
 importFrom(HDF5Array,HDF5RealizationSink)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(S4Vectors,DataFrame)
+importFrom(Seurat,as.SingleCellExperiment)
 importFrom(SeuratObject,as.Seurat)
 importFrom(SeuratObject,as.sparse)
 importFrom(SingleCellExperiment,SingleCellExperiment)
@@ -26,6 +25,7 @@ importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(cli,cli_alert_warning)
+importFrom(cli,hash_sha256)
 importFrom(dbplyr,remote_con)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
@@ -59,6 +59,7 @@ importFrom(purrr,set_names)
 importFrom(purrr,walk)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
+importFrom(stringr,str_remove_all)
 importFrom(tibble,column_to_rownames)
 importFrom(tools,R_user_dir)
 importFrom(utils,head)
 
@@ -15,12 +15,22 @@
 #'   character scalar
 #' @return NULL
 #' @keywords internal
-upload_swift = function(source, container, name = basename(source), credential_id = NULL, credential_secret = NULL){
+upload_swift <- function(
+    source,
+    container,
+    name = basename(source),
+    credential_id = NULL,
+    credential_secret = NULL
+) {
     # Create the basilisk environment
     swift_env <- basilisk::BasiliskEnvironment(
         envname="swift-nectar-upload",
         pkgname=packageName(),
-        packages=c("python-swiftclient==4.2.0", "python-keystoneclient==5.1.0", "python==3.10.9")
+        packages=c(
+          "python-swiftclient==4.2.0",
+          "python-keystoneclient==5.1.0",
+          "python==3.10.9"
+        )
     )
     proc <- basilisk::basiliskStart(swift_env)
 
@@ -38,7 +48,7 @@ upload_swift = function(source, container, name = basename(source), credential_i
     else {
         auth <- character()
     }
-    args = c(
+    args <- c(
         "-m", 
         "swiftclient.shell",
         "--os-auth-url",
@@ -67,12 +77,19 @@ upload_swift = function(source, container, name = basename(source), credential_i
 #' @inheritDotParams upload_swift
 #' @examples
 #' \dontrun{
-#'  metadata = CuratedAtlasQueryR::get_metadata() |> head(10) |> dplyr::collect()
-#'  update_database(metadata, "0.2.3", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
+#'  metadata = CuratedAtlasQueryR::get_metadata() |>
+#'      head(10) |>
+#'      dplyr::collect()
+#'  update_database(
+#'      metadata, 
+#'      "0.2.3", 
+#'      credential_id = "ABCDEFGHIJK", 
+#'      credential_secret = "ABCD1234EFGH-5678IJK"
+#'  )
 #'  # Prints "metadata.0.2.3.parquet" if successful
 #' }
 #' @keywords internal
-update_database = function(metadata, version, ...){
+update_database <- function(metadata, version, ...){
     # These are optional dev packages
     rlang::check_installed(c("arrow", "glue", "basilisk"))
 
@@ -89,12 +106,22 @@ update_database = function(metadata, version, ...){
 #'   files, one for each dataset, e.g.
 #'   /vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2
 #' @inheritDotParams upload_swift
+#' @inherit upload_swift return
 #' @keywords internal
 #' @examples
 #' \dontrun{
-#' update_unharmonised("/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", credential_id = "ABCDEFGHIJK", credential_secret = "ABCD1234EFGH-5678IJK")
+#' update_unharmonised(
+#'     "/vast/projects/cellxgene_curated/metadata_non_harmonised_parquet_0.2", 
+#'     credential_id = "ABCDEFGHIJK", 
+#'     credential_secret = "ABCD1234EFGH-5678IJK"
+#' )
 #' }
-update_unharmonised = function(unharmonised_parquet_dir, ...){
+update_unharmonised <- function(unharmonised_parquet_dir, ...){
     # name="/" forces it have no prefix, ie be at the top level in the bucket
-    upload_swift(unharmonised_parquet_dir, container="unharmonised_metadata", name="/", ...)
+    upload_swift(
+        unharmonised_parquet_dir, 
+        container="unharmonised_metadata",
+        name="/", 
+        ...
+    )
 }
@@ -1,25 +1,34 @@
+# Functions that relate to the harmonised metadata database
+
+#' @include utils.R
+NULL
+
 #' Environment that we use to cache the DuckDB connections
-cache = rlang::env(
+cache <- rlang::env(
     metadata_table = rlang::env()
 )
 
+DATABASE_URL = single_line_str(
+    "https://object-store.rc.nectar.org.au/v1/
+    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
+)
 
 #' Gets the Curated Atlas metadata as a data frame.
-#' 
-#' Downloads a parquet database of the Human Cell Atlas metadata to a local 
-#' cache, and then opens it as a data frame. It can then be filtered and 
-#' passed into [get_SingleCellExperiment()] 
-#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`]
+#'
+#' Downloads a parquet database of the Human Cell Atlas metadata to a local
+#' cache, and then opens it as a data frame. It can then be filtered and passed
+#' into [get_SingleCellExperiment()] to obtain a
+#' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
 #' @param remote_url Optional character vector of length 1. An HTTP URL pointing
 #'   to the location of the parquet database.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
 #'   metadata.parquet
 #' @param use_cache Optional logical scalar. If `TRUE` (the default), and this
-#'  function has been called before with the same parameters, then a cached
-#'  reference to the table will be returned. If `FALSE`, a new connection will
-#'  be created no matter what.
+#'   function has been called before with the same parameters, then a cached
+#'   reference to the table will be returned. If `FALSE`, a new connection will
+#'   be created no matter what.
 #' @return A lazy data.frame subclass containing the metadata. You can interact
 #'   with this object using most standard dplyr functions. For string matching,
 #'   it is recommended that you use `stringr::str_like` to filter character
@@ -39,63 +48,87 @@ cache = rlang::env(
 #' @importFrom duckdb duckdb
 #' @importFrom dplyr tbl
 #' @importFrom httr progress
-#' @importFrom cli cli_alert_info
-#' 
-#' @details 
-#' 
-#' The metadata was collected from the Bioconductor package `cellxgenedp`. it's vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
-#' The data for which the column `organism_name` included "Homo sapiens" was collected collected from `cellxgenedp`.
-#' 
-#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp`to the CELLxGENE portal.
-#' 
-#'  Our representation, harmonises the metadata at dataset, sample and cell levels, in a unique coherent database table.
-#' 
+#' @importFrom cli cli_alert_info hash_sha256
+#'
+#' @details
+#'
+#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its
+#' vignette `using_cellxgenedp` provides an overview of the columns in the
+#' metadata. The data for which the column `organism_name` included "Homo
+#' sapiens" was collected from `cellxgenedp`.
+#'
+#' The columns `dataset_id` and `file_id` link the datasets explorable through
+#' `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
+#'
+#' Our representation harmonises the metadata at dataset, sample and cell
+#' levels, in a unique coherent database table.
+#'
 #' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
-#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
-#' 
+#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`,
+#' `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`,
+#' `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`,
+#' `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`,
+#' `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`,
+#' `user_submitted`, `x_normalization`
+#'
 #' Sample-specific columns (definitions available at cellxgene.cziscience.com)
-#' 
-#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
-#' 
+#'
+#' `sample_`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`,
+#' `development_stage`, `development_stage_ontology_term_id`, `ethnicity`,
+#' `ethnicity_ontology_term_id`, `experiment___`, `organism`,
+#' `organism_ontology_term_id`, `sample_placeholder`, `sex`,
+#' `sex_ontology_term_id`, `tissue`, `tissue_harmonised`,
+#' `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`,
+#' `is_primary_data.x`
+#'
 #' Cell-specific columns (definitions available at cellxgene.cziscience.com)
-#' 
-#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler` 
-#' 
-#' Through harmonisation and curation we introduced custom column, not present in the original CELLxGENE metadata
-#' 
+#'
+#' `cell_`, `cell_type`, `cell_type_ontology_term_idm`, `cell_type_harmonised`,
+#' `confidence_class`, `cell_annotation_azimuth_l2`,
+#' `cell_annotation_blueprint_singler`
+#'
+#' Through harmonisation and curation we introduced custom columns, not
+#' present in the original CELLxGENE metadata:
+#'
 #' - `tissue_harmonised`: a coarser tissue name for better filtering
 #' - `age_days`: the number of days corresponding to the age
-#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
-#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is 3 out of four and so on.             
+#' - `cell_type_harmonised`: the consensus call identity (for immune cells)
+#'   using the original and three novel annotations using Seurat Azimuth and 
+#'   SingleR
+#' - `confidence_class`: an ordinal class of how confident
+#'   `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of
+#'   four, and so on.
 #' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
-#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
-#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
+#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using 
+#'   Blueprint reference
+#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco 
+#'   reference
 #' - `sample_id_db`: Sample subdivision for internal use
 #' - `file_id_db`: File subdivision for internal use
 #' - `sample_`: Sample ID
 #' - `.sample_name`: How samples were defined
-#' 
-#' 
+#'
+#'
 #' **Possible cache path issues**
-#' 
-#' If your default R cache path includes non-standard characters (e.g. dash because of your user or organisation name), the following error can manifest
-#' 
-#' Error in `db_query_fields.DBIConnection()`:
-#' ! Can't query fields.
-#' Caused by error:
-#' ! Parser Error: syntax error at or near "/"
-#' LINE 2: FROM /Users/bob/Library/Cach...
-#' 
+#'
+#' If your default R cache path includes non-standard characters (e.g. a dash
+#' because of your user or organisation name), the following error can occur:
+#'
+#' Error in `db_query_fields.DBIConnection()`: ! Can't query fields. Caused by
+#' error: ! Parser Error: syntax error at or near "/" LINE 2: FROM
+#' /Users/bob/Library/Cach...
+#'
 #' The solution is to choose a different cache, for example
-#' 
+#'
 #' get_metadata(cache_directory = path.expand('~'))
 #' 
 get_metadata <- function(
-    remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet",
+    remote_url = DATABASE_URL,
     cache_directory = get_default_cache_dir(),
     use_cache = TRUE
 ) {
-    hash = c(remote_url, cache_directory) |> paste0(collapse="") |> cli::hash_sha256()
+    hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
+        hash_sha256()
     cached_connection = cache$metadata_table[[hash]]
     if (!is.null(cached_connection) && isTRUE(use_cache)) {
         cached_connection
@@ -108,7 +141,7 @@ get_metadata <- function(
             db_path,
             progress(type = "down", con = stderr())
         )
-        table = duckdb() |>
+        table <- duckdb() |>
             dbConnect(drv = _, read_only = TRUE) |>
             tbl(db_path)
         cache$metadata_table[[hash]] = table
Original file line number	Diff line number	Diff line change
`@@ -243,7 +243,8 @@ jobs:`
`243`	`243`	`dir('check', 'tar.gz$', full.names = TRUE),`
`244`	`244`	`quit-with-status` = TRUE,
`245`	`245`	`no-check-R-ver` = TRUE,
`246`		- `no-check-bioc-help` = TRUE
	`246`	+ `no-check-bioc-help` = TRUE,
	`247`	+ `new-package` = TRUE
`247`	`248`	`)`
`248`	`249`	`shell: Rscript {0}`
`249`	`250`