stemangiola
diff --git a/‎DESCRIPTION‎
Lines changed: 52 additions & 3 deletions b/‎DESCRIPTION‎
Lines changed: 52 additions & 3 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 0 additions & 1 deletion b/‎NAMESPACE‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎R/query.R‎
Lines changed: 23 additions & 33 deletions b/‎R/query.R‎
Lines changed: 23 additions & 33 deletions
diff --git a/‎README.Rmd‎
Lines changed: 39 additions & 8 deletions b/‎README.Rmd‎
Lines changed: 39 additions & 8 deletions
diff --git a/‎README.md‎
Lines changed: 45 additions & 6 deletions b/‎README.md‎
Lines changed: 45 additions & 6 deletions
diff --git a/‎inst/NCAM1_figure.png‎ ‎man/figures/NCAM1_figure.png‎inst/NCAM1_figure.png renamed to man/figures/NCAM1_figure.png b/‎inst/NCAM1_figure.png‎ ‎man/figures/NCAM1_figure.png‎inst/NCAM1_figure.png renamed to man/figures/NCAM1_figure.png
diff --git a/‎man/figures/bioconductor_logo.jpg‎
77.3 KB b/‎man/figures/bioconductor_logo.jpg‎
77.3 KB
diff --git a/‎man/figures/czi_logo.png‎
57 KB b/‎man/figures/czi_logo.png‎
57 KB
diff --git a/‎inst/logo.png‎ ‎man/figures/logo.png‎inst/logo.png renamed to man/figures/logo.png b/‎inst/logo.png‎ ‎man/figures/logo.png‎inst/logo.png renamed to man/figures/logo.png
diff --git a/‎man/figures/svcf_logo.jpeg‎
18.4 KB b/‎man/figures/svcf_logo.jpeg‎
18.4 KB
@@ -1,13 +1,63 @@
 Type: Package
 Package: CuratedAtlasQueryR
 Title: Queries the Human Cell Atlas
-Version: 0.3.0
+Version: 0.3.1
 Authors@R: c(
     person(
         "Stefano",
         "Mangiola",
         email = "[email protected]",
-        role = c("aut", "cre")
+        role = c("aut", "cre", "rev")
+    ),
+    person(
+        "Michael",
+        "Milton",
+        email = "[email protected]",
+        role = c("aut", "rev")
+    ),
+    person(
+        "Martin",
+        "Morgan",
+        email = "[email protected]",
+        role = c("ctb", "rev")
+    ),
+    person(
+        "Vincent",
+        "Carey",
+        email = "[email protected]",
+        role = c("ctb", "rev")
+    ),
+    person(
+        "Julie",
+        "Iskander",
+        email = "[email protected]",
+        role = c( "rev")
+    ),
+    person(
+        "Tony",
+        "Papenfuss",
+        email = "[email protected]",
+        role = c( "rev")
+    ),
+    person(
+        "Silicon Valley Foundation",
+        "CZF2019-002443",
+        role = c( "fnd")
+    ),
+    person(
+        "NIH NHGRI",
+        "5U24HG004059-18",
+        role = c( "fnd")
+    ),
+    person(
+        "Victoria Cancer Agency",
+        "ECRF21036",
+        role = c( "fnd")
+    ),
+    person(
+        "NHMRC",
+        "1116955",
+        role = c( "fnd")
     ))
 Description: Provides access to a copy of the Human Cell Atlas, but with 
     harmonised metadata. This allows for uniform querying across numerous 
@@ -36,7 +86,6 @@ Imports:
     methods,
     rlang,
     stats,
-    RSQLite,
     S4Vectors,
     tibble,
     utils,
 
@@ -55,4 +55,3 @@ importFrom(tibble,column_to_rownames)
 importFrom(tools,R_user_dir)
 importFrom(utils,head)
 importFrom(utils,packageName)
-importFrom(utils,untar)
@@ -10,24 +10,16 @@ assay_map <- c(
     cpm = "cpm"
 )
 
-#' Used in a pipeline to run one or more expressions with side effects, but
-#' return the input value as the output value unaffected
-aside <- function(x, ...) {
-    # Courtesy of Hadley: https://fosstodon.org/@hadleywickham/109558265769090930
-    list(...)
-    x
-}
-
-REMOTE_URL <- "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
+REMOTE_URL <- "https://swift.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
 
 #' Given a data frame of HCA metadata, returns a SingleCellExperiment object
 #' corresponding to the samples in that data frame
 #'
 #' @param data A data frame containing, at minimum, a `.sample` column, which
 #'   corresponds to a single cell sample ID. This can be obtained from the
 #'   [get_metadata()] function.
-#' @param assays A character vector whose elements must be either "raw" or
-#'   "scaled", representing the corresponding assay you want to request.
+#' @param assays A character vector whose elements must be "counts" and/or
+#'   "cpm", representing the corresponding assay(s) you want to request.
 #' @param repository A character vector of length one. If provided, it should be
 #'   an HTTP URL pointing to the location where the single cell data is stored.
 #' @param cache_directory An optional character vector of length one. If
@@ -87,33 +79,31 @@ get_SingleCellExperiment <- function(
     cli_alert_info("Realising metadata.")
     raw_data <- collect(data)
     inherits(raw_data, "tbl") |> assert_that()
-    has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
+    has_name(raw_data, c("_cell", "file_id_db")) |> assert_that()
 
     cache_directory |> dir.create(showWarnings = FALSE)
 
-    cells_of_interest <- raw_data |>
-        pull(.data$.cell) |>
-        unique() |>
-        as.character()
-
     subdirs <- assay_map[assays]
 
     # The repository is optional. If not provided we load only from the cache
     if (!is.null(repository)) {
         cli_alert_info("Synchronising files")
+        parsed_repo <- parse_url(repository)
+        parsed_repo$scheme |>
+            `%in%`(c("http", "https")) |>
+            assert_that()
+
         files_to_read <-
             raw_data |>
             pull(.data$file_id_db) |>
             unique() |>
-            as.character()
-        parsed_repo <- parse_url(repository)
-        (parsed_repo$scheme %in% c("http", "https")) |> assert_that()
-        sync_assay_files(
-            url = parsed_repo,
-            cache_dir = cache_directory,
-            files = files_to_read,
-            subdirs = subdirs
-        )
+            as.character() |>
+            sync_assay_files(
+                url = parsed_repo,
+                cache_dir = cache_directory,
+                files = _,
+                subdirs = subdirs
+            )
     }
 
     cli_alert_info("Reading files.")
@@ -182,14 +172,14 @@ group_to_sce <- function(i, df, dir_prefix, features) {
     sce <- loadHDF5SummarizedExperiment(sce_path)
     # The cells we select here are those that are both available in the SCE
     # object, and requested for this particular file
-    cells <- colnames(sce) |> intersect(df$.cell)
+    cells <- colnames(sce) |> intersect(df$`_cell`)
     # We need to make the cell names globally unique, which we can guarantee
     # by adding a suffix that is derived from file_id_db, which is the grouping
     # variable
     new_cellnames <- paste0(cells, "_", i)
     new_coldata <- df |>
-        mutate(original_cell_id = .data$.cell, .cell = new_cellnames) |>
-        column_to_rownames(".cell") |>
+        mutate(original_cell_id = .data$`_cell`, `_cell` = new_cellnames) |>
+        column_to_rownames("_cell") |>
         as("DataFrame")
 
     features |>
@@ -313,7 +303,8 @@ get_default_cache_dir <- function() {
         R_user_dir(
             "cache"
         ) |>
-        normalizePath()
+        normalizePath() |>
+        suppressWarnings()
 }
 
 #' @importFrom assertthat assert_that
@@ -372,18 +363,17 @@ get_seurat <- function(...) {
 #' @importFrom dplyr tbl
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info
-#' @importFrom utils untar
 get_metadata <- function(
     remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
     cache_directory = get_default_cache_dir()
 ) {
-    db_path <- file.path(cache_directory, "metadata.parquet")
+    db_path <- file.path(cache_directory, "metadata.0.2.2.parquet")
     sync_remote_file(
         remote_url,
         db_path,
         progress(type = "down", con = stderr())
     )
-    table <- duckdb() |>
+    duckdb() |>
         dbConnect(drv = _, read_only = TRUE) |>
         tbl(db_path)
 }
@@ -3,9 +3,12 @@ title: "CuratedAtlasQueryR"
 output: github_document
 ---
 
-`CuratedAtlasQuery` is a query interface that allow the programmatic exploration and retrieval of the harmonised, curated and reannotated CELLxGENE single-cell human cell atlas. Data can be retrieved at cell, sample, or dataset levels based on filtering criteria. 
+<!-- badges: start -->
+[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
+<!-- badges: end -->
 
-# Query interface
+
+`CuratedAtlasQueryR` is a query interface that allows the programmatic exploration and retrieval of the harmonised, curated and reannotated CELLxGENE single-cell human cell atlas. Data can be retrieved at cell, sample, or dataset levels based on filtering criteria.
 
 ```{r, include = FALSE}
 # Note: knit this to the repo readme file using:
@@ -16,8 +19,27 @@ knitr::opts_chunk$set(
 )
 ```
 
-```{r, echo=FALSE, out.height = "139px", out.width = "120px"}
-knitr::include_graphics("inst/logo.png")
+```{r, echo=FALSE, out.height = c("139px"), out.width = "120px" }
+knitr::include_graphics(c("man/figures/logo.png"))
+```
+
+```{r, echo=FALSE, out.height = c("58px"), out.width = c("155px", "129px", "202px", "219px")}
+knitr::include_graphics(c(
+  "man/figures/svcf_logo.jpeg", 
+  "man/figures/czi_logo.png", 
+  "man/figures/bioconductor_logo.jpg",
+    "man/figures/vca_logo.png"
+))
+```
+
+[website](https://stemangiola.github.io/CuratedAtlasQueryR)
+
+# Query interface
+
+## Installation
+
+```{r, eval=FALSE}
+devtools::install_github("stemangiola/CuratedAtlasQueryR")
 ```
 
 ## Load the package
@@ -38,7 +60,7 @@ get_metadata()
 
 ### Explore the tissue 
 
-```{r, eval=FALSE}
+```{r}
 get_metadata() |>
     dplyr::distinct(tissue, file_id) 
 ```
@@ -168,7 +190,7 @@ get_metadata() |>
 ```
 
 ```{r, echo=FALSE, message=FALSE, warning=FALSE}
-knitr::include_graphics("inst/NCAM1_figure.png")
+knitr::include_graphics("man/figures/NCAM1_figure.png")
 ```
 
 # Cell metadata
@@ -189,7 +211,7 @@ Through harmonisation and curation we introduced custom column, not present in t
 
 - `tissue_harmonised`: a coarser tissue name for better filtering
 - `age_days`: the number of days corresponding to the age
-- `cell_type_harmonised`: the consensus call identiti (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
+- `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
 - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
 - `cell_annotation_azimuth_l2`: Azimuth cell annotation
 - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
@@ -201,6 +223,15 @@ Through harmonisation and curation we introduced custom column, not present in t
 
 # RNA abundance
 
-The `raw` assay includes RNA abundance in the positive real scale (not transformed with non-linear functions, e.g. log sqrt). Originally CELLxGENE include a mix of scales and tranformations specified in the `x_normalization` column.
+The `raw` assay includes RNA abundance in the positive real scale (not transformed with non-linear functions, e.g. log or sqrt). Originally CELLxGENE includes a mix of scales and transformations specified in the `x_normalization` column.
 
 The `cpm` assay includes counts per million.
+
+---
+
+This project has been funded by
+
+- *Silicon Valley Foundation* CZF2019-002443
+- *Bioconductor core funding* NIH NHGRI 5U24HG004059-18 
+- *Victoria Cancer Agency* ECRF21036
+- *Australian National Health and Medical Research Council* 1116955
@@ -1,14 +1,29 @@
 CuratedAtlasQueryR
 ================
 
+<!-- badges: start -->
+
+[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
+<!-- badges: end -->
+
 `CuratedAtlasQueryR` is a query interface that allows the programmatic
 exploration and retrieval of the harmonised, curated and reannotated
 CELLxGENE single-cell human cell atlas. Data can be retrieved at cell,
 sample, or dataset levels based on filtering criteria.
 
+<img src="man/figures/logo.png" width="120px" height="139px" />
+
+<img src="man/figures/svcf_logo.jpeg" width="155px" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" />
+
+[website](https://stemangiola.github.io/CuratedAtlasQueryR)
+
 # Query interface
 
-<img src="inst/logo.png" width="120px" height="139px" />
+## Installation
+
+``` r
+devtools::install_github("stemangiola/CuratedAtlasQueryR")
+```
 
 ## Load the package
 
@@ -24,8 +39,8 @@ library(stringr)
 
 ``` r
 get_metadata()
-#> # Source:   table<metadata> [?? x 56]
-#> # Database: sqlite 3.40.0 [/stornext/Home/data/allstaff/m/mangiola.s/.cache/R/CuratedAtlasQueryR/metadata.sqlite]
+#> # Source:   table</stornext/Home/data/allstaff/m/mangiola.s/.cache/R/CuratedAtlasQueryR/metadata.parquet> [?? x 56]
+#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
 #>    .cell   sampl…¹ .sample .samp…² assay assay…³ file_…⁴ cell_…⁵ cell_…⁶ devel…⁷
 #>    <chr>   <chr>   <chr>   <chr>   <chr> <chr>   <chr>   <chr>   <chr>   <chr>  
 #>  1 AAACCT… 8a0fe0… 5f20d7… D17PrP… 10x … EFO:00… 1e334b… basal … CL:000… 31-yea…
@@ -52,6 +67,21 @@ get_metadata()
 ``` r
 get_metadata() |>
     dplyr::distinct(tissue, file_id) 
+#> # Source:   SQL [?? x 2]
+#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
+#>    tissue                                             file_id                   
+#>    <chr>                                              <chr>                     
+#>  1 epithelial cell of alveolus of lung (cell culture) 0e8f9ce4-46e5-434e-9ca0-e…
+#>  2 peripheral zone of prostate                        0f017e66-9c70-4d29-9435-2…
+#>  3 transition zone of prostate                        0f017e66-9c70-4d29-9435-2…
+#>  4 superior frontal gyrus                             0fe32cca-d111-42b6-9b93-b…
+#>  5 fovea centralis                                    100c44ed-f754-4d45-8649-d…
+#>  6 blood                                              1042ba0a-98c5-4816-897d-e…
+#>  7 telencephalon                                      3fe53a40-38ff-4f25-b33b-e…
+#>  8 kidney                                             69b67eef-43fd-40ff-8fd3-e…
+#>  9 blood                                              6a044711-8df7-4f88-bad7-f…
+#> 10 heart left ventricle                               6a579758-a4b4-4f64-be54-4…
+#> # … with more rows
 ```
 
 ``` r
@@ -239,7 +269,7 @@ get_metadata() |>
   theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
 ```
 
-<img src="inst/NCAM1_figure.png" width="629" />
+<img src="man/figures/NCAM1_figure.png" width="629" />
 
 # Cell metadata
 
@@ -277,7 +307,7 @@ present in the original CELLxGENE metadata
 
 - `tissue_harmonised`: a coarser tissue name for better filtering
 - `age_days`: the number of days corresponding to the age
-- `cell_type_harmonised`: the consensus call identiti (for immune cells)
+- `cell_type_harmonised`: the consensus call identity (for immune cells)
   using the original and three novel annotations using Seurat Azimuth
   and SingleR
 - `confidence_class`: an ordinal class of how confident
@@ -297,7 +327,16 @@ present in the original CELLxGENE metadata
 
 The `raw` assay includes RNA abundance in the positive real scale (not
 transformed with non-linear functions, e.g. log sqrt). Originally
-CELLxGENE include a mix of scales and tranformations specified in the
+CELLxGENE includes a mix of scales and transformations specified in the
 `x_normalization` column.
 
 The `cpm` assay includes counts per million.
+
+------------------------------------------------------------------------
+
+This project has been funded by
+
+- *Silicon Valley Foundation* CZF2019-002443
+- *Bioconductor core funding* NIH NHGRI 5U24HG004059-18
+- *Victoria Cancer Agency* ECRF21036
+- *Australian National Health and Medical Research Council* 1116955