Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 3a99078

Browse files
committed
Merge branch 'master' of github.com:stemangiola/CuratedAtlasQueryR into update-db-0.2
Conflicts: R/query.R
2 parents fd9b511 + 0238471 commit 3a99078

File tree

14 files changed

+198
-69
lines changed

14 files changed

+198
-69
lines changed

DESCRIPTION

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,63 @@
11
Type: Package
22
Package: CuratedAtlasQueryR
33
Title: Queries the Human Cell Atlas
4-
Version: 0.3.0
4+
Version: 0.3.1
55
Authors@R: c(
66
person(
77
"Stefano",
88
"Mangiola",
99
email = "[email protected]",
10-
role = c("aut", "cre")
10+
role = c("aut", "cre", "rev")
11+
),
12+
person(
13+
"Michael",
14+
"Milton",
15+
email = "[email protected]",
16+
role = c("aut", "rev")
17+
),
18+
person(
19+
"Martin",
20+
"Morgan",
21+
email = "[email protected]",
22+
role = c("ctb", "rev")
23+
),
24+
person(
25+
"Vincent",
26+
"Carey",
27+
email = "[email protected]",
28+
role = c("ctb", "rev")
29+
),
30+
person(
31+
"Julie",
32+
"Iskander",
33+
email = "[email protected]",
34+
role = c( "rev")
35+
),
36+
person(
37+
"Tony",
38+
"Papenfuss",
39+
email = "[email protected]",
40+
role = c( "rev")
41+
),
42+
person(
43+
"Silicon Valley Foundation",
44+
"CZF2019-002443",
45+
role = c( "fnd")
46+
),
47+
person(
48+
"NIH NHGRI",
49+
"5U24HG004059-18",
50+
role = c( "fnd")
51+
),
52+
person(
53+
"Victoria Cancer Agency",
54+
"ECRF21036",
55+
role = c( "fnd")
56+
),
57+
person(
58+
"NHMRC",
59+
"1116955",
60+
role = c( "fnd")
1161
))
1262
Description: Provides access to a copy of the Human Cell Atlas, but with
1363
harmonised metadata. This allows for uniform querying across numerous
@@ -36,7 +86,6 @@ Imports:
3686
methods,
3787
rlang,
3888
stats,
39-
RSQLite,
4089
S4Vectors,
4190
tibble,
4291
utils,

NAMESPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,3 @@ importFrom(tibble,column_to_rownames)
5555
importFrom(tools,R_user_dir)
5656
importFrom(utils,head)
5757
importFrom(utils,packageName)
58-
importFrom(utils,untar)

R/query.R

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,16 @@ assay_map <- c(
1010
cpm = "cpm"
1111
)
1212

13-
#' Used in a pipeline to run one or more expressions with side effects, but
14-
#' return the input value as the output value unaffected
15-
aside <- function(x, ...) {
16-
# Courtesy of Hadley: https://fosstodon.org/@hadleywickham/109558265769090930
17-
list(...)
18-
x
19-
}
20-
21-
REMOTE_URL <- "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
13+
REMOTE_URL <- "https://swift.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas"
2214

2315
#' Given a data frame of HCA metadata, returns a SingleCellExperiment object
2416
#' corresponding to the samples in that data frame
2517
#'
2618
#' @param data A data frame containing, at minimum, a `.sample` column, which
2719
#' corresponds to a single cell sample ID. This can be obtained from the
2820
#' [get_metadata()] function.
29-
#' @param assays A character vector whose elements must be either "raw" or
30-
#' "scaled", representing the corresponding assay you want to request.
21+
#' @param assays A character vector whose elements must be "counts" and/or
22+
#' "cpm", representing the corresponding assay(s) you want to request.
3123
#' @param repository A character vector of length one. If provided, it should be
3224
#' an HTTP URL pointing to the location where the single cell data is stored.
3325
#' @param cache_directory An optional character vector of length one. If
@@ -87,33 +79,31 @@ get_SingleCellExperiment <- function(
8779
cli_alert_info("Realising metadata.")
8880
raw_data <- collect(data)
8981
inherits(raw_data, "tbl") |> assert_that()
90-
has_name(raw_data, c(".cell", "file_id_db")) |> assert_that()
82+
has_name(raw_data, c("_cell", "file_id_db")) |> assert_that()
9183

9284
cache_directory |> dir.create(showWarnings = FALSE)
9385

94-
cells_of_interest <- raw_data |>
95-
pull(.data$.cell) |>
96-
unique() |>
97-
as.character()
98-
9986
subdirs <- assay_map[assays]
10087

10188
# The repository is optional. If not provided we load only from the cache
10289
if (!is.null(repository)) {
10390
cli_alert_info("Synchronising files")
91+
parsed_repo <- parse_url(repository)
92+
parsed_repo$scheme |>
93+
`%in%`(c("http", "https")) |>
94+
assert_that()
95+
10496
files_to_read <-
10597
raw_data |>
10698
pull(.data$file_id_db) |>
10799
unique() |>
108-
as.character()
109-
parsed_repo <- parse_url(repository)
110-
(parsed_repo$scheme %in% c("http", "https")) |> assert_that()
111-
sync_assay_files(
112-
url = parsed_repo,
113-
cache_dir = cache_directory,
114-
files = files_to_read,
115-
subdirs = subdirs
116-
)
100+
as.character() |>
101+
sync_assay_files(
102+
url = parsed_repo,
103+
cache_dir = cache_directory,
104+
files = _,
105+
subdirs = subdirs
106+
)
117107
}
118108

119109
cli_alert_info("Reading files.")
@@ -182,14 +172,14 @@ group_to_sce <- function(i, df, dir_prefix, features) {
182172
sce <- loadHDF5SummarizedExperiment(sce_path)
183173
# The cells we select here are those that are both available in the SCE
184174
# object, and requested for this particular file
185-
cells <- colnames(sce) |> intersect(df$.cell)
175+
cells <- colnames(sce) |> intersect(df$`_cell`)
186176
# We need to make the cell names globally unique, which we can guarantee
187177
# by adding a suffix that is derived from file_id_db, which is the grouping
188178
# variable
189179
new_cellnames <- paste0(cells, "_", i)
190180
new_coldata <- df |>
191-
mutate(original_cell_id = .data$.cell, .cell = new_cellnames) |>
192-
column_to_rownames(".cell") |>
181+
mutate(original_cell_id = .data$`_cell`, `_cell` = new_cellnames) |>
182+
column_to_rownames("_cell") |>
193183
as("DataFrame")
194184

195185
features |>
@@ -313,7 +303,8 @@ get_default_cache_dir <- function() {
313303
R_user_dir(
314304
"cache"
315305
) |>
316-
normalizePath()
306+
normalizePath() |>
307+
suppressWarnings()
317308
}
318309

319310
#' @importFrom assertthat assert_that
@@ -372,18 +363,17 @@ get_seurat <- function(...) {
372363
#' @importFrom dplyr tbl
373364
#' @importFrom httr progress
374365
#' @importFrom cli cli_alert_info
375-
#' @importFrom utils untar
376366
get_metadata <- function(
377367
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
378368
cache_directory = get_default_cache_dir()
379369
) {
380-
db_path <- file.path(cache_directory, "metadata.parquet")
370+
db_path <- file.path(cache_directory, "metadata.0.2.2.parquet")
381371
sync_remote_file(
382372
remote_url,
383373
db_path,
384374
progress(type = "down", con = stderr())
385375
)
386-
table <- duckdb() |>
376+
duckdb() |>
387377
dbConnect(drv = _, read_only = TRUE) |>
388378
tbl(db_path)
389379
}

README.Rmd

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ title: "CuratedAtlasQueryR"
33
output: github_document
44
---
55

6-
`CuratedAtlasQuery` is a query interface that allow the programmatic exploration and retrieval of the harmonised, curated and reannotated CELLxGENE single-cell human cell atlas. Data can be retrieved at cell, sample, or dataset levels based on filtering criteria.
6+
<!-- badges: start -->
7+
[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
8+
<!-- badges: end -->
79

8-
# Query interface
10+
11+
`CuratedAtlasQuery` is a query interface that allows the programmatic exploration and retrieval of the harmonised, curated and reannotated CELLxGENE single-cell human cell atlas. Data can be retrieved at cell, sample, or dataset levels based on filtering criteria.
912

1013
```{r, include = FALSE}
1114
# Note: knit this to the repo readme file using:
@@ -16,8 +19,27 @@ knitr::opts_chunk$set(
1619
)
1720
```
1821

19-
```{r, echo=FALSE, out.height = "139px", out.width = "120px"}
20-
knitr::include_graphics("inst/logo.png")
22+
```{r, echo=FALSE, out.height = c("139px"), out.width = "120px" }
23+
knitr::include_graphics(c("man/figures/logo.png"))
24+
```
25+
26+
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155px", "129px", "202px", "219px")}
27+
knitr::include_graphics(c(
28+
"man/figures/svcf_logo.jpeg",
29+
"man/figures/czi_logo.png",
30+
"man/figures/bioconductor_logo.jpg",
31+
"man/figures/vca_logo.png"
32+
))
33+
```
34+
35+
[website](https://stemangiola.github.io/CuratedAtlasQueryR)
36+
37+
# Query interface
38+
39+
## Installation
40+
41+
```{r, eval=FALSE}
42+
devtools::install_github("stemangiola/CuratedAtlasQueryR")
2143
```
2244

2345
## Load the package
@@ -38,7 +60,7 @@ get_metadata()
3860

3961
### Explore the tissue
4062

41-
```{r, eval=FALSE}
63+
```{r}
4264
get_metadata() |>
4365
dplyr::distinct(tissue, file_id)
4466
```
@@ -168,7 +190,7 @@ get_metadata() |>
168190
```
169191

170192
```{r, echo=FALSE, message=FALSE, warning=FALSE}
171-
knitr::include_graphics("inst/NCAM1_figure.png")
193+
knitr::include_graphics("man/figures/NCAM1_figure.png")
172194
```
173195

174196
# Cell metadata
@@ -189,7 +211,7 @@ Through harmonisation and curation we introduced custom column, not present in t
189211

190212
- `tissue_harmonised`: a coarser tissue name for better filtering
191213
- `age_days`: the number of days corresponding to the age
192-
- `cell_type_harmonised`: the consensus call identiti (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
214+
- `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
193215
- `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is 3 out of four and so on.
194216
- `cell_annotation_azimuth_l2`: Azimuth cell annotation
195217
- `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
@@ -201,6 +223,15 @@ Through harmonisation and curation we introduced custom column, not present in t
201223

202224
# RNA abundance
203225

204-
The `raw` assay includes RNA abundance in the positive real scale (not transformed with non-linear functions, e.g. log sqrt). Originally CELLxGENE include a mix of scales and tranformations specified in the `x_normalization` column.
226+
The `raw` assay includes RNA abundance in the positive real scale (not transformed with non-linear functions, e.g. log, sqrt). Originally, CELLxGENE included a mix of scales and transformations specified in the `x_normalization` column.
205227

206228
The `cpm` assay includes counts per million.
229+
230+
---
231+
232+
This project has been funded by
233+
234+
- *Silicon Valley Foundation* CZF2019-002443
235+
- *Bioconductor core funding* NIH NHGRI 5U24HG004059-18
236+
- *Victoria Cancer Agency* ECRF21036
237+
- *Australian National Health and Medical Research Council* 1116955

README.md

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,29 @@
11
CuratedAtlasQueryR
22
================
33

4+
<!-- badges: start -->
5+
6+
[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
7+
<!-- badges: end -->
8+
49
`CuratedAtlasQuery` is a query interface that allows the programmatic
510
exploration and retrieval of the harmonised, curated and reannotated
611
CELLxGENE single-cell human cell atlas. Data can be retrieved at cell,
712
sample, or dataset levels based on filtering criteria.
813

14+
<img src="man/figures/logo.png" width="120px" height="139px" />
15+
16+
<img src="man/figures/svcf_logo.jpeg" width="155px" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" />
17+
18+
[website](https://stemangiola.github.io/CuratedAtlasQueryR)
19+
920
# Query interface
1021

11-
<img src="inst/logo.png" width="120px" height="139px" />
22+
## Installation
23+
24+
``` r
25+
devtools::install_github("stemangiola/CuratedAtlasQueryR")
26+
```
1227

1328
## Load the package
1429

@@ -24,8 +39,8 @@ library(stringr)
2439

2540
``` r
2641
get_metadata()
27-
#> # Source: table<metadata> [?? x 56]
28-
#> # Database: sqlite 3.40.0 [/stornext/Home/data/allstaff/m/mangiola.s/.cache/R/CuratedAtlasQueryR/metadata.sqlite]
42+
#> # Source: table</stornext/Home/data/allstaff/m/mangiola.s/.cache/R/CuratedAtlasQueryR/metadata.parquet> [?? x 56]
43+
#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
2944
#> .cell sampl…¹ .sample .samp…² assay assay…³ file_…⁴ cell_…⁵ cell_…⁶ devel…⁷
3045
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
3146
#> 1 AAACCT… 8a0fe0… 5f20d7… D17PrP… 10x … EFO:00… 1e334b… basal … CL:000… 31-yea…
@@ -52,6 +67,21 @@ get_metadata()
5267
``` r
5368
get_metadata() |>
5469
dplyr::distinct(tissue, file_id)
70+
#> # Source: SQL [?? x 2]
71+
#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
72+
#> tissue file_id
73+
#> <chr> <chr>
74+
#> 1 epithelial cell of alveolus of lung (cell culture) 0e8f9ce4-46e5-434e-9ca0-e…
75+
#> 2 peripheral zone of prostate 0f017e66-9c70-4d29-9435-2…
76+
#> 3 transition zone of prostate 0f017e66-9c70-4d29-9435-2…
77+
#> 4 superior frontal gyrus 0fe32cca-d111-42b6-9b93-b…
78+
#> 5 fovea centralis 100c44ed-f754-4d45-8649-d…
79+
#> 6 blood 1042ba0a-98c5-4816-897d-e…
80+
#> 7 telencephalon 3fe53a40-38ff-4f25-b33b-e…
81+
#> 8 kidney 69b67eef-43fd-40ff-8fd3-e…
82+
#> 9 blood 6a044711-8df7-4f88-bad7-f…
83+
#> 10 heart left ventricle 6a579758-a4b4-4f64-be54-4…
84+
#> # … with more rows
5585
```
5686

5787
``` r
@@ -239,7 +269,7 @@ get_metadata() |>
239269
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
240270
```
241271

242-
<img src="inst/NCAM1_figure.png" width="629" />
272+
<img src="man/figures/NCAM1_figure.png" width="629" />
243273

244274
# Cell metadata
245275

@@ -277,7 +307,7 @@ present in the original CELLxGENE metadata
277307

278308
- `tissue_harmonised`: a coarser tissue name for better filtering
279309
- `age_days`: the number of days corresponding to the age
280-
- `cell_type_harmonised`: the consensus call identiti (for immune cells)
310+
- `cell_type_harmonised`: the consensus call identity (for immune cells)
281311
using the original and three novel annotations using Seurat Azimuth
282312
and SingleR
283313
- `confidence_class`: an ordinal class of how confident
@@ -297,7 +327,16 @@ present in the original CELLxGENE metadata
297327

298328
The `raw` assay includes RNA abundance in the positive real scale (not
299329
transformed with non-linear functions, e.g. log, sqrt). Originally,
300-
CELLxGENE include a mix of scales and tranformations specified in the
330+
CELLxGENE included a mix of scales and transformations specified in the
301331
`x_normalization` column.
302332

303333
The `cpm` assay includes counts per million.
334+
335+
------------------------------------------------------------------------
336+
337+
This project has been funded by
338+
339+
- *Silicon Valley Foundation* CZF2019-002443
340+
- *Bioconductor core funding* NIH NHGRI 5U24HG004059-18
341+
- *Victoria Cancer Agency* ECRF21036
342+
- *Australian National Health and Medical Research Council* 1116955

man/figures/bioconductor_logo.jpg

77.3 KB
Loading

man/figures/czi_logo.png

57 KB
Loading
File renamed without changes.

man/figures/svcf_logo.jpeg

18.4 KB
Loading

0 commit comments

Comments
 (0)