Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit f135421

Browse files
committed
Merge branch 'change-cell-name-convention' of github.com:stemangiola/HCAquery into change-cell-name-convention
2 parents 6b7ed88 + 23de092 commit f135421

File tree

8 files changed

+290
-97
lines changed

8 files changed

+290
-97
lines changed

R/query.R

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,41 @@ get_seurat <- function(...) {
370370
#' @importFrom dplyr tbl
371371
#' @importFrom httr progress
372372
#' @importFrom cli cli_alert_info
373+
#'
374+
#' @details
375+
#'
376+
#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
377+
#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
378+
#'
379+
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
380+
#'
381+
#' Our representation harmonises the metadata at dataset, sample and cell levels, in a unique coherent database table.
382+
#'
383+
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
384+
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
385+
#'
386+
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
387+
#'
388+
#' `.sample`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
389+
#'
390+
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
391+
#'
392+
#' `.cell`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
393+
#'
394+
#' Through harmonisation and curation we introduced custom columns, not present in the original CELLxGENE metadata:
395+
#'
396+
#' - `tissue_harmonised`: a coarser tissue name for better filtering
397+
#' - `age_days`: the number of days corresponding to the age
398+
#' - `cell_type_harmonised`: the consensus cell identity call (for immune cells), derived from the original annotation and three novel annotations obtained with Seurat Azimuth and SingleR
399+
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
400+
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
401+
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
402+
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
403+
#' - `sample_id_db`: Sample subdivision for internal use
404+
#' - `file_id_db`: File subdivision for internal use
405+
#' - `.sample`: Sample ID
406+
#' - `.sample_name`: How samples were defined
407+
#'
373408
get_metadata <- function(
374409
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
375410
cache_directory = get_default_cache_dir()

README.Rmd

Lines changed: 93 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,13 @@ knitr::opts_chunk$set(
2323
knitr::include_graphics(c("man/figures/logo.png"))
2424
```
2525

26-
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px")}
26+
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px", "180px")}
2727
knitr::include_graphics(c(
2828
"man/figures/svcf_logo.jpeg",
2929
"man/figures/czi_logo.png",
3030
"man/figures/bioconductor_logo.jpg",
31-
"man/figures/vca_logo.png"
31+
"man/figures/vca_logo.png",
32+
"man/figures/nectar_logo.png"
3233
))
3334
```
3435

@@ -58,31 +59,14 @@ library(stringr)
5859
get_metadata()
5960
```
6061

61-
### Explore the tissue
62+
### Explore the number of datasets per tissue
6263

6364
```{r}
6465
get_metadata() |>
65-
dplyr::distinct(tissue, file_id)
66+
dplyr::distinct(tissue, dataset_id) |>
67+
dplyr::count(tissue)
6668
```
6769

68-
```{r}
69-
#> # Source: SQL [?? x 2]
70-
#> # Database: sqlite 3.40.0 [[email protected]:5432/metadata]
71-
#> # Ordered by: desc(n)
72-
#> tissue n
73-
#> <chr> <int64>
74-
#> 1 blood 47
75-
#> 2 heart left ventricle 46
76-
#> 3 cortex of kidney 31
77-
#> 4 renal medulla 29
78-
#> 5 lung 27
79-
#> 6 liver 24
80-
#> 7 middle temporal gyrus 24
81-
#> 8 kidney 19
82-
#> 9 intestine 18
83-
#> 10 thymus 17
84-
#> # … with more rows
85-
```
8670

8771

8872
## Download single-cell RNA sequencing counts
@@ -161,36 +145,110 @@ single_cell_counts
161145

162146
We can gather all natural killer cells and plot the distribution of CD56 (NCAM1) across all tissues
163147

148+
```{r, eval=FALSE, echo=FALSE}
149+
library(tidySingleCellExperiment)
150+
library(ggplot2)
151+
152+
# Plots with styling
153+
154+
# Plot by disease
155+
get_metadata() |>
156+
# Filter and subset
157+
filter(cell_type_harmonised=="cd14 mono") |>
158+
filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
159+
160+
# Get counts per million for HLA-A gene
161+
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
162+
163+
# Add feature to table
164+
join_features("HLA-A", shape = "wide") |>
165+
166+
# Rank x axis
167+
as_tibble() |>
168+
with_groups(disease, ~ .x |> mutate(median_count = median(`HLA.A`, rm.na=TRUE))) |>
169+
170+
# Plot
171+
ggplot(aes( fct_reorder(disease, median_count,.desc = TRUE), `HLA.A`,color = file_id)) +
172+
geom_jitter(shape=".") +
173+
174+
# Style
175+
guides(color="none") +
176+
scale_y_log10() +
177+
theme_bw() +
178+
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
179+
xlab("Disease") +
180+
ggtitle("HLA-A in CD14 monocytes by disease")
181+
182+
# Plot by tissue
183+
get_metadata() |>
184+
# Filter and subset
185+
filter(cell_type_harmonised=="cd14 mono") |>
186+
filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
187+
188+
# Get counts per million for HLA-A gene
189+
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
190+
191+
# Add feature to table
192+
join_features("HLA-A", shape = "wide") |>
193+
194+
# Rank x axis
195+
as_tibble() |>
196+
with_groups(tissue_harmonised, ~ .x |> mutate(median_count = median(`HLA.A`, rm.na=TRUE))) |>
197+
198+
# Plot
199+
ggplot(aes( fct_reorder(tissue_harmonised, median_count,.desc = TRUE), `HLA.A`,color = file_id)) +
200+
geom_jitter(shape=".") +
201+
202+
# Style
203+
guides(color="none") +
204+
scale_y_log10() +
205+
theme_bw() +
206+
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
207+
xlab("Tissue") +
208+
ggtitle("HLA-A in CD14 monocytes by tissue")
209+
210+
```
211+
164212
```{r, eval=FALSE}
165213
library(tidySingleCellExperiment)
166214
library(ggplot2)
167215
216+
get_metadata() |>
217+
# Filter and subset
218+
filter(cell_type_harmonised=="cd14 mono") |>
219+
220+
# Get counts per million for HLA-A gene
221+
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
222+
223+
# Plot
224+
join_features("HLA-A", shape = "wide") |>
225+
ggplot(aes( disease, `HLA.A`,color = file_id)) +
226+
geom_jitter(shape=".")
227+
```
228+
229+
```{r, echo=FALSE, message=FALSE, warning=FALSE}
230+
knitr::include_graphics("man/figures/HLA_A_disease_plot.png")
231+
```
232+
233+
```{r, eval=FALSE}
234+
168235
get_metadata() |>
169236
170237
# Filter and subset
171238
filter(cell_type_harmonised=="nk") |>
172-
select(cell_, file_id_db, disease, file_id, tissue_harmonised) |>
173-
239+
174240
# Get counts per million for NCAM1 gene
175241
get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |>
176242
177-
# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
178-
join_features("NCAM1", shape = "wide") |>
179-
180243
# Plot
244+
join_features("NCAM1", shape = "wide") |>
181245
ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
182-
geom_jitter(shape=".") +
183-
184-
# Style
185-
guides(color="none") +
186-
scale_y_log10() +
187-
theme_bw() +
188-
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
246+
geom_jitter(shape=".")
189247
190248
```
191249

192250
```{r, echo=FALSE, message=FALSE, warning=FALSE}
193-
knitr::include_graphics("man/figures/NCAM1_figure.png")
251+
knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
194252
```
195253

196254
# Cell metadata

README.md

Lines changed: 37 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ sample, or dataset levels based on filtering criteria.
1313

1414
<img src="man/figures/logo.png" width="120x" height="139px" />
1515

16-
<img src="man/figures/svcf_logo.jpeg" width="155x" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" />
16+
<img src="man/figures/svcf_logo.jpeg" width="155x" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" /><img src="man/figures/nectar_logo.png" width="180px" height="58px" />
1717

1818
[website](https://stemangiola.github.io/CuratedAtlasQueryR)
1919

@@ -62,44 +62,27 @@ get_metadata()
6262
#> # is_primary_data_x <chr>, organism <chr>, organism_ontology_term_id <chr>, …
6363
```
6464

65-
### Explore the tissue
65+
### Explore the number of datasets per tissue
6666

6767
``` r
6868
get_metadata() |>
69-
dplyr::distinct(tissue, file_id)
69+
dplyr::distinct(tissue, dataset_id) |>
70+
dplyr::count(tissue)
7071
#> # Source: SQL [?? x 2]
7172
#> # Database: DuckDB 0.7.0 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
72-
#> tissue file_id
73-
#> <chr> <chr>
74-
#> 1 renal medulla 52cb5191-2976-4077-ba88-47c76692bef0
75-
#> 2 pancreas 53329245-06f3-45a4-bf15-ed61f628ff83
76-
#> 3 blood 5500774a-6ebe-4ddf-adce-90302b7cd007
77-
#> 4 blood 550760cb-ede9-4e6b-b6ab-7152f2ce29e1
78-
#> 5 intestine 556bb449-bbef-43d3-9487-87031fc0decb
79-
#> 6 lung 56e0359f-ee8d-4ba5-a51d-159a183643e5
80-
#> 7 adrenal gland 56e0359f-ee8d-4ba5-a51d-159a183643e5
81-
#> 8 pleural effusion 56e0359f-ee8d-4ba5-a51d-159a183643e5
82-
#> 9 liver 56e0359f-ee8d-4ba5-a51d-159a183643e5
83-
#> 10 lymph node 56e0359f-ee8d-4ba5-a51d-159a183643e5
84-
#> # … with more rows
85-
```
73+
#> tissue n
74+
#> <chr> <dbl>
75+
#> 1 peripheral zone of prostate 10
76+
#> 2 transition zone of prostate 10
77+
#> 3 blood 47
78+
#> 4 intestine 18
79+
#> 5 middle temporal gyrus 24
80+
#> 6 heart left ventricle 46
81+
#> 7 apex of heart 16
82+
#> 8 heart right ventricle 16
83+
#> 9 left cardiac atrium 7
84+
#> 10 interventricular septum 16
8685

87-
``` r
88-
#> # Source: SQL [?? x 2]
89-
#> # Database: sqlite 3.40.0 [[email protected]:5432/metadata]
90-
#> # Ordered by: desc(n)
91-
#> tissue n
92-
#> <chr> <int64>
93-
#> 1 blood 47
94-
#> 2 heart left ventricle 46
95-
#> 3 cortex of kidney 31
96-
#> 4 renal medulla 29
97-
#> 5 lung 27
98-
#> 6 liver 24
99-
#> 7 middle temporal gyrus 24
100-
#> 8 kidney 19
101-
#> 9 intestine 18
102-
#> 10 thymus 17
10386
#> # … with more rows
10487
```
10588

@@ -243,30 +226,38 @@ We can gather all natural killer cells and plot the distribution of CD56
243226
library(tidySingleCellExperiment)
244227
library(ggplot2)
245228

229+
get_metadata() |>
230+
# Filter and subset
231+
filter(cell_type_harmonised=="cd14 mono") |>
232+
233+
# Get counts per million for HLA-A gene
234+
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
235+
236+
# Plot
237+
join_features("HLA-A", shape = "wide") |>
238+
ggplot(aes( disease, `HLA.A`,color = file_id)) +
239+
geom_jitter(shape=".")
240+
```
241+
242+
<img src="man/figures/HLA_A_disease_plot.png" width="497" />
243+
244+
``` r
245+
246246
get_metadata() |>
247247

248248
# Filter and subset
249249
filter(cell_type_harmonised=="nk") |>
250-
select(cell_, file_id_db, disease, file_id, tissue_harmonised) |>
251-
250+
252251
# Get counts per million for NCAM1 gene
253252
get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |>
254253

255-
# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
256-
join_features("NCAM1", shape = "wide") |>
257-
258254
# Plot
255+
join_features("NCAM1", shape = "wide") |>
259256
ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
260-
geom_jitter(shape=".") +
261-
262-
# Style
263-
guides(color="none") +
264-
scale_y_log10() +
265-
theme_bw() +
266-
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
257+
geom_jitter(shape=".")
267258
```
268259

269-
<img src="man/figures/NCAM1_figure.png" width="629" />
260+
<img src="man/figures/HLA_A_tissue_plot.png" width="499" />
270261

271262
# Cell metadata
272263

man/figures/HLA_A_disease_plot.png

63.3 KB
Loading

man/figures/HLA_A_tissue_plot.png

54.5 KB
Loading

man/figures/nectar_logo.png

4.74 KB
Loading

man/get_metadata.Rd

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)