Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 4b08621

Browse files
authored
Merge pull request #77 from stemangiola/README-and-docs
update docs and README
2 parents 02c8a94 + 48a370c commit 4b08621

File tree

8 files changed

+321
-132
lines changed

8 files changed

+321
-132
lines changed

R/query.R

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,41 @@ get_seurat <- function(...) {
370370
#' @importFrom dplyr tbl
371371
#' @importFrom httr progress
372372
#' @importFrom cli cli_alert_info
373+
#'
374+
#' @details
375+
#'
376+
#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
377+
#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
378+
#'
379+
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
380+
#'
381+
#' Our representation harmonises the metadata at dataset, sample and cell levels, in a single coherent database table.
382+
#'
383+
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
384+
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
385+
#'
386+
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
387+
#'
388+
#' `.sample`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
389+
#'
390+
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
391+
#'
392+
#' `.cell`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
393+
#'
394+
#' Through harmonisation and curation we introduced custom columns, not present in the original CELLxGENE metadata
395+
#'
396+
#' - `tissue_harmonised`: a coarser tissue name for better filtering
397+
#' - `age_days`: the number of days corresponding to the age
398+
#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
399+
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
400+
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
401+
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
402+
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
403+
#' - `sample_id_db`: Sample subdivision for internal use
404+
#' - `file_id_db`: File subdivision for internal use
405+
#' - `.sample`: Sample ID
406+
#' - `.sample_name`: How samples were defined
407+
#'
373408
get_metadata <- function(
374409
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
375410
cache_directory = get_default_cache_dir()

README.Rmd

Lines changed: 93 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,13 @@ knitr::opts_chunk$set(
2323
knitr::include_graphics(c("man/figures/logo.png"))
2424
```
2525

26-
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px")}
26+
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155px", "129px", "202px", "219px", "180px")}
2727
knitr::include_graphics(c(
2828
"man/figures/svcf_logo.jpeg",
2929
"man/figures/czi_logo.png",
3030
"man/figures/bioconductor_logo.jpg",
31-
"man/figures/vca_logo.png"
31+
"man/figures/vca_logo.png",
32+
"man/figures/nectar_logo.png"
3233
))
3334
```
3435

@@ -58,31 +59,14 @@ library(stringr)
5859
get_metadata()
5960
```
6061

61-
### Explore the tissue
62+
### Explore the number of datasets per tissue
6263

6364
```{r}
6465
get_metadata() |>
65-
dplyr::distinct(tissue, file_id)
66+
dplyr::distinct(tissue, dataset_id) |>
67+
dplyr::count(tissue)
6668
```
6769

68-
```{r}
69-
#> # Source: SQL [?? x 2]
70-
#> # Database: sqlite 3.40.0 [[email protected]:5432/metadata]
71-
#> # Ordered by: desc(n)
72-
#> tissue n
73-
#> <chr> <int64>
74-
#> 1 blood 47
75-
#> 2 heart left ventricle 46
76-
#> 3 cortex of kidney 31
77-
#> 4 renal medulla 29
78-
#> 5 lung 27
79-
#> 6 liver 24
80-
#> 7 middle temporal gyrus 24
81-
#> 8 kidney 19
82-
#> 9 intestine 18
83-
#> 10 thymus 17
84-
#> # … with more rows
85-
```
8670

8771

8872
## Download single-cell RNA sequencing counts
@@ -161,36 +145,110 @@ single_cell_counts
161145

162146
We can gather all natural killer cells and plot the distribution of CD56 (NCAM1) across all tissues
163147

148+
```{r, eval=FALSE, echo=FALSE}
149+
library(tidySingleCellExperiment)
library(ggplot2)

# Styled versions of the HLA-A plots for CD14 monocytes.
# The enclosing chunk is declared with eval=FALSE and echo=FALSE, so this
# code is kept for reference only and is not run when the README is knitted.

# Plot by disease
get_metadata() |>
  # Filter and subset
  filter(cell_type_harmonised == "cd14 mono") |>
  # NOTE(review): a single file is excluded here; the reason is not recorded
  # alongside the code -- confirm before reusing this filter.
  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>

  # Get counts per million for the HLA-A gene
  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>

  # Add feature to table
  join_features("HLA-A", shape = "wide") |>

  # Rank x axis by the per-disease median.
  # `na.rm` is the argument `median()` recognises; the previous spelling
  # (`rm.na`) was silently ignored, so missing values were never dropped.
  as_tibble() |>
  with_groups(
    disease,
    ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))
  ) |>

  # Plot
  ggplot(aes(
    fct_reorder(disease, median_count, .desc = TRUE),
    `HLA.A`,
    color = file_id
  )) +
  geom_jitter(shape = ".") +

  # Style
  guides(color = "none") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  xlab("Disease") +
  ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")

# Plot by tissue
get_metadata() |>
  # Filter and subset
  filter(cell_type_harmonised == "cd14 mono") |>
  # NOTE(review): same single-file exclusion as in the disease plot -- confirm.
  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>

  # Get counts per million for the HLA-A gene
  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>

  # Add feature to table
  join_features("HLA-A", shape = "wide") |>

  # Rank x axis by the per-tissue median (missing values dropped via `na.rm`)
  as_tibble() |>
  with_groups(
    tissue_harmonised,
    ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))
  ) |>

  # Plot
  ggplot(aes(
    fct_reorder(tissue_harmonised, median_count, .desc = TRUE),
    `HLA.A`,
    color = file_id
  )) +
  geom_jitter(shape = ".") +

  # Style
  guides(color = "none") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  # The x axis shows harmonised tissue names, so label it accordingly
  xlab("Tissue") +
  ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")
209+
210+
```
211+
164212
```{r, eval=FALSE}
165213
library(tidySingleCellExperiment)
166214
library(ggplot2)
167215
216+
get_metadata() |>
217+
# Filter and subset
218+
filter(cell_type_harmonised=="cd14 mono") |>
219+
220+
# Get counts per million for the HLA-A gene
221+
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
222+
223+
# Plot
224+
join_features("HLA-A", shape = "wide") |>
225+
ggplot(aes( disease, `HLA.A`,color = file_id)) +
226+
geom_jitter(shape=".")
227+
```
228+
229+
```{r, echo=FALSE, message=FALSE, warning=FALSE}
230+
knitr::include_graphics("man/figures/HLA_A_disease_plot.png")
231+
```
232+
233+
```{r, eval=FALSE}
234+
168235
get_metadata() |>
169236
170237
# Filter and subset
171238
filter(cell_type_harmonised=="nk") |>
172-
select(.cell, file_id_db, disease, file_id, tissue_harmonised) |>
173-
239+
174240
# Get counts per million for NCAM1 gene
175241
get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |>
176242
177-
# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
178-
join_features("NCAM1", shape = "wide") |>
179-
180243
# Plot
244+
join_features("NCAM1", shape = "wide") |>
181245
ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
182-
geom_jitter(shape=".") +
183-
184-
# Style
185-
guides(color="none") +
186-
scale_y_log10() +
187-
theme_bw() +
188-
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
246+
geom_jitter(shape=".")
189247
190248
```
191249

192250
```{r, echo=FALSE, message=FALSE, warning=FALSE}
193-
knitr::include_graphics("man/figures/NCAM1_figure.png")
251+
knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
194252
```
195253

196254
# Cell metadata

0 commit comments

Comments
 (0)