stemangiola
diff --git a/‎R/query.R‎
Lines changed: 35 additions & 0 deletions b/‎R/query.R‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎README.Rmd‎
Lines changed: 93 additions & 35 deletions b/‎README.Rmd‎
Lines changed: 93 additions & 35 deletions
diff --git a/‎README.md‎
Lines changed: 37 additions & 46 deletions b/‎README.md‎
Lines changed: 37 additions & 46 deletions
diff --git a/‎man/figures/HLA_A_disease_plot.png‎
63.3 KB b/‎man/figures/HLA_A_disease_plot.png‎
63.3 KB
diff --git a/‎man/figures/HLA_A_tissue_plot.png‎
54.5 KB b/‎man/figures/HLA_A_tissue_plot.png‎
54.5 KB
diff --git a/‎man/figures/nectar_logo.png‎
4.74 KB b/‎man/figures/nectar_logo.png‎
4.74 KB
diff --git a/‎man/get_metadata.Rd‎
Lines changed: 34 additions & 0 deletions b/‎man/get_metadata.Rd‎
Lines changed: 34 additions & 0 deletions
@@ -370,6 +370,41 @@ get_seurat <- function(...) {
 #' @importFrom dplyr tbl
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info
+#' 
+#' @details 
+#' 
+#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
+#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
+#' 
+#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
+#' 
+#' Our representation harmonises the metadata at dataset, sample and cell levels, in a unique coherent database table.
+#' 
+#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
+#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
+#' 
+#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
+#' 
+#' `.sample`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
+#' 
+#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
+#' 
+#' `.cell`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
+#' 
+#' Through harmonisation and curation we introduced custom columns not present in the original CELLxGENE metadata:
+#' 
+#' - `tissue_harmonised`: a coarser tissue name for better filtering
+#' - `age_days`: the number of days corresponding to the age
+#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
+#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
+#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
+#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
+#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
+#' - `sample_id_db`: Sample subdivision for internal use
+#' - `file_id_db`: File subdivision for internal use
+#' - `.sample`: Sample ID
+#' - `.sample_name`: How samples were defined
+#' 
 get_metadata <- function(
     remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
     cache_directory = get_default_cache_dir()
 
@@ -23,12 +23,13 @@ knitr::opts_chunk$set(
 knitr::include_graphics(c("man/figures/logo.png"))
 ```
 
-```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px")}
+```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px", "180px")}
 knitr::include_graphics(c(
   "man/figures/svcf_logo.jpeg", 
   "man/figures/czi_logo.png", 
   "man/figures/bioconductor_logo.jpg",
-    "man/figures/vca_logo.png"
+    "man/figures/vca_logo.png",
+  "man/figures/nectar_logo.png"
 ))
 ```
 
@@ -58,31 +59,14 @@ library(stringr)
 get_metadata()
 ```
 
-### Explore the tissue 
+### Explore the number of datasets per tissue
 
 ```{r}
 get_metadata() |>
-    dplyr::distinct(tissue, file_id) 
+  dplyr::distinct(tissue, dataset_id) |> 
+  dplyr::count(tissue)
 ```
 
-```{r}
-#> # Source:     SQL [?? x 2]
-#> # Database:   sqlite 3.40.0 [[email protected]:5432/metadata]
-#> # Ordered by: desc(n)
-#>    tissue                      n
-#>    <chr>                 <int64>
-#>  1 blood                      47
-#>  2 heart left ventricle       46
-#>  3 cortex of kidney           31
-#>  4 renal medulla              29
-#>  5 lung                       27
-#>  6 liver                      24
-#>  7 middle temporal gyrus      24
-#>  8 kidney                     19
-#>  9 intestine                  18
-#> 10 thymus                     17
-#> # … with more rows
-```
 
 
 ## Download single-cell RNA sequencing counts 
@@ -161,36 +145,110 @@ single_cell_counts
 
 We can gather all natural killer cells and plot the distribution of CD56 (NCAM1) across all tissues
 
+```{r, eval=FALSE, echo=FALSE}
+library(tidySingleCellExperiment)
+library(ggplot2)
+
+# Plots with styling
+# This chunk is neither evaluated nor echoed when knitting; it holds the
+# styled versions of the HLA-A plots (by disease and by tissue).
+
+# Plot by disease
+get_metadata() |>
+  # Filter and subset: keep CD14 monocytes only
+  filter(cell_type_harmonised == "cd14 mono") |>
+  # Exclude one file by its `file_id_db`
+  # NOTE(review): the reason for excluding this file is not recorded here
+  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
+
+  # Get counts per million for HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
+
+  # Add feature to table
+  join_features("HLA-A", shape = "wide") |>
+
+  # Rank x axis: attach the per-disease median so categories can be ordered.
+  # `na.rm = TRUE` so that missing values do not turn a group's median into NA
+  as_tibble() |>
+  with_groups(
+    disease,
+    ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))
+  ) |>
+
+  # Plot
+  ggplot(aes(
+    fct_reorder(disease, median_count, .desc = TRUE),
+    `HLA.A`,
+    color = file_id
+  )) +
+  geom_jitter(shape = ".") +
+
+  # Style
+  guides(color = "none") +
+  scale_y_log10() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
+  xlab("Disease") +
+  ggtitle("HLA-A in CD14 monocytes by disease")
+
+# Plot by tissue
+get_metadata() |>
+  # Filter and subset: keep CD14 monocytes only
+  filter(cell_type_harmonised == "cd14 mono") |>
+  # Exclude one file by its `file_id_db`
+  # NOTE(review): the reason for excluding this file is not recorded here
+  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
+
+  # Get counts per million for HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
+
+  # Add feature to table
+  join_features("HLA-A", shape = "wide") |>
+
+  # Rank x axis: attach the per-tissue median so categories can be ordered.
+  # `na.rm = TRUE` so that missing values do not turn a group's median into NA
+  as_tibble() |>
+  with_groups(
+    tissue_harmonised,
+    ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))
+  ) |>
+
+  # Plot
+  ggplot(aes(
+    fct_reorder(tissue_harmonised, median_count, .desc = TRUE),
+    `HLA.A`,
+    color = file_id
+  )) +
+  geom_jitter(shape = ".") +
+
+  # Style
+  guides(color = "none") +
+  scale_y_log10() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
+  xlab("Tissue") +
+  ggtitle("HLA-A in CD14 monocytes by tissue")
+
+```
+
 ```{r, eval=FALSE}
 library(tidySingleCellExperiment)
 library(ggplot2)
 
+get_metadata() |>
+  # Filter and subset
+  filter(cell_type_harmonised=="cd14 mono") |>
+
+  # Get counts per million for HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |> 
+  
+  # Plot
+  join_features("HLA-A", shape = "wide") |> 
+  ggplot(aes( disease, `HLA.A`,color = file_id)) +
+  geom_jitter(shape=".") 
+```
+
+```{r, echo=FALSE, message=FALSE, warning=FALSE}
+knitr::include_graphics("man/figures/HLA_A_disease_plot.png")
+```
+
+```{r, eval=FALSE}
+
 get_metadata() |> 
     
   # Filter and subset
   filter(cell_type_harmonised=="nk") |> 
-  select(cell_, file_id_db, disease, file_id, tissue_harmonised) |> 
-  
+
   # Get counts per million for NCAM1 gene 
   get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |> 
 
-	# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
-  join_features("NCAM1", shape = "wide") |> 
-	
 	# Plot
+  join_features("NCAM1", shape = "wide") |> 
   ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
-  geom_jitter(shape=".") +
-	
-	# Style
-  guides(color="none") +
-  scale_y_log10() +
-  theme_bw() +
-  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
+  geom_jitter(shape=".") 
 
 ```
 
 ```{r, echo=FALSE, message=FALSE, warning=FALSE}
-knitr::include_graphics("man/figures/NCAM1_figure.png")
+knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
 ```
 
 # Cell metadata
 
@@ -13,7 +13,7 @@ sample, or dataset levels based on filtering criteria.
 
 <img src="man/figures/logo.png" width="120x" height="139px" />
 
-<img src="man/figures/svcf_logo.jpeg" width="155x" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" />
+<img src="man/figures/svcf_logo.jpeg" width="155x" height="58px" /><img src="man/figures/czi_logo.png" width="129px" height="58px" /><img src="man/figures/bioconductor_logo.jpg" width="202px" height="58px" /><img src="man/figures/vca_logo.png" width="219px" height="58px" /><img src="man/figures/nectar_logo.png" width="180px" height="58px" />
 
 [website](https://stemangiola.github.io/CuratedAtlasQueryR)
 
@@ -62,44 +62,27 @@ get_metadata()
 #> #   is_primary_data_x <chr>, organism <chr>, organism_ontology_term_id <chr>, …
 ```
 
-### Explore the tissue
+### Explore the number of datasets per tissue
 
 ``` r
 get_metadata() |>
-    dplyr::distinct(tissue, file_id) 
+  dplyr::distinct(tissue, dataset_id) |> 
+  dplyr::count(tissue)
 #> # Source:   SQL [?? x 2]
 #> # Database: DuckDB 0.7.0 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.0/:memory:]
-#>    tissue           file_id                             
-#>    <chr>            <chr>                               
-#>  1 renal medulla    52cb5191-2976-4077-ba88-47c76692bef0
-#>  2 pancreas         53329245-06f3-45a4-bf15-ed61f628ff83
-#>  3 blood            5500774a-6ebe-4ddf-adce-90302b7cd007
-#>  4 blood            550760cb-ede9-4e6b-b6ab-7152f2ce29e1
-#>  5 intestine        556bb449-bbef-43d3-9487-87031fc0decb
-#>  6 lung             56e0359f-ee8d-4ba5-a51d-159a183643e5
-#>  7 adrenal gland    56e0359f-ee8d-4ba5-a51d-159a183643e5
-#>  8 pleural effusion 56e0359f-ee8d-4ba5-a51d-159a183643e5
-#>  9 liver            56e0359f-ee8d-4ba5-a51d-159a183643e5
-#> 10 lymph node       56e0359f-ee8d-4ba5-a51d-159a183643e5
-#> # … with more rows
-```
+#>    tissue                          n
+#>    <chr>                       <dbl>
+#>  1 peripheral zone of prostate    10
+#>  2 transition zone of prostate    10
+#>  3 blood                          47
+#>  4 intestine                      18
+#>  5 middle temporal gyrus          24
+#>  6 heart left ventricle           46
+#>  7 apex of heart                  16
+#>  8 heart right ventricle          16
+#>  9 left cardiac atrium             7
+#> 10 interventricular septum        16
 
-``` r
-#> # Source:     SQL [?? x 2]
-#> # Database:   sqlite 3.40.0 [[email protected]:5432/metadata]
-#> # Ordered by: desc(n)
-#>    tissue                      n
-#>    <chr>                 <int64>
-#>  1 blood                      47
-#>  2 heart left ventricle       46
-#>  3 cortex of kidney           31
-#>  4 renal medulla              29
-#>  5 lung                       27
-#>  6 liver                      24
-#>  7 middle temporal gyrus      24
-#>  8 kidney                     19
-#>  9 intestine                  18
-#> 10 thymus                     17
 #> # … with more rows
 ```
 
@@ -243,30 +226,38 @@ We can gather all natural killer cells and plot the distribution of CD56
 library(tidySingleCellExperiment)
 library(ggplot2)
 
+get_metadata() |>
+  # Filter and subset
+  filter(cell_type_harmonised=="cd14 mono") |>
+
+  # Get counts per million for HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |> 
+  
+  # Plot
+  join_features("HLA-A", shape = "wide") |> 
+  ggplot(aes( disease, `HLA.A`,color = file_id)) +
+  geom_jitter(shape=".") 
+```
+
+<img src="man/figures/HLA_A_disease_plot.png" width="497" />
+
+``` r
+
 get_metadata() |> 
 
   # Filter and subset
   filter(cell_type_harmonised=="nk") |> 
-  select(cell_, file_id_db, disease, file_id, tissue_harmonised) |> 
-  
+
   # Get counts per million for NCAM1 gene 
   get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |> 
 
-    # Get transcriptional abundance for plotting with `tidySingleCellExperiment`
-  join_features("NCAM1", shape = "wide") |> 
-    
     # Plot
+  join_features("NCAM1", shape = "wide") |> 
   ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
-  geom_jitter(shape=".") +
-    
-    # Style
-  guides(color="none") +
-  scale_y_log10() +
-  theme_bw() +
-  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
+  geom_jitter(shape=".") 
 ```
 
-<img src="man/figures/NCAM1_figure.png" width="629" />
+<img src="man/figures/HLA_A_tissue_plot.png" width="499" />
 
 # Cell metadata