Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit f720564

Browse files
authored
Merge pull request #11 from stemangiola/edit_from_local
updates in the local computer
2 parents 7fd65c9 + e1e7dcc commit f720564

File tree

4 files changed

+227
-12
lines changed

4 files changed

+227
-12
lines changed

dev/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ slurm.wrapper
66
ERROR-1.output
77
core.26183
88
curated_annotation.rds
9+
metadata.sqlite

dev/annotation_harmonise.R

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ library(tidyseurat)
1515
library(celldex)
1616
library(SingleR)
1717
library(glmGamPoi)
18+
library(stringr)
19+
library(purrr)
20+
1821
# source("utility.R")
1922

2023
clean_cell_types = function(.x){
@@ -557,24 +560,66 @@ curated_annotation =
557560
cell_type_harmonised
558561
)) |>
559562
select(-.cell_combined) |>
560-
select(-cell_type_Monaco, -score)
561-
563+
select(-cell_type_Monaco, -score) |>
564+
565+
# Replace NA
566+
mutate(cell_type_harmonised = if_else(is.na(cell_type_harmonised), "immune_unclassified", cell_type_harmonised)) |>
567+
568+
# Add non immune
569+
select(-cell_type) |>
570+
full_join(
571+
get_metadata("dev/metadata.SQLite") |>
572+
select(.cell, .sample) |>
573+
as_tibble()
574+
) |>
575+
mutate(cell_type_harmonised = if_else(is.na(cell_type_harmonised), "non_immune", cell_type_harmonised))
576+
577+
# Save
562578
job::job({
563579
curated_annotation |>
580+
mutate(across(contains("cell_"), as.factor)) |>
564581
saveRDS("dev/curated_annotation.rds")
565582
})
566583

567584

568585
cell_metadata_with_harmonised_annotation =
569586
curated_annotation |>
587+
mutate(.cell = .cell |> str_remove(.sample) |> str_remove("_$")) |>
570588
left_join(
571-
get_metadata() |>
572-
select(.cell, .sample, file_id, file_id_db, tissue) |>
573-
as_tibble()
574-
)
575-
# xx = x |>
576-
# filter(cell_type_harmonised == "monocytes") |>
577-
# get_SingleCellExperiment()
589+
get_metadata("dev/metadata.SQLite") |>
590+
select(.cell, .sample, file_id, file_id_db, tissue, disease, is_primary_data.x, is_primary_data.y, name) |>
591+
left_join(read_csv("dev/tissue_label_curated.csv"), copy=TRUE) |>
592+
as_tibble(),
593+
by=c(".cell", ".sample")
594+
) |>
595+
distinct() |>
596+
597+
# Drop secondary data often cell type subsets
598+
filter(is_primary_data.x==TRUE)
599+
600+
# Filter samples that do not have immune
601+
cell_metadata_with_harmonised_annotation =
602+
cell_metadata_with_harmonised_annotation |>
603+
nest(data = -c(.sample, tissue_harmonised)) |>
604+
filter(map_int(data, ~ .x |> filter(cell_type_harmonised != "non_immune") |> nrow()) > 0) |>
605+
unnest(data)
606+
607+
# Tissue with no immune
608+
cell_metadata_with_harmonised_annotation =
609+
cell_metadata_with_harmonised_annotation |>
610+
filter(disease == "normal") |>
611+
612+
# Filter tissues
613+
nest(data = -c(cell_type_harmonised, tissue_harmonised)) |>
614+
add_count(tissue_harmonised, name = "n_cell_type_in_tissue") |>
615+
filter(n_cell_type_in_tissue>=18) |>
616+
add_count(cell_type_harmonised, name = "n_tissue_in_cell_type") |>
617+
filter(n_tissue_in_cell_type>=26) |>
618+
unnest(data)
619+
620+
621+
cell_metadata_with_harmonised_annotation |>
622+
saveRDS("dev/cell_metadata_with_harmonised_annotation.rds")
578623

579624

580625

@@ -590,9 +635,24 @@ cell_metadata_with_harmonised_annotation |> anti_join(annotated_samples) |> di
590635
# Histo of annotation
591636
cell_metadata_with_harmonised_annotation |> filter(!is.na(cell_type_harmonised)) |> distinct( cell_type_harmonised, .sample) |> count(.sample) |> pull(n) |> hist(breaks=30)
592637

638+
# NEEDED CHANGES - SHOULD ANIMAL CELLS AND HEMATOPOIETIC CELLS BE EVALUATED AS IMMUNE FOR REANNOTATION?
639+
593640
# Tissue with no immune
594-
cell_metadata_with_harmonised_annotation |> filter(!is.na(cell_type_harmonised)) |> distinct( cell_type_harmonised, tissue_harmonised) |> count(cell_type_harmonised) |> arrange(n) |> print(n=99)
595641

596-
cell_metadata_with_harmonised_annotation |> filter(!is.na(cell_type_harmonised)) |> distinct( cell_type_harmonised, tissue_harmonised) |> count(tissue_harmonised) |> arrange(n) |> print(n=99)
642+
temp_tissue =
643+
cell_metadata_with_harmonised_annotation |>
644+
filter(cell_type_harmonised=="pdc") |>
645+
pull(tissue_harmonised)
646+
647+
cell_metadata_with_harmonised_annotation |>
648+
filter(!tissue_harmonised %in% temp_tissue) |>
649+
distinct(tissue_harmonised)
650+
651+
652+
cell_metadata_with_harmonised_annotation |>
653+
filter(disease == "normal") |>
654+
filter(!is.na(cell_type_harmonised)) |>
655+
distinct( cell_type_harmonised, tissue_harmonised) |>
656+
count(tissue_harmonised) |> arrange(n) |> print(n=99)
597657

598658

dev/composition.R

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
library(tidyverse)
2+
library(forcats)
3+
library(HCAquery)
4+
library(dittoSeq)
5+
library(sccomp)
6+
7+
source("https://gist.githubusercontent.com/stemangiola/fc67b08101df7d550683a5100106561c/raw/a0853a1a4e8a46baf33bad6268b09001d49faf51/ggplot_theme_multipanel")
8+
9+
cell_metadata_with_harmonised_annotation = readRDS("dev/cell_metadata_with_harmonised_annotation.rds")
10+
11+
data_for_plot_1 =
12+
cell_metadata_with_harmonised_annotation |>
13+
14+
left_join(
15+
get_metadata("dev/metadata.SQLite") |>
16+
select(.cell, is_primary_data.y, name, cell_type, file_id, assay) |>
17+
as_tibble()
18+
)
19+
20+
# - Number of datasets per tissue
21+
plot_count_dataset =
22+
data_for_plot_1 |>
23+
distinct(file_id, tissue_harmonised) |>
24+
count(tissue_harmonised, name = "Number of datasets") |>
25+
ggplot(aes(fct_reorder(tissue_harmonised, desc(`Number of datasets`)), `Number of datasets`)) +
26+
geom_bar(stat = "identity") +
27+
xlab("Tissue") +
28+
ylab("Number of datasets (log10)") +
29+
scale_y_log10() +
30+
theme_multipanel +
31+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
32+
33+
# - Number of samples per tissue
34+
plot_sample_dataset =
35+
data_for_plot_1 |>
36+
distinct(.sample, tissue_harmonised) |>
37+
count(tissue_harmonised, name = "Number of samples") |>
38+
ggplot(aes(fct_reorder(tissue_harmonised, desc(`Number of samples`)), `Number of samples`)) +
39+
geom_bar(stat = "identity") +
40+
xlab("Tissue") +
41+
ylab("Number of samples (log10)") +
42+
scale_y_log10() +
43+
theme_multipanel +
44+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
45+
46+
# - Histogram of cells per sample
47+
plot_cell_dataset =
48+
data_for_plot_1 |>
49+
count(.sample, assay) |>
50+
ggplot(aes(n)) +
51+
geom_histogram(aes(fill=assay), bins = 100) +
52+
scale_fill_manual(values = dittoSeq::dittoColors()) +
53+
xlab("Number of cells in sample (log10)") +
54+
ylab("Count instances") +
55+
scale_x_log10() +
56+
theme_multipanel
57+
58+
# - Immune proportion per tissue
59+
data_for_immune_proportion =
60+
cell_metadata_with_harmonised_annotation |>
61+
62+
left_join(
63+
get_metadata("dev/metadata.SQLite") |>
64+
select(.cell, is_primary_data.y, name, cell_type, file_id) |>
65+
as_tibble()
66+
) |>
67+
68+
# # Filter only whole tissue
69+
# filter(
70+
# !name |> str_detect(regex('immune', ignore_case = T)) |
71+
# tissue_harmonised %in% c("blood", "lymph node", "bone") |
72+
# is_primary_data.y == "PRIMARY"
73+
# ) |>
74+
75+
# Filter Immune enriched dataset
76+
filter(file_id != "e756c34a-abe7-4822-9a35-55ef12270247") |>
77+
filter(file_id != "ca4a7d56-739b-4e3c-8ecd-28704914cc14") |>
78+
filter(file_id != "59dfc135-19c1-4380-a9e8-958908273756" | tissue_harmonised != "intestine") |>
79+
80+
# nest(data = -c(.sample, tissue_harmonised)) |>
81+
# filter(map_int(data, ~ .x |> filter(cell_type_harmonised == "non_immune") |> nrow()) > 0 | tissue_harmonised %in% c("blood", "lymph node", "bone")) |>
82+
# unnest(data) |>
83+
84+
mutate(is_immune = cell_type_harmonised!="non_immune") |>
85+
86+
# Fix hematopoietic misclassificsation
87+
mutate(is_immune = if_else(!is_immune & cell_type |> str_detect("hematopoietic"), TRUE, is_immune)) |>
88+
89+
# Filter out
90+
filter(!cell_type |> str_detect("erythrocyte")) |>
91+
filter(!cell_type |> str_detect("platelet"))
92+
93+
data_for_immune_proportion_count =
94+
data_for_immune_proportion |>
95+
96+
# Stats
97+
count(.sample, tissue_harmonised, is_immune, file_id) |>
98+
with_groups(.sample, ~ .x |> mutate(proportion = n/sum(n), sum = sum(n))) |>
99+
filter(is_immune) |>
100+
with_groups(tissue_harmonised, ~ .x |> mutate( median_proportion = mean(proportion)))
101+
102+
# Format labels without the leading zero of a decimal fraction
# ("0.5" -> ".5", "-0.25" -> "-.25"); every other value is returned unchanged.
# The zero is dropped only when it starts the number (after an optional minus
# sign) and is directly followed by a literal decimal point, so values such as
# "10.5" or "100" are not mangled.
dropLeadingZero <- function(l) {
  sub("^(-?)0(?=\\.)", "\\1", as.character(l), perl = TRUE)
}
103+
# Signed square root: square root of the magnitude of x, carrying the sign of x.
S_sqrt <- function(x) {
  magnitude_root <- sqrt(abs(x))
  sign(x) * magnitude_root
}
104+
# Signed square: x squared, carrying the sign of x (undoes S_sqrt).
IS_sqrt <- function(x) {
  squared <- x^2
  squared * sign(x)
}
105+
# Build the `scales` transformation named "S_sqrt", using S_sqrt as the forward
# transform and IS_sqrt as its inverse.
S_sqrt_trans <- function() {
  scales::trans_new(name = "S_sqrt", transform = S_sqrt, inverse = IS_sqrt)
}
106+
107+
108+
plot_immune_proportion_dataset =
109+
data_for_immune_proportion_count |>
110+
ggplot(aes(fct_reorder(tissue_harmonised, desc(median_proportion)), proportion)) +
111+
geom_point(aes(size = sum, color=file_id)) +
112+
guides(color="none") +
113+
scale_size(trans = "log10", range = c(0.1, 2.5), limits = c(1000, 10000)) +
114+
scale_color_manual(values = dittoSeq::dittoColors()) +
115+
scale_y_continuous(trans=S_sqrt_trans(), labels = dropLeadingZero) +
116+
theme_multipanel +
117+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
118+
119+
# - scatter plot of abundance vs variability per tissue
120+
121+
122+
# - Confidence class per cell type
123+
# -
124+
125+
# Study annotation
126+
res =
127+
data_for_immune_proportion |>
128+
mutate(is_immune = as.character(is_immune)) |>
129+
filter(tissue_harmonised %in% c("blood", "intestine")) |>
130+
sccomp_glm(
131+
formula_composition = ~ 0 + tissue_harmonised,
132+
formula_variability = ~ 1,
133+
.sample, is_immune,
134+
check_outliers = FALSE,
135+
approximate_posterior_inference = FALSE,
136+
#contrasts = c("typecancer - typehealthy", "typehealthy - typecancer"),
137+
cores = 20,
138+
mcmc_seed = 42, verbose = TRUE
139+
)
140+
141+
142+
res |> plot_summary()
143+
144+
cell_metadata_with_harmonised_annotation |>
145+
146+
147+
148+
mutate(is_immune = cell_type_harmonised!="non_immune") |>
149+
150+
# Stats
151+
count(.sample, tissue_harmonised, is_immune) |>
152+
with_groups(.sample, ~ .x |> mutate(proportion = n/sum(n))) |>
153+
filter(tissue_harmonised=="heart" & proportion > 0.75)
154+

dev/tissue_label_curated.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ stomach,stomach
139139
testis,testis
140140
thymus,thymus
141141
thyroid gland,thyroid gland
142-
anterior part of tongue,tngue
142+
anterior part of tongue,tongue
143143
posterior part of tongue,tongue
144144
tongue,tongue
145145
trachea,trachea

0 commit comments

Comments
 (0)