Skip to content

Commit 5488117

Browse files
committed
visualize sample annotations including QC metrics
1 parent 80ece52 commit 5488117

File tree

6 files changed

+229
-1
lines changed

6 files changed

+229
-1
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ The workflow performs the following steps that produce the outlined results:
9595
- **QC & Reporting:**
9696
- Employs `RSeQC` tools to generate key quality metrics like strand specificity and read distribution across genomic features (`rseqc/`).
9797
- Aggregates QC metrics from `fastp`, `STAR` and `RSeQC` into a single report using `MultiQC` (`report/multiqc_report.html`) with [AI summaries](https://seqera.io/blog/ai-summaries-multiqc/).
98+
- Generates a hierarchically-clustered QC heatmap with matching metadata annotation, exported both as a PNG and an interactive HTML with metadata as tooltips.
9899

99100
---
100101

@@ -104,6 +105,8 @@ The workflow produces the following directory structure:
104105
└── rnaseq_pipeline/
105106
├── report/
106107
│ └── multiqc_report.html # Aggregated QC report for all samples
108+
│ └── sample_annotation.png # Hierarchically clustered heatamp of QC metrics, annotated with metadata
109+
│ └── sample_annotation.html # Interactive hierarchically clustered heatamp of QC metrics, with metadata as tooltip
107110
├── fastp/ # fastp QC/filtering and adapter trimming outputs per sample
108111
├── rseqc/ # RSeQC output per sample
109112
├── star/ # STAR output per sample
@@ -136,7 +139,7 @@ Explore detailed examples showcasing module usage in comprehensive end-to-end an
136139
- [RNA-seq Analysis Recipe](https://github.com/epigen/MrBiomics/wiki/RNAseq-Analysis-Recipe)
137140

138141
# 🔍 Quality Control
139-
Below are some guidelines for the manual quality control of each sample using the generated `MultiQC` report, but keep in mind that every experiment/dataset is different. Thresholds are general suggestions and may vary based on experiment type, organism, and library prep.
142+
Below are some guidelines for the manual quality control of each sample using the generated `MultiQC` report and visualized (interactive) sample annotation, but keep in mind that every experiment/dataset is different. Thresholds are general suggestions and may vary based on experiment type, organism, and library prep.
140143

141144
- **Read Depth (STAR)**: Count of `(Uniquely) Mapped Reads` >10M is minimum, >20M reads is optimal for differential expression analysis.
142145
- **Alignment Rate (STAR):** `% (Uniquely) Mapped Reads` >70-80%. Low rates might indicate contamination or reference issues.

workflow/Snakefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ rule all:
4444
# QUANTIFICATION
4545
counts = os.path.join(result_path, "counts", "counts.csv"),
4646
sample_annotation = os.path.join(result_path, "counts", "sample_annotation.csv"),
47+
sample_annotation_plot = os.path.join(result_path, "report", "sample_annotation.png"),
4748
# ANNOTATION
4849
gene_annotation = os.path.join(result_path,'counts',"gene_annotation.csv"),
4950
# EXPORT environments and configurations

workflow/envs/ggplot.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
- nodefaults
5+
dependencies:
6+
- r-tidyverse
7+
- r-data.table
8+
- r-patchwork
9+
- r-ggplot2
10+
- r-ggnewscale
11+
- r-plotly
12+
- r-htmlwidgets
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Interactive visualization of hierarchically clustered sample QC metrics and annotation.

workflow/rules/qc.smk

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,30 @@ rule multiqc:
208208
"logs/rules/multiqc.log",
209209
wrapper:
210210
"v5.9.0/bio/multiqc"
211+
212+
213+
# visualize sample annotation (including QC metrics)
214+
rule plot_sample_annotation:
215+
input:
216+
sample_annotation = config["annotation"],
217+
sample_annotation_w_QC = os.path.join(result_path, "counts", "sample_annotation.csv"),
218+
output:
219+
sample_annotation_plot = os.path.join(result_path,"report","sample_annotation.png"),
220+
sample_annotation_html = report(os.path.join(result_path,"report","sample_annotation.html"),
221+
caption="../report/sample_annotation.rst",
222+
category="{}_{}".format(config["project_name"], module_name),
223+
subcategory="QC",
224+
labels={
225+
"name": "Sample annotation",
226+
"type": "HTML",
227+
}),
228+
log:
229+
"logs/rules/plot_sample_annotation.log",
230+
resources:
231+
mem_mb="1000",
232+
conda:
233+
"../envs/ggplot.yaml"
234+
script:
235+
"../scripts/plot_sample_annotation.R"
236+
237+
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
#### libraries ####
2+
library(data.table)
3+
library(tidyverse)
4+
library(patchwork)
5+
library(ggplot2)
6+
library(ggnewscale)
7+
# for interactive plotting
8+
library(plotly)
9+
library(htmlwidgets)
10+
# set plot base size
11+
theme_set(theme_minimal(base_size = 6))
12+
13+
#### configs ####
14+
# input
15+
sample_annotation_path <- snakemake@input[["sample_annotation"]]
16+
sample_annotation_w_QC_path <- snakemake@input[["sample_annotation_w_QC"]]
17+
18+
# output
19+
sample_annotation_plot_path <- snakemake@output[["sample_annotation_plot"]]
20+
sample_annotation_html_path <- snakemake@output[["sample_annotation_html"]]
21+
22+
#### load & prepare data ####
23+
# load data
24+
sample_annotation <- data.frame(fread(file.path(sample_annotation_path), header=TRUE), row.names=1)
25+
anno <- data.frame(fread(file.path(sample_annotation_w_QC_path), header=TRUE), row.names=1)
26+
27+
# determine QC (pipeline provided) columns
28+
qc_cols <- setdiff(names(anno), names(sample_annotation))
29+
30+
# determine metadata (user provided) columns by removing non-numeric columns that are unique for each row (e.g., bam_file)
31+
sample_annotation <- sample_annotation %>% select(where(\(.x){ n <- dplyr::n_distinct(na.omit(.x)); (is.numeric(.x) || n < nrow(sample_annotation)) }))
32+
meta_cols <- names(sample_annotation)
33+
34+
# drop columns with no variation (e.g., read_type)
35+
has_var <- function(x) length(unique(na.omit(x))) > 1
36+
qc_cols <- keep(qc_cols, ~ has_var(anno[[.x]]))
37+
meta_cols <- keep(meta_cols, ~ has_var(anno[[.x]]))
38+
39+
# collapse "duplicates" (ie redundant columns) only among non-numeric
40+
num_cols <- meta_cols[vapply(anno[meta_cols], is.numeric, logical(1))]
41+
cat_cols <- setdiff(meta_cols, num_cols)
42+
sig <- vapply(cat_cols,
43+
\(col) paste(match(anno[[col]], unique(anno[[col]])), collapse = "|"),
44+
character(1))
45+
meta_cols <- c(cat_cols[!duplicated(sig)], num_cols)
46+
47+
#### Z-score & cluster QC data ####
48+
qc_mat <- anno |> select(all_of(qc_cols)) |> scale() |> as.matrix()
49+
50+
row_ord <- hclust(dist(qc_mat))$order
51+
col_ord <- hclust(dist(t(qc_mat)))$order
52+
53+
qc_long <- as_tibble(qc_mat[row_ord, col_ord], rownames = "sample") |>
54+
pivot_longer(-sample, names_to = "metric", values_to = "z")
55+
56+
#### prepare metadata for plotting ####
57+
meta_long <- anno[row_ord, meta_cols] %>%
58+
mutate(sample = rownames(anno)[row_ord]) %>%
59+
mutate(across(-sample, as.character)) %>%
60+
pivot_longer(-sample, names_to = "meta", values_to = "value") %>%
61+
group_by(meta) %>%
62+
mutate(num_val = suppressWarnings(as.numeric(value)),
63+
type = if (all(!is.na(num_val))) "numeric" else "factor",
64+
col = if (type[1] == "numeric") {
65+
scales::col_numeric("plasma",
66+
domain = range(num_val, na.rm = TRUE))(num_val)
67+
} else {
68+
pal <- scales::hue_pal(l = 65)(n_distinct(value))
69+
setNames(pal, sort(unique(value)))[value]
70+
}
71+
) %>%
72+
ungroup() %>%
73+
select(-num_val)
74+
75+
#### embed ALL metadata into a tooltip string of interactive plot ####
76+
meta_txt <- anno[row_ord, meta_cols] %>% mutate(across(everything(), as.character))
77+
meta_txt <- pmap_chr(meta_txt, \(...) {
78+
vals <- c(...)
79+
paste(paste(names(vals), vals, sep = ": "), collapse = "<br>")
80+
})
81+
names(meta_txt) <- rownames(anno)[row_ord]
82+
83+
#### plot heatmaps ####
84+
85+
#### QC heatmap ####
86+
87+
# add (un-scaled) metric values (raw) for tooltip of interactive plot
88+
qc_raw_long <- anno[row_ord, qc_cols] %>%
89+
mutate(sample = rownames(anno)[row_ord]) %>%
90+
pivot_longer(-sample, names_to = "metric", values_to = "raw")
91+
92+
qc_long <- qc_long %>% left_join(qc_raw_long, by = c("sample", "metric"))
93+
94+
# keep the clustered order in the plot and add hover-tooltip
95+
qc_long <- qc_long %>%
96+
mutate(sample = factor(sample, levels = rownames(qc_mat)[row_ord]), # row-order
97+
metric = factor(metric, levels = colnames(qc_mat)[col_ord]), # col-order
98+
hover = paste0("Sample: ", sample,
99+
"<br>Metric: ", metric,
100+
"<br>Value: ", signif(raw, 4),
101+
"<br>", meta_txt[as.character(sample)])
102+
)
103+
104+
# plot
105+
p_qc <- ggplot(qc_long, aes(x = metric,
106+
y = sample,
107+
fill = z,
108+
text = hover)) +
109+
geom_tile() +
110+
scale_x_discrete(limits = colnames(qc_mat)[col_ord]) + # enforce col order
111+
scale_y_discrete(limits = rownames(qc_mat)[row_ord]) + # enforce row order
112+
scale_fill_gradient2(low = "blue", mid = "white", high = "red", name = "z-score",
113+
guide = guide_colourbar(barheight = 2, # thinner
114+
barwidth = 0.15)) +
115+
labs(x = NULL, y = NULL, title = "QC metrics (scaled)") +
116+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
117+
panel.grid = element_blank())
118+
119+
#### metadata heatmap as "annotation" ####
120+
p_meta <- NULL
121+
if(length(meta_cols) > 0){
122+
# order columns (x) exactly like the QC heatmap
123+
meta_long <- meta_long %>% mutate(sample = factor(sample, levels = rownames(qc_mat)[row_ord])) # row-order
124+
meta_levels <- unique(meta_long$meta)
125+
meta_long <- meta_long %>% mutate(meta = factor(meta, levels = meta_levels))
126+
127+
p_meta <- ggplot() +
128+
scale_y_discrete(limits = levels(qc_long$sample)) +
129+
scale_x_discrete(limits = meta_levels) +
130+
labs(x = NULL, y = NULL, title = "Metadata") +
131+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
132+
axis.text.y = element_blank(),
133+
axis.title = element_blank(),
134+
panel.grid = element_blank())
135+
136+
for (v in meta_levels) {
137+
dat <- dplyr::filter(meta_long, meta == v)
138+
139+
p_meta <- p_meta + ggnewscale::new_scale_fill() # reset “fill” for this column
140+
141+
if (dat$type[1] == "numeric") { # continuous legend
142+
p_meta <- p_meta +
143+
geom_tile(data = dat, aes(x = meta, y = sample, fill = as.numeric(value)), colour = "grey60", linewidth = 0.1) +
144+
scale_fill_viridis_c(name = v, option = "plasma",
145+
guide = guide_colourbar(barheight = 2, # thinner
146+
barwidth = 0.15))
147+
} else { # categorical legend
148+
pal <- setNames(dat$col, dat$value)
149+
p_meta <- p_meta +
150+
geom_tile(data = dat, aes(x = meta, y = sample, fill = value), colour = "grey60", linewidth = 0.1) +
151+
scale_fill_manual(values = pal, name = v,
152+
guide = guide_legend(keywidth = 0.25, # thinner
153+
keyheight = 0.4))
154+
}
155+
}
156+
}
157+
158+
#### combine and save plots ####
159+
p_combined <- if (is.null(p_meta)) p_qc else (p_qc | p_meta) + plot_layout(widths = c(length(qc_cols), length(meta_cols)), guides = "collect") & theme(legend.position = "right")
160+
161+
# determine sizes
162+
n_rows <- nrow(qc_mat)
163+
n_cols <- length(qc_cols) + length(meta_cols)
164+
max_row_label <- max(nchar(rownames(anno)))
165+
max_col_label <- max(nchar(c(qc_cols, meta_cols)))
166+
167+
height_in <- n_rows * 0.08 + max_col_label * 0.05 + 1
168+
width_in <- n_cols * 0.10 + max_row_label * 0.05 + 2
169+
170+
# options(repr.plot.width = width_in, repr.plot.height = height_in)
171+
# p_combined
172+
173+
ggsave(sample_annotation_plot_path, plot = p_combined, width = width_in, height = height_in, units = "in", dpi = 300)
174+
175+
#### interactive plot ####
176+
# determine sizes in pixels
177+
width_px <- round((length(qc_cols) * 0.10 + max_row_label * 0.05 + 2) * 96)
178+
height_px <- round(height_in * 96)
179+
180+
p_qc_interactive <- plotly::ggplotly(p_qc, tooltip = "text", width = width_px, height = height_px)
181+
182+
# p_qc_interactive
183+
184+
htmlwidgets::saveWidget(p_qc_interactive, sample_annotation_html_path, selfcontained = TRUE, title = "Sample annotation")

0 commit comments

Comments
 (0)