diff --git a/atacseq/01_quality_assessment/QC.Rmd b/atacseq/01_quality_assessment/QC.Rmd new file mode 100644 index 0000000..4b13337 --- /dev/null +++ b/atacseq/01_quality_assessment/QC.Rmd @@ -0,0 +1,389 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +params: + # Fill this file with the right paths to nfcore output + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc-example.R + project_file: ../../information.R + functions_file: ../libs/load_data_atac.R + factor_of_interest: group +editor_options: + chunk_output_type: console +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) + +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +library(ChIPpeakAnno) +library(UpSetR) +library(httr) +library(jsonlite) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Samples and metadata + +```{r example_data, message=F, warning=F, eval = params$params_file == 'params_qc-example.R'} +bcbio_qc_atacseq_testdata() +``` + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +coldata <- load_coldata(coldata_fn) +# coldata$sample=row.names(coldata) + +metrics <- load_metrics(multiqc_data_dir) + +metrics <- full_join(coldata, metrics) +rownames(metrics) <- metrics$sample +dds <- load_counts(counts_fn) + +coldata_for_dds = metrics[colnames(dds),] +rownames(coldata_for_dds) <- coldata_for_dds$sample +stopifnot(all(colnames(dds) == rownames(coldata_for_dds))) + +peaks <- load_peaks(peaks_dir) %>% left_join(coldata) +inserts <- load_inserts(inserts_dir) %>% left_join(coldata) +``` + +```{r show_metadata} +metrics_lite <- metrics %>% dplyr::select(sample, total_reads, mapped_reads_pct, frip, peak_count) +full_join(coldata, metrics_lite) %>% sanitize_datatable() +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample, + y = total_reads, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads")+ + geom_hline(yintercept=20000000, color = "grey", linewidth=2) + +``` + + +## Mapped Reads + +Note that these numbers may be higher than the total reads due to individual reads mapping to multiple locations. + +```{r plot_mapped_reads} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapped reads") + +``` + +## Fraction of reads in peaks + +This figure shows what percentage of reads are mapping to regions within peaks called by macs3. + +```{r plot_frip} +metrics %>% filter(!is.na(frip)) %>% + ggplot(aes(x = sample, + y = frip, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "FRiP") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Fraction of reads in peaks") + +``` + +## Number of peaks + +Ideally, we will see a similar number of peaks between replicates. + +```{r plot_peak_count} +metrics %>% filter(!is.na(peak_count)) %>% + ggplot(aes(x = sample, + y = peak_count, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Number of Peaks") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Number of Peaks") + +``` + +## Non-Redundant Fraction + +The NRF is the number of uniquely mapping reads divided by the total number of reads. The ENCODE website also sets out standardized thresholds for this as well and those are summarized in the table below. + +```{r nrf table} +NRF <- c("NRF < 0.5", "0.5 < NRF < 0.8", "0.8 < NRF < 0.9", "NRF > 0.9") +NRF_level <- c("Concerning", "Acceptable", "Compliant", "Ideal") + +NRF_df <- data.frame(NRF, NRF_level) + +colnames(NRF_df) <- c("NRF", "NRF Level") +NRF_df %>% sanitize_datatable() + +``` + +```{r plot_nrf} +metrics %>% + ggplot(aes(x = sample, + y = nrf, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Non-Redundant Fraction") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Non-Redundant Fraction")+ + geom_hline(yintercept = 0.9, linetype = "dashed", color="green") + + geom_hline(yintercept = 0.8, linetype = "dashed", color="orange") + + geom_hline(yintercept = 0.5, linetype = "dashed", color="red") + +``` + +## Insert Size + +In the absence of any pre-sequencing size selection, we expect about 3 peaks in this plot -- at ~50 nt reflecting nucleosome-free regions, at ~175 nt reflecting mononucleosome fragments, and at ~350 nt reflecting dinucleosome fragments. + +```{r insert size} +ggplot(inserts, aes(x = insert_size, y = count, color = sample)) + geom_line() + + scale_color_cb_friendly() + +``` + +# Correlation Heatmap + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the peak regions of the +samples. + +```{r clustering fig, fig.width = 10, fig.asp = .62} +vst_cor <- cor(assays(dds)$vst) + +colma=coldata_for_dds %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] %>% select(.data[[params$factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal()(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) + +p <- pheatmap(vst_cor, + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) +p +``` + +# PCA + +We can run PCA to evaluate the variation amongst our samples and whether or not the greatest sources of variation in the data (PC1 and PC2) can be attributed to the factors of interest in this experiment. + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} + +pca1 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T)[["plot"]] +pca2 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() + +``` + +# Peak Signal Concordance {.tabset} + +## Peak enrichment vs. Peak rank + +In this plot, we are looking at each individual replicates to evaluate what number of peaks we would retain if threshholding by peak enrichment. It is also valuable to see how this differs between replicates within a sample group. + +```{r peak enrichment vs rank} +ggplot(peaks, aes(x = peak_rank, y = peak_enrichment, color = sample)) + + geom_line() + + scale_color_cb_friendly() + + xlab("Peak rank") + ylab("Peak enrichment") + +``` + +## Peak signal distribution + +Here, we plot a histogram of peak signal values for each sample. This plot can be used to help determine a minimum value for peak enrichment that can be used for filtering. + +```{r peak signal distribution} +ggplot(peaks, aes(x = peak_enrichment, fill = sample_group)) + + geom_histogram(aes(peak_enrichment)) + + scale_fill_cb_friendly() + + xlab("Peak enrichment") + +``` + +# Peak Overlap {.tabset} + +We examine the amount of overlap between peaks in replicates of the same experimental condition. + +``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6} +for (current_sample_group in unique(peaks$sample_group)){ + + peaks_sample_group <- peaks %>% filter(sample_group == current_sample_group) + + peaks_sample_group_granges <- sapply( + unique(peaks_sample_group$sample), + function(current_sample) { + if (nrow(peaks_sample_group %>% filter(sample == current_sample)) > 0){ + ChIPpeakAnno::toGRanges( + peaks_sample_group %>% filter(sample == current_sample), + format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narrowPeak') + ) + } + } + ) + + if (length(peaks_sample_group_granges) > 1){ + # maxgap defaults to -1 which means that two peaks will be merged if they overlap by at least 1 bp + # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts + if (length(peaks_sample_group_granges) <= 5){ + cat("## ", current_sample_group, "\n") + + overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge') + + n_samples <- length(peaks_sample_group_granges) + + if (n_samples > 3){ + set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% + as.data.frame() %>% + mutate(group_number = row_number()) %>% + pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% + filter(member > 0) %>% + group_by(Counts, group_number) %>% + summarize(group = paste(sample, collapse = '&')) + + set_counts_upset <- set_counts$Counts + names(set_counts_upset) <- set_counts$group + + p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) + print(p) + } else{ + venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks)) + invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], + NameOfPeaks = venn_sample_names))) + } + + cat('\n\n') + } + + } +} + + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/atacseq/01_quality_assessment/params_qc-example.R b/atacseq/01_quality_assessment/params_qc-example.R new file mode 100644 index 0000000..c67b574 --- /dev/null +++ b/atacseq/01_quality_assessment/params_qc-example.R @@ -0,0 +1,12 @@ +# info params + + +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/atacseq/atacseq_test_samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/atacseq/multiqc/narrow_peak/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '.' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak, also includes antibody name +counts_fn = url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/atacseq/bowtie2/merged_library/macs3/narrow_peak/consensus/deseq2/consensus_peaks.mLb.clN.rds') +# This folder is in the nf-core output directory, +inserts_dir <- '.' diff --git a/atacseq/01_quality_assessment/params_qc.R b/atacseq/01_quality_assessment/params_qc.R new file mode 100644 index 0000000..ac9a995 --- /dev/null +++ b/atacseq/01_quality_assessment/params_qc.R @@ -0,0 +1,12 @@ +# info params + + +coldata_fn='/path/to/nf-core/samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/multiqc/narrow_peak/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/merged_library/macs3/narrow_peak/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak, also includes antibody name +counts_fn = '/path/to/nf-core/output/bowtie2/merged_library/macs3/narrow_peak/consensus/deseq2/consensus_peaks.mLb.clN.rds' +# This folder is in the nf-core output directory, +inserts_dir <- '/path/to/nf-core/output/bowtie2/merged_library/picard_metrics/' diff --git a/chipseq/libs/load_data_atac.R b/atacseq/libs/load_data_atac.R old mode 100755 new mode 100644 similarity index 70% rename from chipseq/libs/load_data_atac.R rename to atacseq/libs/load_data_atac.R index 902fec1..51c1bc7 --- a/chipseq/libs/load_data_atac.R +++ b/atacseq/libs/load_data_atac.R @@ -2,32 +2,65 @@ library(tidyverse) library(SummarizedExperiment) library(janitor) +bcbio_qc_atacseq_testdata <- function(){ + # if using example data to render report, download peaks from github + api_urls <- c("https://api.github.com/repos/bcbio/bcbioR-test-data/contents/atacseq/bowtie2/merged_library/macs3/narrow_peak/", + "https://api.github.com/repos/bcbio/bcbioR-test-data/contents/atacseq/bowtie2/merged_library/picard_metrics/") + + for (api_url in api_urls){ + response <- GET(api_url) + + if (status_code(response) == 200) { + content <- httr::content(response, as = "text") + files_info <- fromJSON(content) %>% filter(name != 'consensus') + + # Filter out file paths and construct raw URLs + file_paths <- files_info$path + raw_base_url <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/" + + raw_file_urls <- paste0(raw_base_url, file_paths) + + # Function to download a file from a URL + download_file <- function(url) { + file_name <- basename(url) + download.file(url, destfile = file_name, mode = "wb") + } + + # Download all files using the constructed raw URLs + for (url in raw_file_urls) { + download_file(url) + } + peaks_dir = '.' + } + } +} + load_metrics <- function(multiqc_data_dir){ fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% dplyr::select(sample, total_reads = total_sequences) %>% mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + mutate(new_sample = gsub('_[12]+$', '', new_sample)) %>% group_by(new_sample) %>% summarize(new_total_reads = sum(total_reads)) %>% dplyr::select(sample = new_sample, total_reads = new_total_reads) - samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats.txt')) %>% clean_names() %>% + samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats_1.txt')) %>% clean_names() %>% dplyr::select(sample, mapped_reads = reads_mapped) %>% mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% group_by(new_sample) %>% summarize(new_mapped_reads = sum(mapped_reads)) %>% dplyr::select(sample = new_sample, mapped_reads = new_mapped_reads) - phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% - dplyr::select(sample, nsc, rsc) - frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_frip_score-plot.txt')) %>% dplyr::select(-Sample) %>% + frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_mlib_frip_score-plot.txt')) %>% dplyr::select(-Sample) %>% pivot_longer(everything(), names_to = 'sample', values_to = 'frip') %>% filter(!is.na(frip)) - peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_peak_count-plot.txt')) %>% dplyr::select(-Sample) %>% - pivot_longer(everything(), names_to = 'sample', values_to = 'peak_count') %>% filter(!is.na(peak_count)) + peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_mlib_peak_count-plot.txt')) %>% dplyr::select(-Sample) %>% + pivot_longer(everything(), names_to = 'sample', values_to = 'peak_count') %>% filter(!is.na(peak_count)) %>% + mutate(sample = gsub('.mLb.clN_peaks', '', sample)) nrf <- read_tsv(file.path(multiqc_data_dir, 'mqc_picard_deduplication_1.txt')) %>% clean_names() %>% mutate(nrf = unique_unpaired / (unique_unpaired + duplicate_unpaired)) %>% dplyr::select(sample, nrf) - metrics <- full_join(fastqc, samtools) %>% full_join(phantom) %>% full_join(frip) %>% + metrics <- full_join(fastqc, samtools) %>% full_join(frip) %>% full_join(peak_count) %>% full_join(nrf) %>% mutate(mapped_reads_pct = round(mapped_reads/total_reads*100,1)) @@ -38,7 +71,9 @@ load_metrics <- function(multiqc_data_dir){ load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ coldata <- read.csv(coldata_fn) %>% - dplyr::distinct(sample_name, .keep_all = T) %>% + mutate(group = sample) %>% + mutate(sample = paste0(sample, '_REP', replicate)) %>% + dplyr::distinct(sample, .keep_all = T) %>% dplyr::select(!matches("fastq")) %>% distinct() @@ -59,6 +94,7 @@ load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NU load_counts <- function(counts_fn){ counts <- readRDS(counts_fn) + colnames(counts) <- gsub('.mLb.clN.shifted.sorted.bam', '', colnames(counts)) return(counts) } @@ -69,7 +105,7 @@ load_peaks <- function(peaks_dir){ names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) } else { peaks_fns <- list.files(peaks_dir, pattern = '_peaks.narrowPeak') - names(peaks_fns) <- gsub('_peaks.narrowPeak', '', peaks_fns) + names(peaks_fns) <- gsub('.mLb.clN_peaks.narrowPeak', '', peaks_fns) } peaks_all <- lapply(peaks_fns, function(fn) { peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) @@ -83,6 +119,15 @@ load_peaks <- function(peaks_dir){ return(peaks_all) } +load_inserts <- function(inserts_dir){ + inserts_fns <- list.files(inserts_dir, pattern = '*insert_size_metrics') + names(inserts_fns) <- gsub('.mLb.clN.CollectMultipleMetrics.insert_size_metrics', '', inserts_fns) + + inserts_all <- lapply(inserts_fns, function(fn){ + read_delim(file.path(inserts_dir, fn), skip = 11, col_names = c('insert_size', 'count')) + }) %>% bind_rows(.id = 'sample') +} + make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL){ bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% mutate(sample = sub("\\..*", "",basename(bam)))