|
| 1 | +--- |
| 2 | +title: "combattrial_liz" |
| 3 | +author: "nikeisha" |
| 4 | +date: "23/10/2020" |
| 5 | +output: html_document |
| 6 | +editor_options: |
| 7 | + chunk_output_type: console |
| 8 | +--- |
| 9 | + |
| 10 | +```{r} |
| 11 | +pacman::p_load("dplyr", "tidyverse","readxl","stringr","limma", "edgeR", "gplots", "ggplot2","ComplexHeatmap","dendextend", "imputeLCMD","vsn", "circlize", "COMBAT", "sva") |
| 12 | +
|
| 13 | +``` |
| 14 | + |
| 15 | +```{r} |
| 16 | +# read input file |
| 17 | +file_input <- file.choose() |
| 18 | +
|
| 19 | +raw_data <-read_xlsx(file_input) |
| 20 | +dim(raw_data) |
| 21 | +head(raw_data) |
| 22 | +
|
| 23 | +high_confi_no_contam <- raw_data %>% |
| 24 | + #remove low and mid confidence proteins and contaminants |
| 25 | + filter(`Protein FDR Confidence: Combined` == "High", Contaminant =="FALSE", `# Unique Peptides` > 1) %>% |
| 26 | + # select normalized abundance reporter ion intensity |
| 27 | + dplyr::select(Accession, Description, starts_with("Abundanc")) %>% |
| 28 | + #remove empty 3 channels |
| 29 | + dplyr::select(!contains(c("127C", "130N", "133N"))) |
| 30 | + |
| 31 | +
|
| 32 | +dim(high_confi_no_contam) |
| 33 | +
|
| 34 | +#save the accession, but remove from data frame |
| 35 | +accession <- high_confi_no_contam [1:2] |
| 36 | +data_only <- as.data.frame(high_confi_no_contam[3:106]) |
| 37 | +#attach accession as the row names of the data |
| 38 | +row.names(data_only) <- accession$Accession |
| 39 | +head(data_only) |
| 40 | +dim(data_only) |
| 41 | +
|
| 42 | +#plot non-normalized data |
| 43 | +boxplot(log2(data_only), col = rep(c('red', 'green', 'blue', "magenta", "pink","black", "light blue", "light green"), each =13), notch = TRUE) |
| 44 | +
|
| 45 | +plotDensities(log2(data_only), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 13), main = "Raw MQ data", legend = FALSE) |
| 46 | +
|
| 47 | +``` |
| 48 | + |
| 49 | +```{r} |
| 50 | +#bring in sample matrix |
| 51 | +#load sample data |
| 52 | +sampledata <- read_tsv("data/sampledata_fibres.txt", col_names = TRUE) |
| 53 | +
|
| 54 | +sampledata <- as.data.frame(sampledata) |
| 55 | +rownames(sampledata) <- sampledata$newsamplename |
| 56 | +sampledata <- sampledata %>% separate(newsamplename, c("subject", "training", "time", "fibre")) |
| 57 | +``` |
| 58 | + |
| 59 | +```{r} |
| 60 | +#There were issues with the reference channels and so they were removed, and normalisation below was introduced to compensate. |
| 61 | +
|
| 62 | +data_only2 <- data_only %>% |
| 63 | + # select normalized abundance reporter ion intensity |
| 64 | + dplyr::select(starts_with("Abundanc")) %>% |
| 65 | + #remove reference channels |
| 66 | + dplyr::select(!contains(c("126"))) |
| 67 | +
|
| 68 | +boxplot(log2(data_only2), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each =12), notch = TRUE, horizontal = TRUE) |
| 69 | +
|
| 70 | +plotDensities(log2(data_only2), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 13), main = "Raw MQ data") |
| 71 | +
|
| 72 | +#remove repeats and bad samples |
| 73 | +data_only2 <- data_only2[-c(47,48,88,90,61,87)] |
| 74 | +``` |
| 75 | + |
| 76 | +#presence/abs graph |
| 77 | +```{r} |
| 78 | +presabs <- data_only2 |
| 79 | +presabs[presabs > 0] <- 1 |
| 80 | +presabs[is.na(presabs)] <- 0 |
| 81 | +
|
| 82 | +Heatmap(presabs, show_row_names = FALSE, show_row_dend = FALSE) |
| 83 | +``` |
| 84 | + |
| 85 | +```{r} |
| 86 | +#remove large missing data and impute the rest |
| 87 | +data_raw<- data_only2 |
| 88 | +data_raw <- data_raw[-which(rowMeans(is.na(data_raw)) > 0.30),] |
| 89 | +data_raw <- as.matrix(data_raw) |
| 90 | +data_raw_impute <- impute.knn(data_raw) |
| 91 | +data_raw_impute <- data_raw_impute[[1]] |
| 92 | +
|
| 93 | +data_raw <- as.data.frame(data_raw_impute) |
| 94 | +``` |
| 95 | + |
| 96 | +```{r} |
| 97 | +#still do SL norm first |
| 98 | +# check the column totals (per channel sums) |
| 99 | +format(round(colSums(data_raw, na.rm = TRUE), digits = 0), big.mark = ",") |
| 100 | +
|
| 101 | +# sample loading normalization |
| 102 | +exp1_raw <- data_raw[c(1:12)] |
| 103 | +exp2_raw <- data_raw[c(13:24)] |
| 104 | +exp3_raw <- data_raw[c(25:36)] |
| 105 | +exp4_raw <- data_raw[c(37:46)] |
| 106 | +exp5_raw <- data_raw[c(47:58)] |
| 107 | +exp6_raw <- data_raw[c(59:69)] |
| 108 | +exp7_raw <- data_raw[c(70:81)] |
| 109 | +exp8_raw <- data_raw[c(82:90)] |
| 110 | +
|
| 111 | +data_no_na<- cbind(exp1_raw,exp2_raw,exp3_raw,exp4_raw,exp5_raw,exp6_raw,exp7_raw, exp8_raw) |
| 112 | +data_no_na<- na.omit(data_no_na) |
| 113 | +dim(data_no_na) |
| 114 | +exp1_raw <- data_raw[c(1:12)] |
| 115 | +exp2_raw <- data_raw[c(13:24)] |
| 116 | +exp3_raw <- data_raw[c(25:36)] |
| 117 | +exp4_raw <- data_raw[c(37:46)] |
| 118 | +exp5_raw <- data_raw[c(47:58)] |
| 119 | +exp6_raw <- data_raw[c(59:69)] |
| 120 | +exp7_raw <- data_raw[c(70:81)] |
| 121 | +exp8_raw <- data_raw[c(82:90)] |
| 122 | +
|
| 123 | +exp1_raw <- exp1_for_sl |
| 124 | +exp2_raw <- exp2_for_sl |
| 125 | +exp3_raw <- exp3_for_sl |
| 126 | +exp4_raw <- exp4_for_sl |
| 127 | +exp5_raw <- exp5_for_sl |
| 128 | +exp6_raw <- exp6_for_sl |
| 129 | +exp7_raw <- exp7_for_sl |
| 130 | +exp8_raw <- exp8_for_sl |
| 131 | +
|
| 132 | +# first basic normalization is to adjust each TMT experiment to equal signal per channel |
| 133 | +# figure out the global scaling value |
| 134 | +target <- mean(c(colSums(exp1_raw,na.rm = T), colSums(exp2_raw,na.rm = T), colSums(exp3_raw,na.rm = T), colSums(exp4_raw,na.rm = T), |
| 135 | + colSums(exp5_raw,na.rm = T),colSums(exp6_raw,na.rm = T),colSums(exp7_raw,na.rm = T),colSums(exp8_raw,na.rm = T))) |
| 136 | +# do the sample loading normalization before the IRS normalization |
| 137 | +# there is a different correction factor for each column |
| 138 | +norm_facs <- target / colSums(exp1_raw, na.rm = T) |
| 139 | +exp1_sl <- sweep(exp1_raw, 2, norm_facs, FUN = "*") |
| 140 | +norm_facs <- target / colSums(exp2_raw, na.rm = T) |
| 141 | +exp2_sl <- sweep(exp2_raw, 2, norm_facs, FUN = "*") |
| 142 | +norm_facs <- target / colSums(exp3_raw, na.rm = T) |
| 143 | +exp3_sl <- sweep(exp3_raw, 2, norm_facs, FUN = "*") |
| 144 | +norm_facs <- target / colSums(exp4_raw, na.rm = T) |
| 145 | +exp4_sl <- sweep(exp4_raw, 2, norm_facs, FUN = "*") |
| 146 | +norm_facs <- target / colSums(exp5_raw, na.rm = T) |
| 147 | +exp5_sl <- sweep(exp5_raw, 2, norm_facs, FUN = "*") |
| 148 | +norm_facs <- target / colSums(exp6_raw, na.rm = T) |
| 149 | +exp6_sl <- sweep(exp6_raw, 2, norm_facs, FUN = "*") |
| 150 | +norm_facs <- target / colSums(exp7_raw, na.rm = T) |
| 151 | +exp7_sl <- sweep(exp7_raw, 2, norm_facs, FUN = "*") |
| 152 | +norm_facs <- target / colSums(exp8_raw, na.rm = T) |
| 153 | +exp8_sl <- sweep(exp8_raw, 2, norm_facs, FUN = "*") |
| 154 | +
|
| 155 | +# make a pre-IRS data frame after sample loading normalizations |
| 156 | +data_sl <- cbind(exp1_sl, exp2_sl, exp3_sl, exp4_sl, exp5_sl, exp6_sl, exp7_sl, exp8_sl) |
| 157 | +
|
| 158 | +boxplot(log2(data_sl), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 12), notch = TRUE) |
| 159 | +plotDensities(log2(data_sl), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 12), main = "Raw MQ data") |
| 160 | +``` |
| 161 | + |
| 162 | + |
| 163 | +```{r} |
| 164 | +#combat step |
| 165 | +sampledata$training <- factor(sampledata$training) |
| 166 | +mod <- model.matrix(~ training, data = sampledata) |
| 167 | +batch <- sampledata$batch |
| 168 | +
|
| 169 | +# run ComBat as alternative to IRS |
| 170 | +# NOTE: SL norm is probably better to do before applying the batch corection |
| 171 | +data_combat <- ComBat_seq(counts = as.matrix(data_sl), batch = batch) |
| 172 | +data_combat <- as.data.frame(data_combat) |
| 173 | +par(mfrow = c(1, 1)) # any plotting in the ComBat call leaves plots as 2x2 |
| 174 | +
|
| 175 | +# ComBat introduces some negative corrected counts that we need to fix |
| 176 | +#data_combat <- data_combat[apply(data_combat, 1, function(x) all(x >= 0)), ] |
| 177 | +
|
| 178 | +boxplot(log2(data_combat), notch = TRUE, col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 12), |
| 179 | + main = "ComBat batch correction of SL data\nExp1 (red), Exp2 (green), Exp3 (blue)", |
| 180 | + xlab = 'TMT Sample', ylab = 'log2 of Intensity') |
| 181 | +
|
| 182 | +plotDensities(log2(data_combat), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), 12), main = "ComBat data") |
| 183 | +``` |
| 184 | + |
| 185 | + |
| 186 | +```{r} |
| 187 | +#apply TMM afterwards |
| 188 | +format(round(colSums(data_combat), digits = 0), big.mark = ",") |
| 189 | +
|
| 190 | +# apply TMM normalization to the ComBat-corrected data |
| 191 | +combat_tmm <- calcNormFactors(data_combat) |
| 192 | +data_combat_tmm <- sweep(data_combat, 2, combat_tmm, FUN = "/") |
| 193 | +
|
| 194 | +# look at the box plots |
| 195 | +boxplot(log2(data_combat_tmm), notch = TRUE, col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), each = 12), |
| 196 | + main = "ComBat batch corrected with TMM\nExp1 (red), Exp2 (green), Exp3 (blue)", |
| 197 | + xlab = 'TMT Sample', ylab = 'log2 of Intensity') |
| 198 | +
|
| 199 | +# can also look at density plots (like a distribution histogram) |
| 200 | +plotDensities(log2(data_combat_tmm), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), 12), main = "ComBat/TMM data") |
| 201 | +
|
| 202 | +
|
| 203 | +# ggsave( |
| 204 | +# "results/densityplot.pdf", |
| 205 | +# plotDensities(log2(data_combat_tmm), col = rep(c("red", "green", "blue", "pink","orange","yellow","purple","grey"), 12), main = "ComBat/TMM data", legend = FALSE), |
| 206 | +# width = 10, |
| 207 | +# height = 9, |
| 208 | +# dpi = 300, |
| 209 | +# useDingbats=FALSE |
| 210 | +# ) |
| 211 | +``` |
| 212 | + |
| 213 | + |
0 commit comments