|
| 1 | +## local functions |
| 2 | +median.quartile <- function(x){ |
| 3 | + out <- quantile(x, probs = c(0.25,0.5,0.75)) |
| 4 | + names(out) <- c("ymin","y","ymax") |
| 5 | + return(out) |
| 6 | +} |
| 7 | + |
| 8 | + |
| 9 | +dataDir <- "/home/sebastian/Data/Collaborations/FSU/PromoterSeqCap/sortingTSVS for Tremethick paper /Figure 1/" |
| 10 | +l1 <- lapply(list.files(path = dataDir, pattern=".tsv"), function(x){ |
| 11 | + dt <- data.table::fread(paste(dataDir, x, sep = "/")) |
| 12 | + return(dt) |
| 13 | +}) |
| 14 | +n1 <- gsub(x = list.files(path = dataDir, pattern=".tsv"), pattern = ".tsv", replacement = "") |
| 15 | +names(l1) <- unlist(lapply(strsplit(n1, "_"), function(x) paste(x[2:3], collapse = "_"))) |
| 16 | + |
| 17 | +l1 <- lapply(l1, function(x) { |
| 18 | + ucscID <- unlist(lapply(strsplit(x$gene, ";"), function(x) x[4])) |
| 19 | + extGene <- unlist(lapply(strsplit(x$gene, ";"), function(x) x[6])) |
| 20 | + x$ucscID <- ucscID |
| 21 | + x$extGene <- extGene |
| 22 | + x$chr <- unlist(lapply(strsplit(x$gene, ";"), function(x) x[1])) |
| 23 | + x$start <- unlist(lapply(strsplit(x$gene, ";"), function(x) x[2])) |
| 24 | + x$end <- unlist(lapply(strsplit(x$gene, ";"), function(x) x[3])) |
| 25 | + return(x) |
| 26 | +}) |
| 27 | +l1 |
| 28 | +gr.promoters <- GenomicRanges::makeGRangesFromDataFrame(l1$Total_10A, ignore.strand = T, keep.extra.columns = T) |
| 29 | + |
| 30 | +rm(dt1) |
| 31 | +dt1 <- merge(subset(l1$Total_10A, select = c("extGene", "group1")), subset(l1$Total_TGFb, select = c("extGene", "group1")), by.x = "extGene", by.y = "extGene") |
| 32 | +colnames(dt1)[2:3] <- c("MCF10A_WT", "MCF10A_TGFb") |
| 33 | +dt1 <- merge(dt1, subset(l1$Total_shZ, select = c("extGene", "group1")), by.x = "extGene", by.y = "extGene") |
| 34 | +dt1 <- merge(dt1, subset(l1$Total_CA1a, select = c("extGene", "group1")), by.x = "extGene", by.y = "extGene") |
| 35 | +colnames(dt1)[4:5] <- c("MCF10A_shZ", "MCF10CA1A") |
| 36 | +colnames(dt1) |
| 37 | + |
| 38 | + |
| 39 | +# For input (figure 1) |
| 40 | +# We have decided on the following classes |
| 41 | +# 1. Strong +2 (1 for 10A, 1 for TGFb, 1 for shH2AZ, 1 for CAC1) |
| 42 | +# 2. Strong +1 (2 for 10A, 2 for TGFb, 2 for shH2AZ, 2 for CAC1) |
| 43 | +# 3. weak upstream (3 for 10A, 5 for TGFb, 5 for shH2AZ, 5 for CAC1) |
| 44 | +# 4. weak array across promoter (4 for 10A, 7 for TGFb, 7 for shH2AZ, 7 for CAC1) |
| 45 | +# 5. Strong -2 (5 for 10A, 6 for TGFb, 6 for shH2AZ, 6 for CAC1) |
| 46 | +# 6. Strong -1 (6 for 10A, 4 for TGFb, 4 for shH2AZ, 4 for CAC1) |
| 47 | +# 7. Overall depleted (7 for 10A, 3 for TGFb, 3 for shH2AZ, 3 for CAC1) |
| 48 | + |
| 49 | +mcf10awtCategories <- data.table::data.table(cat = c("Strong +2", "Strong +1", "weak upstream", "weak array across promoter", "Strong -2", "Strong -1", "Overall depleted"), |
| 50 | + group = c(1,2,3,4,5,6,7)) |
| 51 | + |
| 52 | +mcf10atgfbCategories <- data.table::data.table(cat = c("Strong +2", "Strong +1", "weak upstream", "weak array across promoter", "Strong -2", "Strong -1", "Overall depleted"), |
| 53 | + group = c(1,2,5,7,6,4,3)) |
| 54 | + |
| 55 | +mcf10ashh2azCategories <- data.table::data.table(cat = c("Strong +2", "Strong +1", "weak upstream", "weak array across promoter", "Strong -2", "Strong -1", "Overall depleted"), |
| 56 | + group = c(1,2,5,7,6,4,3)) |
| 57 | + |
| 58 | +mcf10cac1Categories <- data.table::data.table(cat = c("Strong +2", "Strong +1", "weak upstream", "weak array across promoter", "Strong -2", "Strong -1", "Overall depleted"), |
| 59 | + group = c(1,2,5,7,6,4,3)) |
| 60 | + |
| 61 | +dt1$MCF10A_WT.category <- as.factor(mcf10awtCategories$cat[match(dt1$MCF10A_WT, mcf10awtCategories$group)]) |
| 62 | +dt1$MCF10A_TGFb.category <- as.factor(mcf10atgfbCategories$cat[match(dt1$MCF10A_TGFb, mcf10atgfbCategories$group)]) |
| 63 | +dt1$MCF10A_shZ.category <- as.factor(mcf10ashh2azCategories$cat[match(dt1$MCF10A_shZ, mcf10ashh2azCategories$group)]) |
| 64 | +dt1$MCF10CA1A.category <- as.factor(mcf10cac1Categories$cat[match(dt1$MCF10CA1A, mcf10cac1Categories$group)]) |
| 65 | + |
| 66 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_TGFb.category"), #, "MCF10A_shZ.category", "MCF10CA1A.category"), |
| 67 | + data = dt1, |
| 68 | + label.size = 4, |
| 69 | + text.angle = 0, label.colour= "black", same.level = F) |
| 70 | + |
| 71 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_shZ.category"), #, , "MCF10CA1A.category"), |
| 72 | + data = dt1, |
| 73 | + label.size = 4, |
| 74 | + text.angle = 0, label.colour= "black", same.level = F) |
| 75 | + |
| 76 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10CA1A.category"), #, , ), |
| 77 | + data = dt1, |
| 78 | + label.size = 4, |
| 79 | + text.angle = 0, label.colour= "black", same.level = F) |
| 80 | + |
| 81 | +factor(dt1$MCF10A_TGFb.category) |
| 82 | + |
| 83 | +table(dt1$MCF10A_WT.category) |
| 84 | +table(dt1$MCF10A_TGFb.category) |
| 85 | +table("WT" = dt1$MCF10A_WT.category, "TGFb" = dt1$MCF10A_TGFb.category) |
| 86 | +table("WT" = dt1$MCF10A_WT.category, "shZ" = dt1$MCF10A_shZ.category) |
| 87 | +table("WT" = dt1$MCF10A_WT.category, "WT" = dt1$MCF10A_WT.category) |
| 88 | +table("TGFb" = dt1$MCF10A_TGFb.category, "TGFb" = dt1$MCF10A_TGFb.category) |
| 89 | + |
| 90 | +levels(dt1$MCF10A_shZ.category) |
| 91 | + |
| 92 | +## gather table |
| 93 | +Fig2Categories <- dt1[, c("extGene", "MCF10A_WT.category", "MCF10A_TGFb.category", "MCF10A_shZ.category", "MCF10CA1A.category")] |
| 94 | +Fig2Categories <- data.table(gather(Fig2Categories, key = "condition", value = "category", c("MCF10A_WT.category", "MCF10A_TGFb.category", "MCF10A_shZ.category", "MCF10CA1A.category"))) |
| 95 | +Fig2Categories$condition <- unlist(lapply(strsplit(Fig2Categories$condition, "\\."), function(x) x[1])) |
| 96 | +Fig2Categories$category <- factor(Fig2Categories$category, levels = levels(as.factor(Fig2Categories$category))[c(5,4,7,6,3,2,1)]) |
| 97 | + |
| 98 | +deGenesTGFbIDs <- deGenesTGFb[deGenesTGFb$qval < 0.1]$target_id |
| 99 | +deGenesTGFbUpIDs <- deGenesTGFb[deGenesTGFb$qval < 0.1 & deGenesTGFb$b > 0]$target_id |
| 100 | +deGenesTGFbDownIDs <- deGenesTGFb[deGenesTGFb$qval < 0.1 & deGenesTGFb$b < 0]$target_id |
| 101 | + |
| 102 | +left <- "MCF10A_WT" |
| 103 | +right <- "MCF10A_TGFb" |
| 104 | +left.categories <- llist |
| 105 | +left.categories <- c("Overall depleted", "weak array across promoter") |
| 106 | +df1 <- data.table(Fig2Categories[condition == left]$extGene, Fig2Categories[condition == left]$category, Fig2Categories[condition == right]$category) |
| 107 | +colnames(df1) <- c("extGene", left, right) |
| 108 | +#df1 <- df1[extGene %in% deGenesTGFbUpIDs] |
| 109 | +#df1 <- df1[extGene %in% deGenesTGFbDownIDs] |
| 110 | +df1 <- df1[extGene %in% sigEMTCells$cellLine_sig] |
| 111 | + |
| 112 | +df1 <- df2["epi"] |
| 113 | +data <- data.frame(df1[ df1[[left]] %in% left.categories , ]) |
| 114 | +ggparallel::ggparallel(vars = list(left, right), data = data, same.level = T, label.size = 3, |
| 115 | + text.angle = 0, label.colour= "black") + ggtitle("Epithelial") |
| 116 | + |
| 117 | +geneIDs <- data$extGene |
| 118 | +s1 <- which(kt1D6$target_id %in% geneIDs) |
| 119 | +s2 <- which(deGenesTGFb$target_id %in% sigEMTCells$cellLine_sig) |
| 120 | +s2 |
| 121 | +plot(deGenesTGFb[s2]$b, -log10(deGenesTGFb[s2]$qval)) |
| 122 | + |
| 123 | +bp1 <- ggplot(data = kt1D6[s1], |
| 124 | + aes(x= condition, y = log2(tpm + 1), colour = condition)) + geom_boxplot() |
| 125 | +bp1 |
| 126 | +bp2 <- ggplot(data =m3[m3[geneIDs][condition == "MCF10A_wt"][category %in% "Strong +1"]$target_id], |
| 127 | + aes(x= condition, y = log2(tpm + 1), colour = condition)) + geom_boxplot() + facet_wrap(~category) |
| 128 | + |
| 129 | +bp2 |
| 130 | +vp1 <- ggplot(data = kt1D6[s1], |
| 131 | + aes(x= condition, y = log2(tpm + 1)), fill = condition) + geom_violin() |
| 132 | +vp1 |
| 133 | +h1 <- ggplot(data = kt1D6[s1], |
| 134 | + aes(log2(tpm + 1), ..density.., fill = condition)) + geom_histogram(binwidth = 0.11, alpha = 0.5) |
| 135 | +h1 |
| 136 | + |
| 137 | +### |
| 138 | +data <- as.data.frame(df1) |
| 139 | +vars <- unlist(list(left, right)) |
| 140 | +llist <- NULL |
| 141 | +same.level = T |
| 142 | +for (i in unique(vars)) { |
| 143 | + if (!same.level) levels(data[,i]) <- paste(i, levels(data[,i]), sep=":") |
| 144 | + llist <- unique(c(llist, levels(data[,i]))) |
| 145 | +} |
| 146 | +(llist) |
| 147 | +## integrate expression data |
| 148 | +kt1 <- results$full$kallisto_table_genes |
| 149 | +#kt1 <- results$full$kallisto_table_genes |
| 150 | +setkey(kt1, "target_id") |
| 151 | + |
| 152 | +kt1D6 <- kt1[grep("D6", kt1$sample)] |
| 153 | +kt1D6.wide <- tidyr::spread(kt1D6[, c("target_id", "sample", "tpm")], sample, tpm) |
| 154 | +kt1D6 <- kt1D6[, list("tpm" = mean(tpm)), by = list(condition, target_id)] |
| 155 | +s1 <- dt1[(dt1$MCF10A_WT.category == "Strong +1" & dt1$MCF10A_TGFb.category == "Strong -1")]$extGene |
| 156 | +s2 <- which(kt1D6.wide$target_id %in% s1) |
| 157 | +s1 <- which(kt1D6$target_id %in% s1) |
| 158 | +length(s1) |
| 159 | + |
| 160 | +gplots::heatmap.2(scale(log2(as.matrix(subset(kt1D6.wide[s2], select = c("MCF10AD6_1", |
| 161 | + "MCF10AD6_2", |
| 162 | + #"MCF10AD6_3", |
| 163 | + "MCF10ATGFbD6_1", |
| 164 | + "MCF10ATGFbD6_2", |
| 165 | + "MCF10ATGFbD6_3"))) + 1)), trace = "none") |
| 166 | +m1 <- scale(log2(as.matrix(subset(kt1D6.wide[s2], select = c("MCF10AD6_1", |
| 167 | + "MCF10AD6_2", |
| 168 | + #"MCF10AD6_3", |
| 169 | + "MCF10ATGFbD6_1", |
| 170 | + "MCF10ATGFbD6_2", |
| 171 | + "MCF10ATGFbD6_3"))) + 1)) |
| 172 | +m2 <- log2(as.matrix(subset(kt1D6.wide[s2], select = c("MCF10AD6_1", |
| 173 | + "MCF10AD6_2", |
| 174 | + #"MCF10AD6_3", |
| 175 | + "MCF10ATGFbD6_1", |
| 176 | + "MCF10ATGFbD6_2", |
| 177 | + "MCF10ATGFbD6_3"))) + 1) |
| 178 | + |
| 179 | +rownames(m1) <- kt1D6.wide[s2]$target_id |
| 180 | +rownames(m2) <- kt1D6.wide[s2]$target_id |
| 181 | + |
| 182 | + |
| 183 | +head(m1) |
| 184 | +heatmaply::heatmaply(m1,#[sd1 > 0.3,], |
| 185 | + trace = "none", |
| 186 | + scale = "none", |
| 187 | + protocol = "ggplotly") |
| 188 | +library(vegan) |
| 189 | +km1 <- vegan::cascadeKM(m2, 2, 11, criterion = "calinski") |
| 190 | +plot(km1) |
| 191 | +data.table(km1$partition) |
| 192 | +g1 <- km1$partition[,5] |
| 193 | +names(g1) <- rownames(m2) |
| 194 | +g1 |
| 195 | +setkey(kt1D6, "target_id") |
| 196 | +w1 <-which(kt1D6$target_id %in% names(g1)) |
| 197 | +kt1D6.g1 <- kt1D6[w1] |
| 198 | +kmeansClusters <- data.table("extGene" = names(g1), "group" = g1) |
| 199 | +kt1D6.g1 <- merge(kt1D6.g1, kmeansClusters, by.x = "target_id", by.y = "extGene", all.y = T) |
| 200 | + |
| 201 | +bp2 <- ggplot(data = kt1D6.g1, |
| 202 | + aes(x= condition, y = log2(tpm + 1), colour = condition)) + |
| 203 | + geom_boxplot() + |
| 204 | + facet_wrap(~group) |
| 205 | +bp2 |
| 206 | + |
| 207 | +vp2 <- ggplot(data = kt1D6.g1, |
| 208 | + aes(x= condition, y = log2(tpm + 1), colour = condition)) + |
| 209 | + geom_violin() + |
| 210 | + facet_wrap(~group) + |
| 211 | + stat_summary(fun.y=median.quartile,geom='point') |
| 212 | +vp2 |
| 213 | + |
| 214 | +h2 <- ggplot(data = kt1D6.g1, |
| 215 | + aes(log2(tpm + 1), fill = condition)) + geom_histogram(bins = 30, alpha = 0.5,) |
| 216 | +h2 |
| 217 | + |
| 218 | +f2 <- ggplot(data = kt1D6.g1, |
| 219 | + aes(log2(tpm + 1), colour = condition)) + geom_freqpoly(bins = 30, alpha = 0.5) |
| 220 | +f2 |
| 221 | + |
| 222 | +median(log2(kt1D6[s1][condition == "MCF10A_wt"]$tpm + 1)) |
| 223 | +median(log2(kt1D6[s1][condition == "MCF10A_TGFb"]$tpm + 1)) |
| 224 | + |
| 225 | +median(log2(kt1D6[condition == "MCF10A_wt"]$tpm + 1)) |
| 226 | +median(log2(kt1D6[condition == "MCF10A_TGFb"]$tpm + 1)) |
| 227 | + |
| 228 | + |
| 229 | +# EMT genes-specific parallel plots --------------------------------------- |
| 230 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_shZ.category"), #, , "MCF10CA1A.category"), |
| 231 | + data = dt1[unique(ucscTranscriptsSigEMTCells[epi_mes == "epi"]$target_id)], |
| 232 | + label.size = 4, |
| 233 | + text.angle = 0, label.colour= "black", same.level = F) + ggtitle("shZ - EPI") |
| 234 | +"ParallelPlot_WT-shZ_EPI_Genes.pdf" |
| 235 | + |
| 236 | + |
| 237 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_shZ.category"), #, , "MCF10CA1A.category"), |
| 238 | + data = dt1[unique(ucscTranscriptsSigEMTCells[epi_mes == "mes"]$target_id)], |
| 239 | + label.size = 4, |
| 240 | + text.angle = 0, label.colour= "black", same.level = F) + ggtitle("shZ - MES") |
| 241 | +"ParallelPlot_WT-shZ_MES_Genes.pdf" |
| 242 | + |
| 243 | +#----- TGFb |
| 244 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_TGFb.category"), #, , "MCF10CA1A.category"), |
| 245 | + data = dt1[unique(ucscTranscriptsSigEMTCells$target_id)], |
| 246 | + label.size = 4, |
| 247 | + text.angle = 0, label.colour= "black", same.level = F) |
| 248 | + |
| 249 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_TGFb.category"), #, , "MCF10CA1A.category"), |
| 250 | + data = dt1[unique(ucscTranscriptsSigEMTCells[epi_mes == "epi"]$target_id)], |
| 251 | + label.size = 4, |
| 252 | + text.angle = 0, label.colour= "black", same.level = F) + ggtitle("TGFb - EPI") |
| 253 | +"ParallelPlot_WT-TGFb_EPI_Genes.pdf" |
| 254 | + |
| 255 | +ggparallel::ggparallel(list("MCF10A_WT.category", "MCF10A_TGFb.category"), #, , "MCF10CA1A.category"), |
| 256 | + data = dt1[unique(ucscTranscriptsSigEMTCells[epi_mes == "mes"]$target_id)], |
| 257 | + label.size = 4, |
| 258 | + text.angle = 0, label.colour= "black", same.level = F) + ggtitle("TGFb - MES") |
| 259 | +"ParallelPlot_WT-TGFb_MES_Genes.pdf" |
| 260 | + |
0 commit comments