From 911e890d355d94fb4ca14c08c8d5ca1722feced4 Mon Sep 17 00:00:00 2001 From: Stefano Beretta Date: Mon, 31 Jul 2023 13:00:48 +0000 Subject: [PATCH] Upload New File --- scripts/WEScolonies_plot_results_LowFreq.R | 418 +++++++++++++++++++++ 1 file changed, 418 insertions(+) create mode 100644 scripts/WEScolonies_plot_results_LowFreq.R diff --git a/scripts/WEScolonies_plot_results_LowFreq.R b/scripts/WEScolonies_plot_results_LowFreq.R new file mode 100644 index 0000000..9b3e949 --- /dev/null +++ b/scripts/WEScolonies_plot_results_LowFreq.R @@ -0,0 +1,418 @@ +library(dplyr) +library(ggplot2) +library(RColorBrewer) +library(CMplot) +library(gtable) +library(circlize) +library(grid) +library(gridExtra) +library(stringr) +library(e1071) +library(openxlsx) +library(VennDiagram) +library(scales) +library(ggVennDiagram) +library(reshape2) + +# A helper function to define a region on the layout +define_region <- function(row, col){ + viewport(layout.pos.row = row, layout.pos.col = col) +} + +# Function to define colors for SNVs +snv_colors2 <- function(del = FALSE, alpha = 1) { + nuc <- c("A", "C", "G", "T") + # Point deletions + dels <- alpha(brewer.pal(5, "Greys")[2:5], 1) + names(dels) <- paste0(nuc, "*") + # Transitions + ts <- alpha(brewer.pal(5, "Blues")[2:5], 1) + names(ts) <- c("CT", "GA", "AG", "TC") + # Tansversions + tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1) + names(tv1) <- c("AC", "TG", "AT", "TA") + tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1) + names(tv2) <- c("CA", "GT", "CG", "GC") + var_cols <- c(ts, tv1, tv2) + if (del) { + var_cols <- c(var_cols, dels) + } + return(var_cols) +} + +# Variant Plots +plot_variants <- function(full.t, out_dir, plot_prefix, fill_by) { + dir.create(path = out_dir, showWarnings = F) + # Per-Sample + tt <- full.t %>% + filter(grepl("chr", CHROM)) %>% + filter(!CHROM %in% c("chrY", "chrM")) %>% + group_by(Sample, !!!syms(fill_by)) %>% + summarise(Count = n()) + write.xlsx(x = list("NumVariants" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantCounts.xlsx"), sep = "/")) + + p <- ggplot(tt, aes(x = Sample, y = Count, fill = get(fill_by))) + + theme_bw(base_size = 12) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), + legend.position = "none") + + geom_bar(stat = "identity") + + scale_y_continuous(labels = scales::comma, n.breaks = 8) + + xlab("") + + facet_grid(.~get(fill_by), scales = "free_x", space = "free") + ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantCounts.pdf"), sep = "/"), plot = p, width = 6, height = 6) + + tt <- full.t %>% + filter(grepl("chr", CHROM)) %>% + filter(!CHROM %in% c("chrY", "chrM")) %>% + group_by(Sample, !!!syms(fill_by), REF, ALT) %>% + summarise(Count = n()) + tt <- mutate(tt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV", + nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "DEL", + nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL", + nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS", + TRUE ~ "Other")) + write.xlsx(x = list("VariantClass" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantClassification.xlsx"), sep = "/")) + tt_type <- tt %>% + group_by(Sample, !!!syms(fill_by), TYPE) %>% + summarise(SumCount = sum(Count)) %>% + arrange(desc(SumCount)) %>% + group_by(Sample, !!!syms(fill_by)) %>% + mutate(CountPerc = SumCount/sum(SumCount)) + write.xlsx(x = list("VariantType" = tt_type), file = paste(out_dir, paste0(plot_prefix, "_VariantType.xlsx"), sep = "/")) + + p <- ggplot(tt_type, aes(x = Sample, y = SumCount, fill = TYPE)) + + theme_bw(base_size = 12) + + theme(axis.text.x = element_text(angle = 30, hjust = 1), + legend.position = "top") + + guides(fill = guide_legend(ncol = 6)) + + xlab("") + + ylab("Count") + + scale_fill_brewer(palette = "Set1", name = "") + + geom_bar(stat = "identity", position = "stack") + + scale_y_continuous(labels = scales::comma, n.breaks = 8) + + facet_grid(.~get(fill_by), scales = "free_x", space = "free") + ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypes.pdf"), sep = "/"), plot = p, width = 6, height = 6) + + p <- ggplot(tt_type, aes(x = Sample, y = CountPerc, fill = TYPE)) + + theme_bw(base_size = 12) + + theme(axis.text.x = element_text(angle = 30, hjust = 1), + legend.position = "top") + + guides(fill = guide_legend(ncol = 6)) + + xlab("") + + ylab("Count") + + scale_fill_brewer(palette = "Set1", name = "") + + geom_bar(stat = "identity", position = "stack") + + scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) + + facet_grid(.~get(fill_by), scales = "free_x", space = "free") + ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypesPerc.pdf"), sep = "/"), plot = p, width = 6, height = 6) + + tt <- full.t %>% + filter(nchar(REF) == 1 & nchar(ALT) == 1 & grepl("chr", CHROM)) %>% + filter(!CHROM %in% c("chrY", "chrM")) %>% + group_by(Sample, !!!syms(fill_by), REF, ALT) %>% + summarise(Count = n()) %>% + group_by(Sample, !!!syms(fill_by)) %>% + mutate(CountPerc = Count/sum(Count)) + tt$Variant <- paste0(tt$REF, tt$ALT) + tt$Variant <- factor(tt$Variant, levels = sort(unique(tt$Variant))) + write.xlsx(x = list("SNV" = tt), file = paste(out_dir, paste0(plot_prefix, "_SNVcounts.xlsx"), sep = "/")) + + tt$Variant <- factor(tt$Variant, levels = rev(names(snv_colors2()))) + p2 <- ggplot(tt, aes(x = Sample, y = Count, fill = Variant)) + + theme_bw(base_size = 12) + + theme(axis.text.x = element_text(angle = 30, hjust = 1)) + + geom_bar(stat = "identity") + + scale_fill_manual(values = snv_colors2(del = F)) + + xlab("") + + scale_y_continuous(labels = scales::comma, n.breaks = 8) + + facet_grid(.~get(fill_by), scales = "free_x", space = "free") + ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcounts.pdf"), sep = "/"), plot = p2, width = 7, height = 7) + + p2p <- ggplot(tt, aes(x = Sample, y = CountPerc, fill = Variant)) + + theme_bw(base_size = 12) + + theme(axis.text.x = element_text(angle = 30, hjust = 1)) + + geom_bar(stat = "identity") + + scale_fill_manual(values = snv_colors2(del = F)) + + scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) + + xlab("") + + facet_grid(.~get(fill_by), scales = "free_x", space = "free") + ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcountsPerc.pdf"), sep = "/"), plot = p2p, width = 7, height = 7) +} + +############### +### General ### +############### +wdir <- "Fiumara_BasePrimeEd2022_WES/WEScolonies" + +out_dir <- paste(wdir, "results", sep = "/") +dir.create(out_dir, showWarnings = F) + +# Samples +samples <- c("S1-BE4CC", "S2-BE4CCw-osgRNA", "S3-ABE8CC", "S4-BE4arca", "S5-mockcolonies", "S6-mock24h") + +full.t <- data.frame() + +# Mutect2 +for (ss in samples) { + print(ss) + t <- read.table(gzfile(paste(wdir, "data", paste0(ss, ".Mutect2.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t") + colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE") + t$Sample <- ss + t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-") + t$Group <- str_split_fixed(ss, "-", 2)[,2] + t$Tool <- "Mutect2" + t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,4]) + t$GT <- gsub("[|]", "/", str_split_fixed(t$SAMPLE, ":", 5)[,1]) + t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1]) + t$VAF <- (t$DP - t$AD) / t$DP + t <- subset(t, DP >= 10) + full.t <- rbind(full.t, t) +} + +# Freebayes +for (ss in samples) { + print(ss) + t <- read.table(gzfile(paste(wdir, "data", paste0(ss, ".Freebayes.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t") + colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE") + t$Sample <- ss + t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-") + t$Group <- str_split_fixed(ss, "-", 2)[,2] + t$Tool <- "Freebayes" + t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 9)[,3]) + t$GT <- gsub("[|]", "/", str_split_fixed(t$SAMPLE, ":", 9)[,1]) + t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 9)[,4], ",", 2)[,1]) + t$VAF <- (t$DP - t$AD) / t$DP + t <- subset(t, DP >= 10) + full.t <- rbind(full.t, t) +} + +# Filter CHR only +chr <- c(paste0("chr", seq(1,22)), "chrX") +full.t <- subset(full.t, CHROM %in% chr) +full.t$CHROM <- factor(full.t$CHROM, levels = chr) +full.t$Sample <- factor(full.t$Sample, levels = unique(full.t$Sample)) + +full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ] + +## Tool overlap on all (nomulti) variants +full.venn <- list() +for (ss in samples) { + print(ss) + full.venn[[ss]] <- list() + full.t.nomulti.ss <- subset(full.t.nomulti, Sample == ss) + for (tt in unique(full.t.nomulti.ss$Tool)) { + full.venn[[ss]][[tt]] <- full.t.nomulti.ss[full.t.nomulti.ss$Tool == tt, "Vars"] + } +} +pdf(paste(out_dir, "Full_NoMulti_venn_Tools.pdf", sep = "/"), width = 16, height = 8) +plot.new() +pushViewport(viewport(layout = grid.layout(2, 3))) +i <- 1 +j <- 1 +for (ss in names(full.venn)) { + venn <- Venn(full.venn[[ss]]) + data <- process_data(venn) + vreg <- venn_region(data) + vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]")) + cc <- brewer.pal(name = "Set2", n = 8) + cc_cat <- cc[1:length(unique(vreg$NumInters))] + names(cc_cat) <- unique(vreg$NumInters) + s.plot <- ggplot() + + # change mapping of color filling + geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) + + scale_fill_manual(values = cc_cat) + + # adjust edge size and color + geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) + + # show set label in bold + geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) + + # add a alternative region name + geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) + + ggtitle(paste0("Sample", ss)) + + theme_void() + print(s.plot, vp = define_region(row = i, col = j)) + j <- j + 1 + if (j > 3) { + j <- 1 + i <- i + 1 + } +} +dev.off() + + +## Tool overlap on NO 0/0 (nomulti) variants +full.t.nomulti.no00 <- subset(full.t.nomulti, GT != "0/0") + +full.venn <- list() +for (ss in samples) { + print(ss) + full.venn[[ss]] <- list() + full.t.nomulti.ss <- subset(full.t.nomulti.no00, Sample == ss) + for (tt in unique(full.t.nomulti.ss$Tool)) { + full.venn[[ss]][[tt]] <- full.t.nomulti.ss[full.t.nomulti.ss$Tool == tt, "Vars"] + } +} +pdf(paste(out_dir, "Full_NoMulti_no00_venn_Tools.pdf", sep = "/"), width = 16, height = 8) +plot.new() +pushViewport(viewport(layout = grid.layout(2, 3))) +i <- 1 +j <- 1 +for (ss in names(full.venn)) { + venn <- Venn(full.venn[[ss]]) + data <- process_data(venn) + vreg <- venn_region(data) + vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]")) + cc <- brewer.pal(name = "Set2", n = 8) + cc_cat <- cc[1:length(unique(vreg$NumInters))] + names(cc_cat) <- unique(vreg$NumInters) + s.plot <- ggplot() + + # change mapping of color filling + geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) + + scale_fill_manual(values = cc_cat) + + # adjust edge size and color + geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) + + # show set label in bold + geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) + + # add a alternative region name + geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) + + ggtitle(paste0("Sample", ss)) + + theme_void() + print(s.plot, vp = define_region(row = i, col = j)) + j <- j + 1 + if (j > 3) { + j <- 1 + i <- i + 1 + } +} +dev.off() + + +# Check overlap among samples +full.venn <- list() +for (vs in unique(full.t.nomulti.no00$Sample)) { + print(vs) + full.venn[[vs]] <- full.t.nomulti.no00[full.t.nomulti.no00$Sample == vs & full.t.nomulti.no00$Tool == "Mutect2", "Vars"] +} +venn <- Venn(full.venn) +data <- process_data(venn) +vreg <- venn_region(data) +vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]")) +cc <- brewer.pal(name = "Set2", n = 8) +cc_cat <- cc[1:length(unique(vreg$NumInters))] +names(cc_cat) <- unique(vreg$NumInters) +s.plot <- ggplot() + + # change mapping of color filling + geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) + + scale_fill_manual(values = cc_cat) + + # adjust edge size and color + geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) + + # show set label in bold + geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) + + # add a alternative region name + geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 3) + + theme_void() +ggsave(filename = paste(out_dir, "Full_NoMulti_no00_vars_venn_Mutect2.pdf", sep = "/"), plot = s.plot, width = 10, height = 10) + + +# Control +vitro_ctrl <- subset(full.t.nomulti.no00, Sample == "S6-mock24h") +vitro_ctrl_vars <- unique(vitro_ctrl_vars$Vars) +full.t.nomulti.no00.noGerm <- subset(full.t.nomulti.no00, !(Vars %in% vitro_ctrl_vars)) + +# Check Tool overlap +full.venn <- list() +for (ss in samples) { + if(ss == "S6-mock24h") {next} + print(ss) + full.venn[[ss]] <- list() + full.t.nomulti.no00.noGerm.ss <- subset(full.t.nomulti.no00.noGerm, Sample == ss) + for (tt in unique(full.t.nomulti.no00.noGerm.ss$Tool)) { + full.venn[[ss]][[tt]] <- full.t.nomulti.no00.noGerm.ss[full.t.nomulti.no00.noGerm.ss$Tool == tt, "Vars"] + } +} +pdf(paste(out_dir, "Full_NoMulti_no00_noGerm_venn_Tools.pdf", sep = "/"), width = 16, height = 8) +plot.new() +pushViewport(viewport(layout = grid.layout(2, 3))) +i <- 1 +j <- 1 +for (ss in names(full.venn)) { + venn <- Venn(full.venn[[ss]]) + data <- process_data(venn) + vreg <- venn_region(data) + vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]")) + cc <- brewer.pal(name = "Set2", n = 8) + cc_cat <- cc[1:length(unique(vreg$NumInters))] + names(cc_cat) <- unique(vreg$NumInters) + s.plot <- ggplot() + + # change mapping of color filling + geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) + + scale_fill_manual(values = cc_cat) + + # adjust edge size and color + geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) + + # show set label in bold + geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) + + # add a alternative region name + geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) + + ggtitle(paste0("Sample", ss)) + + theme_void() + print(s.plot, vp = define_region(row = i, col = j)) + j <- j + 1 + if (j > 3) { + j <- 1 + i <- i + 1 + } +} +dev.off() + +plot_variants(full.t = subset(full.t.nomulti.no00.noGerm, Tool == "Mutect2"), + out_dir = out_dir, plot_prefix = "Full_NoMulti_no00_noGerm_Mutect2", fill_by = "Group") + +#### Low Freq #### +full.t.nomulti.no00.noGerm.lowfreq <- subset(full.t.nomulti.no00.noGerm, VAF > 0.05 & VAF < 0.2) +table(full.t.nomulti.no00.noGerm.lowfreq$Tool, full.t.nomulti.no00.noGerm.lowfreq$Sample) + +full.venn <- list() +for (ss in samples) { + if(ss == "S6-mock24h") {next} + print(ss) + full.venn[[ss]] <- list() + full.t.nomulti.no00.noGerm.lowfreq.ss <- subset(full.t.nomulti.no00.noGerm.lowfreq, Sample == ss) + for (tt in unique(full.t.nomulti.no00.noGerm.lowfreq.ss$Tool)) { + full.venn[[ss]][[tt]] <- full.t.nomulti.no00.noGerm.lowfreq.ss[full.t.nomulti.no00.noGerm.lowfreq.ss$Tool == tt, "Vars"] + } +} +pdf(paste(out_dir, "Full_NoMulti_no00_noGerm_lowfreq_venn_Tools.pdf", sep = "/"), width = 16, height = 8) +plot.new() +pushViewport(viewport(layout = grid.layout(2, 3))) +i <- 1 +j <- 1 +for (ss in names(full.venn)) { + venn <- Venn(full.venn[[ss]]) + data <- process_data(venn) + vreg <- venn_region(data) + vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]")) + cc <- brewer.pal(name = "Set2", n = 8) + cc_cat <- cc[1:length(unique(vreg$NumInters))] + names(cc_cat) <- unique(vreg$NumInters) + s.plot <- ggplot() + + # change mapping of color filling + geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) + + scale_fill_manual(values = cc_cat) + + # adjust edge size and color + geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) + + # show set label in bold + geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) + + # add a alternative region name + geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) + + ggtitle(paste0("Sample", ss)) + + theme_void() + print(s.plot, vp = define_region(row = i, col = j)) + j <- j + 1 + if (j > 3) { + j <- 1 + i <- i + 1 + } +} +dev.off() + +plot_variants(full.t = subset(full.t.nomulti.no00.noGerm.lowfreq, Tool == "Mutect2"), + out_dir = out_dir, plot_prefix = "Full_NoMulti_no00_noGerm_lowfreq_Mutect2", fill_by = "Group") -- GitLab