diff --git a/scripts/BARseq_plots.R b/scripts/BARseq_plots.R new file mode 100644 index 0000000000000000000000000000000000000000..6d359e9186689ef6571d0b2022ce808d2685cb8c --- /dev/null +++ b/scripts/BARseq_plots.R @@ -0,0 +1,408 @@ +# BAR-seq Analysis +library(dplyr) +library(stringr) +library(ggplot2) +library(reshape2) +library(ComplexHeatmap) +library(scales) +library(circlize) +library(RColorBrewer) +library(edgeR) +library(openxlsx) +library(DESeq2) +library(gridExtra) + +####################### +## Utility Functions ## +####################### +deriv <- function(x, y) diff(y) / diff(x) +middle_pts <- function(x) x[-1] - diff(x) / 2 + +elbow_point = function(x, y) { + # start and end point positions + wmin_x = which.min(x) + wmax_x = which.max(x) + # start point + x_start = x[wmin_x] + y_start = y[wmin_x] + # end point + x_end = x[wmax_x] + y_end = y[wmax_x] + # distance between x, y start and end + dx = x_end - x_start + dy = y_end - y_start + d_start_end = sqrt(dx^2 + dy^2) + # distance between each point in (x, y) and the line passing through the + # points (x_start, y_start) and (x_end, y_end) + d = abs(dx * (y_start - y) - (x_start - x) * dy) / d_start_end + wmax_d = which.max(d) + return(list(idx = wmax_d, x = x[wmax_d], y = y[wmax_d])) +} + +# Cut selection +select_cut <- function(t) { + d1 <- deriv(t$Rank, t$Count) + d1log <- deriv(log10(t$Rank), log10(t$Count)) + sd1_log <- which.min(diff(d1log)) + 1 + return(sd1_log) +} + +############# +## General ## +############# +wdir <- "Fiumara_BasePrimeEd2022_BARseq" +fname_suf <- "_edit_2.barcode.mc2.tsv" +data_dir <- paste(wdir, "02-barseq", sep = "/") +out_dir <- paste(wdir, "03-plots", sep = "/") +dir.create(path = out_dir, showWarnings = F) + +sinfo <- data.frame("Sample" = c()) +for (sample in list.dirs(path = data_dir, full.names = F)) { + if (sample == "") { next } + if (length(grep("clone", sample, value = T)) >= 1) { next } + print(sample) + sinfo <- rbind(sinfo, data.frame("Sample" = sample)) +} + +sinfo$CT <- str_split_fixed(sinfo$Sample, "_", 4)[,3] +sinfo <- mutate(sinfo, CellType = case_when(CT == "15w" ~ "PB 15Weeks", + CT == "BM" ~ "BM", + CT == "SPL" ~ "SPL", + CT == "9W" ~ "PB 9Weeks", + CT == "B" ~ "B cells", + CT == "M" ~ "Myeloid cells", + CT == "T" ~ "T cells", + CT == "CD34" ~ "CD34")) +sinfo$Mouse <- paste0("Mouse_", str_split_fixed(sinfo$Sample, "_", 4)[,4]) +sinfo$Mouse <- gsub("Mouse_A", "BE4_", sinfo$Mouse) +sinfo$Mouse <- gsub("Mouse_B", "ABE8_", sinfo$Mouse) +sinfo$Mouse <-gsub("Mouse_C", "Cas9_", sinfo$Mouse) +sinfo$Mouse <- gsub("Mouse_D", "UTelectro_", sinfo$Mouse) +sinfo$Treatment <- str_split_fixed(sinfo$Mouse, "_", 2)[,1] + +sinfo$Treatment <- factor(sinfo$Treatment, levels = c("UTelectro","Cas9","BE4","ABE8")) +sinfo$Mouse <- factor(sinfo$Mouse, levels = c("UTelectro_0","UTelectro_1","UTelectro_2","UTelectro_3","UTelectro_4", + "Cas9_0","Cas9_1","Cas9_4", + "BE4_0","BE4_1","BE4_2","BE4_3", + "ABE8_0","ABE8_1","ABE8_2","ABE8_3","ABE8_4" +)) +sinfo$CellType <- factor(sinfo$CellType, levels = c("PB 9Weeks","PB 15Weeks","BM","SPL","CD34","B cells","Myeloid cells","T cells")) +sinfo <- sinfo[order(sinfo$Treatment, sinfo$Mouse, sinfo$CellType),] + +################### +### Export XLSX ### +################### +df_count <- data.frame() +df_logcpm <- data.frame() +df_count_selected <- data.frame() +df_logcpm_selected <- data.frame() +df_count_dominant <- data.frame() +df_logcpm_dominant <- data.frame() +for (sample in sinfo$Sample) { + print(sample) + t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") + colnames(t) <- c("Barcode", "Count", "SaturationPerc") + t$logCPM <- log1p(edgeR::cpm(t$Count)) + t_count <- t[, c("Barcode", "Count")] + colnames(t_count) <- c("Barcode", sample) + t_logcpm <- t[, c("Barcode", "logCPM")] + colnames(t_logcpm) <- c("Barcode", sample) + t$Rank <- seq(1, nrow(t)) + # Selection + sel_cut <- select_cut(t) + t_sel <- subset(t, Rank <= t$Rank[sel_cut]) + t_count_sel <- t_sel[, c("Barcode", "Count")] + colnames(t_count_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut])) + t_logcpm_sel <- t_sel[, c("Barcode", "logCPM")] + colnames(t_logcpm_sel) <- c("Barcode", paste0(sample, ":rank", t$Rank[sel_cut])) + # Dominant + ebow <- elbow_point(x = t$Rank, t$Count) + dominant_cut <- ebow$idx + t_dom <- subset(t, Rank <= t$Rank[dominant_cut]) + t_count_dom <- t_dom[, c("Barcode", "Count")] + colnames(t_count_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut])) + t_logcpm_dom <- t_dom[, c("Barcode", "logCPM")] + colnames(t_logcpm_dom) <- c("Barcode", paste0(sample, ":rank", t$Rank[dominant_cut])) + + if(nrow(df_count) == 0) { + df_count <- t_count + df_logcpm <- t_logcpm + df_count_selected <- t_count_sel + df_logcpm_selected <- t_logcpm_sel + df_count_dominant <- t_count_dom + df_logcpm_dominant <- t_logcpm_dom + } else { + df_count <- merge(df_count, t_count, by = "Barcode", all = T) + df_logcpm <- merge(df_logcpm, t_logcpm, by = "Barcode", all = T) + df_count_selected <- merge(df_count_selected, t_count_sel, by = "Barcode", all = T) + df_logcpm_selected <- merge(df_logcpm_selected, t_logcpm_sel, by = "Barcode", all = T) + df_count_dominant <- merge(df_count_dominant, t_count_dom, by = "Barcode", all = T) + df_logcpm_dominant <- merge(df_logcpm_dominant, t_logcpm_dom, by = "Barcode", all = T) + } +} +write.xlsx(x = list("Count" = df_count, "logCPM" = df_logcpm, + "Count_Selected" = df_count_selected, "logCPM_Selected" = df_logcpm_selected, + "Count_Dominant" = df_count_dominant, "logCPM_Dominant" = df_logcpm_dominant), + file = paste(out_dir, "Full_barcodes.xlsx", sep = "/"), overwrite = T, + asTable = T, colWidths = "auto", firstRow = T) + +# Stats +full.df <- data.frame() +for (sample in sinfo$Sample) { + print(sample) + t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") + colnames(t) <- c("Barcode", "Count", "SaturationPerc") + tot_bars <- nrow(t) + t$logCPM <- log1p(edgeR::cpm(t$Count)) + t$Rank <- seq(1, nrow(t)) + # Cut selection + sel_cut <- select_cut(t) + tot_sum <- sum(t$Count) + t_sel <- subset(t, Rank <= t$Rank[sel_cut]) + sel_bars <- nrow(t_sel) + sel_sum <- sum(t_sel$Count) + # Cut dominant + ebow <- elbow_point(x = t$Rank, t$Count) + dominant_cut <- ebow$idx + t_dom <- subset(t, Rank <= t$Rank[dominant_cut]) + dom_bars <- nrow(t_dom) + dom_sum <- sum(t_dom$Count) + mm <- as.character(sinfo[sinfo$Sample == sample, "Mouse"]) + ct <- as.character(sinfo[sinfo$Sample == sample, "CellType"]) + df <- data.frame("Sample" = c(sample), "Treatment" = c(TT), + "Mouse" = c(mm), "CellType" = c(ct), + "TotBarcodes" = c(tot_bars), "SumTotBarcodes" = c(tot_sum), + "SelBarcodes" = c(sel_bars), "SelBarcodesPerc" = c(round((sel_bars/tot_bars)*100, 2)), + "SumSelBarcodes" = c(sel_sum), "SumSelBarcodesPerc" = c(round((sel_sum/tot_sum)*100, 2)), + "DomBarcodes" = c(dom_bars), "DomBarcodesPerc" = c(round((dom_bars/tot_bars)*100, 2)), + "SumDomBarcodes" = c(dom_sum), "SumDomBarcodesPerc" = c(round((dom_sum/tot_sum)*100, 2)) + ) + full.df <- rbind(full.df, df) +} +write.xlsx(x = list("SampleStats" = full.df), file = paste(out_dir, "SampleStats.xlsx", sep = "/"), overwrite = T) + + +######################### +### Heatmap Selection ### +######################### +full.t <- data.frame() +full.t.hm <- data.frame() +full.t.annot <- data.frame() +for (sample in sinfo$Sample) { + print(sample) + t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") + colnames(t) <- c("Barcode", "Count", "SaturationPerc") + t$Sample <- sample + t$logCPM <- log1p(edgeR::cpm(t$Count)) + t$Rank <- seq(1, nrow(t)) + t$NormRank <- t$Rank / nrow(t) + # Cut selection + sel_cut <- select_cut(t) + t$SelRank <- t$Rank[sel_cut] + t$SelNormRank <- t$NormRank[sel_cut] + full.t <- rbind(full.t, t) + + t.hm <- subset(t, Rank <= t$Rank[sel_cut]) %>% select(Barcode, Count, Sample, SaturationPerc) + full.t.hm <- rbind(full.t.hm, t.hm) + for (r in 1:nrow(t.hm)) { + if (nrow(full.t.annot) == 0) { + full.t.annot <- t.hm[r, c("Barcode", "Sample"), drop = FALSE] + } else { + bcode <- t.hm[r, "Barcode"] + if (!(bcode %in% full.t.annot$Barcode)) { + full.t.annot <- rbind(full.t.annot, t.hm[r, c("Barcode", "Sample"), drop = FALSE]) + } + } + } +} +full.t <- merge(full.t, sinfo, by = "Sample") + +full.t.hm$SaturationPerc <- NULL +full.tt.hm <- dcast(full.t.hm, Barcode ~ Sample, value.var = "Count") +rownames(full.tt.hm) <- full.tt.hm$Barcode +full.tt.hm$Barcode <- NULL +full.tt.hm[is.na(full.tt.hm)] <- 0 +# logCPM +full.tt.hm.cpm <- edgeR::cpm(full.tt.hm) +full.tt.hm.cpm <- log1p(full.tt.hm.cpm) +full.tt.hm.cpm <- full.tt.hm.cpm[full.t.annot$Barcode, sinfo$Sample] +# log Abundance +full.tt.hm <- log1p(full.tt.hm) +full.tt.hm <- full.tt.hm[full.t.annot$Barcode, sinfo$Sample] + +annot <- sinfo +rownames(annot) <- annot$Sample +mm_num <- paste0("Mouse", seq(1,length(levels(annot$Mouse)))) +names(mm_num) <- levels(annot$Mouse) +annot$MouseName <- factor(mm_num[annot$Mouse], levels = paste0("Mouse", seq(1,length(levels(annot$Mouse))))) +ct_colors <- brewer.pal(length(unique(annot$CellType)), "Set2") +names(ct_colors) <- unique(annot$CellType) + +ha_col <- columnAnnotation(df = annot[, "CellType", drop = F], + col = list(CellType = ct_colors), + show_annotation_name = F) + + +hm = Heatmap(matrix = as.matrix(full.tt.hm), + name = "log(Count)", + col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100), + cluster_columns = FALSE, + cluster_rows = FALSE, + rect_gp = gpar(col = "grey", lwd = 0), + border = T, + show_row_names = FALSE, + show_column_names = TRUE, + column_names_gp = gpar(fontsize = 9), + column_split = annot$MouseName, + column_title_side = "top", + column_title_gp = gpar(fontsize = 9), + column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"), + top_annotation = ha_col, + use_raster = TRUE +) +pdf(paste(out_dir, "Full_HM_selected.pdf", sep = "/"), width = 16, height = 10) +draw(hm, merge_legends = TRUE) +dev.off() + +hm = Heatmap(matrix = as.matrix(full.tt.hm.cpm), + name = "logCPM", + col = colorRampPalette(brewer.pal(n = 9, name ="OrRd"))(100), + cluster_columns = FALSE, + cluster_rows = FALSE, + rect_gp = gpar(col = "grey", lwd = 0), + border = T, + show_row_names = FALSE, + show_column_names = TRUE, + column_names_gp = gpar(fontsize = 9), + column_split = annot$MouseName, + column_title_side = "top", + column_title_gp = gpar(fontsize = 9), + column_gap = unit(c(rep(.1, 4), 2, rep(.1, 3), 2, rep(.1, 2), 2, rep(.1, 4)), "mm"), + top_annotation = ha_col, + use_raster = TRUE +) +pdf(paste(out_dir, "Full_HM_logCPM_selected.pdf", sep = "/"), width = 16, height = 10) +draw(hm, merge_legends = TRUE) +dev.off() + + +####################### +### Barcode Sharing ### +####################### +### Selected ### +df_count <- data.frame() +for (sample in sinfo$Sample) { + print(sample) + t <- read.table(paste(data_dir, sample, paste0(sample, fname_suf), sep = "/"), header = T, sep = "\t") + colnames(t) <- c("Barcode", "Count", "SaturationPerc") + t$logCPM <- log1p(edgeR::cpm(t$Count)) + t$Rank <- seq(1, nrow(t)) + t$NormRank <- t$Rank / nrow(t) + # Cut selection + sel_cut <- select_cut(t) + t <- subset(t, Rank <= t$Rank[sel_cut]) + t_count <- t[, c("Barcode", "Count")] + colnames(t_count) <- c("Barcode", sample) + if(nrow(df_count) == 0) { + df_count <- t_count + } else { + df_count <- merge(df_count, t_count, by = "Barcode", all = T) + } +} +tt <- df_count +tt[is.na(tt)] <- 0 +rownames(tt) <- tt$Barcode +tt$Barcode <- NULL +present = !is.na(tt) & tt > 0 +intersect = crossprod(present) +union = nrow(tt) - crossprod(!present) +jacc_mat = intersect / union +jacc_mat[lower.tri(jacc_mat)] <- NA +jacc_mat_melt <- melt(jacc_mat) +jacc_mat_melt <- merge(jacc_mat_melt, sinfo, by.y = "Sample", by.x = "Var1") + +p <- ggplot(jacc_mat_melt, aes(x = Var1, y = Var2, fill = value)) + + theme_bw(base_size = 16) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 5)) + + theme(axis.text.y = element_text(size = 5)) + + theme(panel.spacing.x = unit(0, "mm")) + + ylab("") + + xlab("") + + ggtitle("Full Selected - Jaccard Index") + + geom_tile(width = .95, height = .95) + + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", + colours=c("navyblue", "darkmagenta", "darkorange1"), + breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) + + facet_grid(. ~ Treatment, scales = "free", space = "free") + +zoom1_ds <- c(grep("_B0", jacc_mat_melt$Var1, value = T), grep("_B1", jacc_mat_melt$Var1, value = T)) +zoom1 <- subset(jacc_mat_melt, Var1 %in% zoom1_ds & Var2 %in% zoom1_ds) +zoom1$Treatment <- droplevels(zoom1$Treatment) +p_zoom1 <- ggplot(zoom1, aes(x = Var1, y = Var2, fill = value)) + + theme_bw() + + theme(axis.text.x = element_blank()) + + theme(axis.text.y = element_blank()) + + theme(legend.position = "none") + + theme( + plot.background = element_rect(fill = "transparent", colour = NA), + panel.border = element_rect(fill = "transparent", colour = "forestgreen", size = 2), + plot.margin = unit(c(0, 0, 0, 0), "null"), + axis.ticks = element_blank() + ) + + ylab("") + + xlab("") + + geom_tile(width = .95, height = .95) + + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", + colours=c("navyblue", "darkmagenta", "darkorange1"), + breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) + +zoom2_ds <- c(grep("_C1", jacc_mat_melt$Var1, value = T), grep("_C4", jacc_mat_melt$Var1, value = T)) +zoom2 <- subset(jacc_mat_melt, Var1 %in% zoom2_ds & Var2 %in% zoom2_ds) +p_zoom2 <- ggplot(zoom2, aes(x = Var1, y = Var2, fill = value)) + + theme_bw() + + theme(axis.text.x = element_blank()) + + theme(axis.text.y = element_blank()) + + theme(legend.position = "none") + + theme( + plot.background = element_rect(fill = "transparent", colour = NA), + panel.border = element_rect(fill = "transparent", colour = "red", size = 2), + plot.margin = unit(c(0, 0, 0, 0), "null"), + axis.ticks = element_blank() + ) + + ylab("") + + xlab("") + + geom_tile(width = .95, height = .95) + + scale_fill_gradientn(limits = c(0.05,1),na.value = "grey90", name = "Jaccard", + colours=c("navyblue", "darkmagenta", "darkorange1"), + breaks = seq(0, 1, 0.2), labels = format(seq(0, 1, 0.2))) + + +ss <- 0.27 +pdf(file = paste(out_dir, "Full_sharing_HM_selected.pdf", sep = "/"), width = 12, height = 10) +grid.newpage() +grid.draw(arrangeGrob(p)) +grid.rect(x = unit(0.7, "npc"), y = unit(0.728, "npc"), + width = unit(0.1, "npc"), height = unit(0.1, "npc"), + gp=gpar(col = "forestgreen", lwd = 3)) +grid.draw(grobTree(ggplotGrob(p_zoom1), + vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"), + x = unit(0.74,"npc"),y = unit(0.48,"npc")))) +grid.lines(x = unit(c(0.6245, 0.65), "npc"), + y = unit(c(0.613, 0.677), "npc"), + gp=gpar(col = "forestgreen", lwd = 3)) +grid.lines(x = unit(c(0.874, 0.75), "npc"), + y = unit(c(0.613, 0.677), "npc"), + gp=gpar(col = "forestgreen", lwd = 3)) +grid.rect(x = unit(0.418, "npc"), y = unit(0.44, "npc"), + width = unit(0.1, "npc"), height = unit(0.1, "npc"), + gp=gpar(col = "red", lwd = 3)) +grid.draw(grobTree(ggplotGrob(p_zoom2), + vp=viewport(width = unit(ss, "npc"), height = unit(ss, "npc"), + x = unit(0.48,"npc"),y = unit(0.22,"npc")))) +grid.lines(x = unit(c(0.47, 0.613), "npc"), + y = unit(c(0.39, 0.355), "npc"), + gp=gpar(col = "red", lwd = 3)) +grid.lines(x = unit(c(0.368, 0.3645), "npc"), + y = unit(c(0.39, 0.355), "npc"), + gp=gpar(col = "red", lwd = 3)) +dev.off()