#' Chart and filter clonal barcodes across samples ("light" pipeline).
#'
#' Imports per-sample barcode count tables produced by the barcode-seq
#' pipeline, computes per-sample abundance metrics (rank, %, saturation,
#' logCPM), applies a per-sample cut-off (automatic knee detection or a
#' custom threshold from the samplesheet), optionally removes barcodes
#' shared by too many samples within a group, and writes a battery of
#' plots (rank curves, heatmaps, UpSet, circos) and xlsx tables.
#'
#' @param bseq.fold       Root folder with one sub-folder per SampleID holding
#'                        `<id>_edit_<ed>.barcode.mc<mc>.tsv` files.
#' @param sampleids       SampleIDs vector to include in the analysis.
#' @param ed              Edit distance used when the input files were built.
#' @param mc              Minimum counts (also part of the input file name).
#' @param bcodes2reassign Named vector of barcodes to reassign to their
#'                        most-expressing sample; NULL disables reassignment.
#' @param samplesheet     Path to a table with columns SampleID, Group,
#'                        Cutoff (custom rank to cut), MaxN, Engraftment,
#'                        PctNGFR.  Required.
#' @param do.filter       TRUE/FALSE: apply per-sample filtering or not.
#' @param projectid       Prefix for files in the output folder.
#' @param outfolder       Output folder (required, created if missing).
#' @param counts_vector   Reserved: count thresholds for heatmaps (currently
#'                        unused by this function; kept for interface
#'                        compatibility).
#' @param pcts_vector     Reserved: percentage thresholds for heatmaps
#'                        (currently unused; kept for interface compatibility).
#' @param custom.cut      TRUE: use the samplesheet Cutoff column; FALSE: use
#'                        the automatic knee-point threshold.
#' @param method          "MaxSat" or "MinPct" — filter used when the
#'                        threshold is <= 1 (interpreted as a fraction/%).
#' @param MinlogCPMFilter With custom.cut TRUE and Cutoff > 1: TRUE filters on
#'                        logCPM, FALSE filters on rank.
#' @param removeMaxN      TRUE: drop barcodes present in more than MaxN
#'                        samples within a group.
#' @return Invisible side effects only; the final objects are saved to
#'         `<outfolder>/<projectid>_objects.rds`.
chartingBCs_light <- function(bseq.fold,
                              sampleids,
                              ed = 2,
                              mc = 3,
                              bcodes2reassign = NULL,
                              samplesheet = NULL,
                              do.filter = TRUE,
                              projectid = "Exp2020_06_MMs",
                              outfolder,
                              counts_vector = c(10, 20, 30, 50, 100, 200),
                              pcts_vector = c(0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 20),
                              custom.cut = FALSE,
                              method = "MaxSat",
                              MinlogCPMFilter = FALSE,
                              removeMaxN = TRUE) {

  library(data.table)
  library(reshape2)
  library(openxlsx)
  library(ComplexHeatmap)
  library(gridExtra)
  library(stringr)
  library(dplyr)
  library(RColorBrewer)
  library(edgeR)
  library(ggplot2)
  library(ggalluvial)
  library(circlize)
  library(chorddiag)
  library(htmlwidgets)
  library(UpSetR)
  library(stringi)

  #### GLOBAL SETTINGS ####

  # Header style for excel outputs.
  # FIX: removed trailing comma that created an empty argument in the call.
  hs <- createStyle(textDecoration = "BOLD", fgFill = "#1D8348")

  # Disable scientific notation for the duration of the call only.
  # FIX: the original permanently changed the caller's options; restore on exit.
  old_opts <- options(scipen = 999)
  on.exit(options(old_opts), add = TRUE)

  # Output folders.
  # FIX: `is.null()` on a missing argument errored before reaching the stop();
  # also normalised the double slashes produced by paste(..., sep = "/").
  if (missing(outfolder) || is.null(outfolder)) {
    stop("Please set the output folder ..")
  }
  outfolder <- paste0(outfolder, "/")
  dir.create(outfolder, recursive = TRUE, showWarnings = FALSE)
  tabledir <- paste0(outfolder, "Tables/")
  dir.create(tabledir, recursive = TRUE, showWarnings = FALSE)
  plotdir <- paste0(outfolder, "Plots/")
  dir.create(plotdir, recursive = TRUE, showWarnings = FALSE)

  # Retrieve group info from samplesheet file; the samplesheet drives both
  # the sample order and the per-sample custom thresholds.
  if (is.null(samplesheet)) {
    stop("Please provide a samplesheet file ..")
  }
  annot <- read.table(file = samplesheet, header = TRUE, stringsAsFactors = FALSE)
  annot <- subset(x = annot, SampleID %in% sampleids)
  print(sampleids)
  sampleids <- annot$SampleID  # reorder samples as in the samplesheet
  print(sampleids)

  #### PRIVATE HELPERS ####

  # (Re)compute the per-sample rank/abundance metrics in place.
  # Assumes `dt` holds ONE sample already sorted by decreasing Counts,
  # as the input files are.
  add_rank_metrics <- function(dt) {
    dt$Rank <- as.numeric(seq_len(nrow(dt)))
    dt$RankNorm <- dt$Rank / nrow(dt)
    dt$Percentage <- dt$Counts / sum(dt$Counts) * 100
    dt$Saturation <- cumsum(dt$Percentage)
    dt$logCPM <- log1p(dt$Counts / sum(dt$Counts) * 1000000)
    dt
  }

  # Read one sample's barcode table and annotate it with metrics.
  read_sample <- function(id) {
    indata <- fread(input = paste0(bseq.fold, "/", id, "/", id,
                                   "_edit_", ed, ".barcode.mc", mc, ".tsv"))
    indata$SampleID <- id
    colnames(indata) <- c("Barcode", "Counts", "SaturationOri", "SampleID")
    add_rank_metrics(indata)
  }

  # Side-by-side Rank-vs-logCPM / Rank-vs-Counts plots with a vertical line
  # at the rank of the last retained barcode (`cut_idx`).
  plot_cut <- function(t, sample, cut_idx) {
    p1 <- ggplot(t, aes(x = Rank, y = logCPM)) + theme_bw() +
      ggtitle(paste0("Rank vs logCPM: ", sample)) +
      geom_point() + geom_line() +
      geom_vline(xintercept = t$Rank[cut_idx], color = "#3a93bd", show.legend = TRUE)
    p2 <- ggplot(t, aes(x = Rank, y = Counts)) + theme_bw() +
      ggtitle(paste0("Rank vs Counts: ", sample)) +
      geom_point() + geom_line() +
      geom_vline(xintercept = t$Rank[cut_idx], color = "#3a93bd", show.legend = TRUE)
    print(grid.arrange(p1, p2, ncol = 2))
  }

  # Produce distribution charts, barcode-count summaries and the
  # count/CPM/abundance matrices for the current state of the data.
  # Returns the matrices as a named list (Raw/Binary/CPM/logCPM/pct_abund).
  datacharts <- function(indata, prefix = "raw", plotdir,
                         projectid = "project", mc, hs, tabledir) {
    fulldata_tmp <- indata

    #### Count distribution pdf ####
    pdf(file = paste0(plotdir, projectid, "_", prefix, "_distribution_counts.pdf"),
        width = 12, height = 9)
    plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM, color = SampleID)) +
           geom_line() + ggtitle("Rank by SampleID"))
    plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM, color = SampleID)) +
           geom_line() + facet_wrap(. ~ SampleID) +
           ggtitle("Rank by SampleID - split"))
    plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM, color = SampleID)) +
           geom_line() + ggtitle("Normalized Rank by SampleID"))
    plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM, color = SampleID)) +
           geom_line() + facet_wrap(. ~ SampleID) +
           ggtitle("Normalized by SampleID - split"))
    if (length(unique(fulldata_tmp$Group)) > 1) {
      plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line() + ggtitle("Rank by Group"))
      plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) + facet_wrap(. ~ Group) +
             ggtitle("Rank by Group - split"))
      plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) +
             ggtitle("Normalized Rank by Group"))
      plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) + facet_wrap(. ~ Group) +
             ggtitle("Normalized by Group - split"))
    }
    dev.off()

    #### Collect raw stats: N barcodes per sample at increasing coverage ####
    BarcodeCounts <- fulldata_tmp[Counts >= mc, .(nBarcodes = .N), by = SampleID]
    colnames(BarcodeCounts) <- c("SampleID", "nBarcodes")
    pdf(paste0(plotdir, projectid, "_", prefix, "_counts_by_coverage.pdf"),
        width = 12, height = 9)
    plot(ggplot(BarcodeCounts, aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
           geom_bar(stat = "identity") +
           ggtitle(paste0("Minimun read counts: ", mc)) +
           theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5)))
    colnames(BarcodeCounts) <- c("SampleID", paste0("nBarcodes_", mc))
    for (i in c(5, 10, 20, 50, 80, 100, 200, 500, 1000,
                1500, 2000, 3000, 5000, 10000)) {
      tmp <- fulldata_tmp[Counts >= i, .(nBarcodes = .N), by = SampleID]
      colnames(tmp) <- c("SampleID", "nBarcodes")
      plot(ggplot(tmp, aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
             geom_bar(stat = "identity") +
             ggtitle(paste0("Minimun read counts: ", i)) +
             theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5)))
      colnames(tmp) <- c("SampleID", paste0("nBarcodes_", i))
      BarcodeCounts <- merge(BarcodeCounts, tmp, by = "SampleID")
    }
    dev.off()

    # Transpose to a MinCount x SampleID table for the xlsx output.
    BarcodeCounts <- as.data.frame(t(BarcodeCounts))
    colnames(BarcodeCounts) <- BarcodeCounts[1, ]
    cols_back <- colnames(BarcodeCounts)
    rows_back <- rownames(BarcodeCounts)
    BarcodeCounts <- as.data.frame(BarcodeCounts[2:nrow(BarcodeCounts), ])
    colnames(BarcodeCounts) <- cols_back
    rownames(BarcodeCounts) <- rows_back[2:length(rows_back)]
    BarcodeCounts$MinCount <- rownames(BarcodeCounts)
    BarcodeCounts <- BarcodeCounts[, c("MinCount",
                                       colnames(BarcodeCounts)[1:(ncol(BarcodeCounts) - 1)])]
    write.xlsx(x = BarcodeCounts,
               file = paste0(tabledir, projectid, "_", prefix,
                             "_N.barcodes_by_min_counts.xlsx"),
               asTable = TRUE, colWidths = "auto", firstRow = TRUE,
               rowNames = FALSE, headerStyle = hs)

    # Cast to a barcode x sample count matrix (absent barcode = 0 counts).
    all_counts_casted <- dcast(fulldata_tmp, formula = Barcode ~ SampleID,
                               value.var = "Counts")
    all_counts_casted <- as.data.frame(all_counts_casted)
    all_counts_casted[is.na(all_counts_casted)] <- 0
    rownames(all_counts_casted) <- all_counts_casted$Barcode
    all_counts_casted$Barcode <- NULL

    # Presence/absence version.
    all_counts_casted_binary <- all_counts_casted
    all_counts_casted_binary[all_counts_casted_binary > 0] <- 1
    all_counts_casted_binary <- as.matrix(all_counts_casted_binary)

    # Percent abundance, CPM and logCPM matrices.
    percent_abundance <- prop.table(x = as.matrix(all_counts_casted), margin = 2) * 100
    percent_abundance <- as.data.frame(percent_abundance)
    CPM <- as.data.frame(cpm(y = all_counts_casted))
    logCPM <- log1p(CPM)

    #### Save matrices and the long-format table ####
    write.xlsx(x = list(Raw = as.data.frame(all_counts_casted),
                        Binary = as.data.frame(all_counts_casted_binary),
                        CPM = as.data.frame(CPM),
                        logCPM = as.data.frame(logCPM),
                        pct_abund = percent_abundance),
               file = paste0(tabledir, projectid, "_", prefix, "_counts_matrices.xlsx"),
               asTable = TRUE, overwrite = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, firstCol = TRUE, headerStyle = hs)
    write.xlsx(x = list(full = as.data.frame(fulldata_tmp)),
               file = paste0(tabledir, projectid, "_", prefix, "_fulldata.xlsx"),
               overwrite = TRUE, asTable = TRUE, colWidths = "auto",
               firstRow = TRUE, rowNames = FALSE, headerStyle = hs)

    list(Raw = as.data.frame(all_counts_casted),
         Binary = as.data.frame(all_counts_casted_binary),
         CPM = as.data.frame(CPM),
         logCPM = as.data.frame(logCPM),
         pct_abund = percent_abundance)
  }

  #### IMPORTING RAW DATA ####
  # FIX: the original looped over sampleids[2:length(sampleids)], which
  # produced an NA index when only one sample was supplied.
  fulldata <- rbindlist(lapply(sampleids, read_sample))

  # Adding info from samplesheet.
  fulldata <- data.table(merge.data.frame(x = fulldata, y = annot,
                                          by = "SampleID", all.x = TRUE, sort = FALSE))

  # Percentage corrected by engraftment and NGFR positivity.
  fulldata$Pct_x_EngraftWBM <- fulldata$Percentage *
    (fulldata$Engraftment / 100) * (fulldata$PctNGFR / 100)

  raw_objects <- datacharts(indata = fulldata, prefix = "raw",
                            plotdir = plotdir, projectid = projectid,
                            mc = mc, hs = hs, tabledir = tabledir)
  raw_objects$fulldata <- fulldata

  #### DERIVATIVES: automatic knee-point threshold per sample ####
  # The cut rank is where the second derivative of logCounts vs logRank is
  # most negative (steepest drop of the rank curve).
  fulldata_clean_list <- list()
  deriv <- function(x, y) diff(y) / diff(x)
  cut_thresh <- list()
  for (sample in unique(fulldata$SampleID)) {
    t <- subset(fulldata, SampleID == sample)
    d1log <- deriv(log10(t$Rank), log10(t$Counts))
    sd1_log <- which.min(diff(d1log)) + 1
    cut_thresh[[sample]] <- t$Rank[sd1_log]
  }

  #### PERFORM CUT ####
  pdf(file = paste0(plotdir, projectid, "_raw_barcode_distribution_cut_offs.pdf"),
      width = 12, height = 6)
  for (sample in annot$SampleID) {
    t <- subset(fulldata, SampleID == sample)
    # Threshold source: samplesheet Cutoff column, or the automatic knee.
    mythresh <- if (isTRUE(custom.cut)) {
      annot[annot$SampleID == sample, ]$Cutoff
    } else {
      cut_thresh[[sample]]
    }
    if (isTRUE(do.filter)) {
      if (mythresh > 1) {
        # Threshold > 1: interpreted as a logCPM floor or a rank cut.
        if (isTRUE(MinlogCPMFilter)) {
          print("Going with MinCPM filter ...")
          t_filt <- t[t$logCPM > mythresh, ]
          plot_cut(t, sample, nrow(t_filt))
        } else {
          print("Going with rank based filter or auto according to custom.cut parameter...")
          t_filt <- t[t$Rank < mythresh, ]
          plot_cut(t, sample, mythresh)
        }
      } else {
        # Threshold <= 1: interpreted as a % (MinPct) or fraction (MaxSat).
        if (method == "MinPct") {
          print("Going with percentage filter ...")
          t_filt <- subset(t, Percentage > mythresh)
          plot_cut(t, sample, nrow(t_filt))
        } else if (method == "MaxSat") {
          print("Going with saturation filter ...")
          t_filt <- t[t$Saturation < (mythresh * 100), ]
          plot_cut(t, sample, nrow(t_filt))
        } else {
          stop("Wrong method !!")
        }
      }
    } else {
      t_filt <- t
    }
    # Removing barcodes containing Ns, then recompute all metrics.
    t_filt <- t_filt[grep(x = t_filt$Barcode, pattern = "N", invert = TRUE), ]
    t_filt <- add_rank_metrics(t_filt)
    fulldata_clean_list[[sample]] <- t_filt
  }
  dev.off()

  # Re-create fulldata from the per-sample filtered tables.
  # FIX: same single-sample NA-index bug as in the import loop.
  fulldata <- rbindlist(fulldata_clean_list[sampleids])
  saveRDS(fulldata, paste0(outfolder, "fulldata_post_curve_filt_objects.rds"))

  #### RUN FILTERING ACCORDING TO SHARING WITHIN GROUP ####
  if (isTRUE(removeMaxN)) {
    bc_by_group <- fulldata[, .(cnt = .N), by = c("Barcode", "Group")]
    bcode2remove <- NULL
    for (g in unique(annot$Group)) {
      maxn <- max(annot[annot$Group == g, ]$MaxN)
      print(paste0("Collecting barcodes present in more than ", maxn,
                   " samples in group ", g, " ..."))
      bcode2remove <- c(bcode2remove,
                        bc_by_group[bc_by_group$Group == g &
                                      bc_by_group$cnt > maxn, ]$Barcode)
      print(paste0("Number of barcodes identified: ", length(bcode2remove)))
    }
    print(paste0("Total number of barcodes to remove from analysis: ",
                 length(unique(bcode2remove))))
    removed_data <- fulldata[fulldata$Barcode %in% unique(bcode2remove), ]
    fulldata <- fulldata[!fulldata$Barcode %in% unique(bcode2remove), ]
    print(x = paste0("Prima stampa"))
    print(head(fulldata))
    # Recompute the per-sample metrics after removal.
    fulldata <- as.data.table(fulldata %>%
                                group_by(SampleID) %>%
                                mutate(Percentage = Counts / sum(Counts) * 100) %>%
                                mutate(Saturation = cumsum(Percentage)) %>%
                                mutate(Rank = row_number()) %>%
                                mutate(RankNorm = Rank / max(Rank)))
    print(paste0("Seconda stampa .. post-ricalcolo percentuali"))
    print(head(fulldata))
    write.xlsx(x = list(full = as.data.frame(removed_data)),
               file = paste0(tabledir, projectid, "_removed_data.xlsx"),
               overwrite = TRUE, asTable = TRUE, colWidths = "auto",
               firstRow = TRUE, rowNames = FALSE, headerStyle = hs)
  }

  #### Optional re-assignment of barcodes to their most expressing sample ####
  if (!is.null(bcodes2reassign)) {
    tmpdf_reass <- as.data.frame(bcodes2reassign)
    colnames(tmpdf_reass) <- c("bcodes2reassign")
    tmpdf_reass$SampleID <- names(bcodes2reassign)
    tmpdf_reass$SampleBC <- paste0(tmpdf_reass$SampleID, "_",
                                   tmpdf_reass$bcodes2reassign)
    fulldata$SampleBC <- paste0(fulldata$SampleID, "_", fulldata$Barcode)
    # Rows untouched by the reassignment ...
    fulldata_good <- fulldata[!fulldata$Barcode %in% bcodes2reassign, ]
    print("fulldata_good print - elements shared within groups or not shared between groups")
    print(dim(fulldata_good))
    # ... plus each reassigned barcode kept only in its designated sample.
    fulldata_reassigned <- fulldata[fulldata$SampleBC %in% tmpdf_reass$SampleBC, ]
    print("fulldata_reassigned print - elements to reassing")
    print(dim(fulldata_reassigned))
    fulldata2 <- rbind(fulldata_good, fulldata_reassigned)
    print(dim(fulldata2))
    fulldata2 <- fulldata2[order(factor(fulldata2$SampleID, levels = rev(sampleids)),
                                 fulldata2$Counts, decreasing = TRUE), ]
    fulldata2$SampleBC <- NULL
    fulldata2 <- as.data.table(fulldata2 %>%
                                 group_by(SampleID) %>%
                                 mutate(Percentage = Counts / sum(Counts) * 100) %>%
                                 mutate(Saturation = cumsum(Percentage)) %>%
                                 mutate(Rank = row_number()) %>%
                                 mutate(RankNorm = Rank / max(Rank)))
    print("Fulldata 2 print .....")
    fulldata <- fulldata2
    print(dim(fulldata))
    rm(fulldata2)
  }
  print(dim(fulldata))

  #### Collect data and stats after filtering ####
  filtered_objects <- datacharts(indata = fulldata, prefix = "filtered",
                                 plotdir = plotdir, projectid = projectid,
                                 mc = mc, hs = hs, tabledir = tabledir)

  #### Pairwise Jaccard / intersection / sharing matrices ####
  jaccard <- function(a, b) {
    intersection <- length(intersect(a, b))
    union <- length(a) + length(b) - intersection
    intersection / union
  }
  ids <- unique(fulldata$SampleID)
  jacc_df <- as.data.frame(matrix(0, nrow = length(ids), ncol = length(ids),
                                  dimnames = list(ids, ids)))
  intersect_df <- jacc_df
  sharing_df <- jacc_df
  for (k in ids) {
    bc_k <- fulldata[fulldata$SampleID == k]$Barcode
    for (j in ids) {
      bc_j <- fulldata[fulldata$SampleID == j]$Barcode
      jacc_df[k, j] <- jaccard(a = bc_k, b = bc_j)
      intersect_df[k, j] <- length(intersect(bc_k, bc_j))
      # % of k's (unique) barcodes found in j.
      sharing_df[k, j] <- length(intersect(bc_k, bc_j)) / length(unique(bc_k)) * 100
    }
  }
  all_counts_casted_binary <- as.matrix(filtered_objects$Binary)
  jacc_mt <- as.matrix(jacc_df)
  intersect_mt <- as.matrix(intersect_df)
  sharing_mt <- as.matrix(sharing_df)

  # Unique vs shared barcode counts per sample.
  # FIX: the original used nrow(as.data.frame(binary[which(...), ])) which
  # returns the number of SAMPLES (not 1) when exactly one barcode matches,
  # because single-row matrix indexing drops to a vector.
  unique_shared <- as.data.frame(matrix(nrow = length(sampleids), ncol = 4))
  colnames(unique_shared) <- c("Total", "Unique", "Shared", "Shared_Pct")
  rownames(unique_shared) <- colnames(all_counts_casted_binary)
  totals <- colSums(all_counts_casted_binary)
  for (i in seq_len(nrow(unique_shared))) {
    in_sample <- all_counts_casted_binary[, i] == 1
    in_others <- rowSums(all_counts_casted_binary[, -i, drop = FALSE]) > 0
    unique_shared$Total[i] <- totals[i]
    unique_shared$Unique[i] <- sum(in_sample & !in_others)
    unique_shared$Shared[i] <- sum(in_sample & in_others)
    unique_shared$Shared_Pct[i] <- unique_shared$Shared[i] / unique_shared$Total[i] * 100
  }

  # Heatmaps of the pairwise matrices.
  numsize <- if (nrow(sharing_mt) > 15) 4 else 10
  pheatmap::pheatmap(mat = intersect_mt, main = "Barcodes intersection",
                     display_numbers = TRUE, number_format = "%.0f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Intersection_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = sharing_mt, main = "Percent of sharing",
                     display_numbers = TRUE, number_format = "%.1f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Sharing_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
                     display_numbers = TRUE, number_format = "%.2f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Jaccard_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
                     display_numbers = TRUE, number_format = "%.2f",
                     cluster_cols = TRUE, cluster_rows = TRUE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Jaccard_pairwise_auto.pdf"),
                     width = 12, height = 12)

  #### Sharing between groups ####
  intersect_vectors <- list()
  intersect_vectors_ori <- list()
  sum.in.group <- fulldata[, .(cnt = .N), by = c("Barcode", "Group")]
  sum.in.group.melt <- as.data.frame(dcast(data = sum.in.group,
                                           formula = Barcode ~ Group,
                                           value.var = "cnt"))
  sum.in.group.melt[is.na(sum.in.group.melt)] <- 0
  rownames(sum.in.group.melt) <- sum.in.group.melt$Barcode
  sum.in.group.melt$Barcode <- NULL

  if (length(unique(sum.in.group$Group)) > 1) {
    # Build the design matrix of all group combinations involving >= 2 groups.
    l <- list()
    for (gr in unique(sum.in.group$Group)) {
      l[[gr]] <- c(0, 1)
    }
    j <- expand.grid(l)
    if (ncol(j) == 2) {
      j <- matrix(j[rowSums(j) == 2, ], nrow = 1)
    } else {
      j <- as.matrix(j[rowSums(j) > 1, ])
    }
    colnames(j) <- unique(sum.in.group$Group)
    for (i in seq_len(nrow(j))) {
      rownames(j)[i] <- paste(colnames(j)[which(j[i, ] == 1)], collapse = "_vs_")
    }

    splitdata <- split(sum.in.group, sum.in.group$Group)
    for (i in names(splitdata)) {
      splitdata[[i]] <- splitdata[[i]]$Barcode
    }

    ###### UPSET ######
    upsetr.input <- fromList(splitdata)
    rownames(upsetr.input) <- unique(unlist(splitdata))
    upsetr.input <- upsetr.input[, colnames(j)]

    # For each combination, collect the barcodes whose presence/absence
    # pattern matches that combination exactly.
    for (comparison in rownames(j)) {
      matching.vect <- NULL
      for (n in seq_len(nrow(upsetr.input))) {
        if (identical(as.character(upsetr.input[n, ]),
                      as.character(j[comparison, ]))) {
          matching.vect <- c(matching.vect, n)
        }
      }
      bcodes <- rownames(upsetr.input[matching.vect, ])
      intersect_vectors[[comparison]] <- sum.in.group.melt[bcodes, ]
    }

    # Barcode counts per sample, by group.
    # FIX: the original emitted this pdf twice with the same filename; only
    # the second (fill + points) version survived, so only that one is kept.
    counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
    counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot,
                                               sort = FALSE, all.x = TRUE)
    counts_by_sample_annot$Cutoff <- NULL
    pdf(file = paste0(plotdir, projectid, "_BoxPlot_counts.pdf"),
        width = 12, height = 9)
    print(ggplot(counts_by_sample_annot,
                 aes(x = Group, y = count, fill = Group)) +
            geom_boxplot() + geom_point())
    dev.off()

    # UpSet plot (only meaningful with >= 2 groups).
    pdf(file = paste0(plotdir, projectid, "_UpSetR_by_groups.pdf"),
        width = 18, height = 9)
    print(upset(upsetr.input, order.by = "freq",
                sets = rev(colnames(upsetr.input)),
                keep.order = TRUE, nintersects = NA))
    dev.off()

    # Excel sheet names are limited to 31 chars: long comparison labels are
    # reversed and truncated to 30 chars (reversal keeps the distinctive
    # suffix of the label).
    intersect_vectors_ori <- intersect_vectors
    for (i in seq_along(intersect_vectors)) {
      if (nchar(names(intersect_vectors)[i]) > 30) {
        names(intersect_vectors)[i] <-
          str_sub(string = stri_reverse(names(intersect_vectors)[i]),
                  start = 1, end = 30)
      }
    }
    # FIX: isTRUE(duplicated(...)) was always FALSE for >1 element, so the
    # duplicate-sheet guard never fired.
    if (any(duplicated(names(intersect_vectors)))) {
      stop("Duplicated sheet names after truncating group-comparison labels")
    }
    write.xlsx(x = intersect_vectors,
               file = paste0(tabledir, projectid, "_SharingByGroup.xlsx"),
               overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, headerStyle = hs)
    write.xlsx(x = sum.in.group.melt,
               file = paste0(tabledir, projectid, "_AggregationByGroup.xlsx"),
               overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, headerStyle = hs)
  }

  write.xlsx(x = list(Jaccard = as.data.frame(jacc_mt),
                      Intersection = as.data.frame(intersect_mt),
                      Sharing = as.data.frame(sharing_mt),
                      Unique_and_shared = as.data.frame(unique_shared)),
             file = paste0(tabledir, projectid, "_Sharing.xlsx"),
             overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
             colWidths = "auto", firstRow = TRUE, headerStyle = hs)

  #### Shared barcodes within the group (only if N groups == 1) ####
  if (length(unique(fulldata$Group)) == 1) {
    shared_barcodes_list <- list()
    sharing_file <- paste0(tabledir, "Sharing_", projectid, ".txt")
    # FIX: guard file.remove to avoid a warning when the file does not exist.
    if (file.exists(sharing_file)) {
      file.remove(sharing_file)
    }
    for (i in unique(fulldata$SampleID)) {
      shared_barcodes_list[[i]] <- intersect(fulldata[fulldata$SampleID == i, ]$Barcode,
                                             fulldata[fulldata$SampleID != i, ]$Barcode)
      cur.pct <- length(shared_barcodes_list[[i]]) /
        length(fulldata[fulldata$SampleID == i, ]$Barcode)
      write(x = paste0("Sharing ", i, " vs all: ", round(cur.pct * 100, 2), "%"),
            file = sharing_file, append = TRUE)
    }
    shared_barcodes <- unique(unlist(shared_barcodes_list))
    cur.pct <- length(shared_barcodes) / length(unique(fulldata$Barcode))
    write(x = paste0("Sharing ", "within ", " group: ",
                     round(cur.pct * 100, 2), "%"),
          file = sharing_file, append = TRUE)
  }

  #### Circos plot of pairwise sharing (needs > 2 samples) ####
  circosdata <- NULL
  if (length(sampleids) > 2) {
    circosdata <- as.data.frame(sharing_mt)
    diag(circosdata) <- 0  # suppress self-sharing links
    circosdata$SampleID <- rownames(circosdata)
    mydata_long <- melt(circosdata, id.vars = "SampleID")
    colnames(mydata_long) <- c("rowname", "key", "value")
    pdf(file = paste0(plotdir, projectid, "_Circos_by_sampleIDs.pdf"),
        width = 16, height = 16)
    chordDiagram(mydata_long,
                 grid.col = colorRampPalette(brewer.pal(12, "Paired"))(
                   length(unique(mydata_long$rowname))))
    dev.off()
  }

  #### Final bundle of all objects ####
  saveRDS(object = list(fulldata = fulldata,
                        counts_abund_matrices = filtered_objects,
                        intersect_mt = intersect_mt,
                        sharing_mt = sharing_mt,
                        jacc_mt = jacc_mt,
                        aggregationbygroup = sum.in.group,
                        aggregationbygroup_melt = sum.in.group.melt,
                        sharingbygroup = intersect_vectors_ori,
                        sampleinfo = annot,
                        raw_objects = raw_objects,
                        circosdata = circosdata),
          file = paste0(outfolder, projectid, "_objects.rds"))
}