#' Chart and filter clonal barcodes across samples ("light" pipeline).
#'
#' Imports per-sample barcode count tables produced by the barcode-seq
#' pipeline, computes per-sample abundance metrics (rank, %, saturation,
#' logCPM), applies a per-sample cut-off (automatic knee detection or a
#' custom threshold from the samplesheet), optionally removes barcodes
#' shared by too many samples within a group, and writes a battery of
#' plots (rank curves, heatmaps, UpSet, circos) and xlsx tables.
#'
#' @param bseq.fold       Root folder with one sub-folder per SampleID holding
#'                        `<id>_edit_<ed>.barcode.mc<mc>.tsv` files.
#' @param sampleids       SampleIDs vector to include in the analysis.
#' @param ed              Edit distance used when the input files were built.
#' @param mc              Minimum counts (also part of the input file name).
#' @param bcodes2reassign Named vector of barcodes to reassign to their
#'                        most-expressing sample; NULL disables reassignment.
#' @param samplesheet     Path to a table with columns SampleID, Group,
#'                        Cutoff (custom rank to cut), MaxN, Engraftment,
#'                        PctNGFR.  Required.
#' @param do.filter       TRUE/FALSE: apply per-sample filtering or not.
#' @param projectid       Prefix for files in the output folder.
#' @param outfolder       Output folder (required, created if missing).
#' @param counts_vector   Reserved: count thresholds for heatmaps (currently
#'                        unused by this function; kept for interface
#'                        compatibility).
#' @param pcts_vector     Reserved: percentage thresholds for heatmaps
#'                        (currently unused; kept for interface compatibility).
#' @param custom.cut      TRUE: use the samplesheet Cutoff column; FALSE: use
#'                        the automatic knee-point threshold.
#' @param method          "MaxSat" or "MinPct" — filter used when the
#'                        threshold is <= 1 (interpreted as a fraction/%).
#' @param MinlogCPMFilter With custom.cut TRUE and Cutoff > 1: TRUE filters on
#'                        logCPM, FALSE filters on rank.
#' @param removeMaxN      TRUE: drop barcodes present in more than MaxN
#'                        samples within a group.
#' @return Invisible side effects only; the final objects are saved to
#'         `<outfolder>/<projectid>_objects.rds`.
chartingBCs_light <- function(bseq.fold,
                              sampleids,
                              ed = 2,
                              mc = 3,
                              bcodes2reassign = NULL,
                              samplesheet = NULL,
                              do.filter = TRUE,
                              projectid = "Exp2020_06_MMs",
                              outfolder,
                              counts_vector = c(10, 20, 30, 50, 100, 200),
                              pcts_vector = c(0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 20),
                              custom.cut = FALSE,
                              method = "MaxSat",
                              MinlogCPMFilter = FALSE,
                              removeMaxN = TRUE) {

  library(data.table)
  library(reshape2)
  library(openxlsx)
  library(ComplexHeatmap)
  library(gridExtra)
  library(stringr)
  library(dplyr)
  library(RColorBrewer)
  library(edgeR)
  library(ggplot2)
  library(ggalluvial)
  library(circlize)
  library(chorddiag)
  library(htmlwidgets)
  library(UpSetR)
  library(stringi)

  #### GLOBAL SETTINGS ####

  # Header style for excel outputs.
  # FIX: removed trailing comma that created an empty argument in the call.
  hs <- createStyle(textDecoration = "BOLD", fgFill = "#1D8348")

  # Disable scientific notation for the duration of the call only.
  # FIX: the original permanently changed the caller's options; restore on exit.
  old_opts <- options(scipen = 999)
  on.exit(options(old_opts), add = TRUE)

  # Output folders.
  # FIX: `is.null()` on a missing argument errored before reaching the stop();
  # also normalised the double slashes produced by paste(..., sep = "/").
  if (missing(outfolder) || is.null(outfolder)) {
    stop("Please set the output folder ..")
  }
  outfolder <- paste0(outfolder, "/")
  dir.create(outfolder, recursive = TRUE, showWarnings = FALSE)
  tabledir <- paste0(outfolder, "Tables/")
  dir.create(tabledir, recursive = TRUE, showWarnings = FALSE)
  plotdir <- paste0(outfolder, "Plots/")
  dir.create(plotdir, recursive = TRUE, showWarnings = FALSE)

  # Retrieve group info from samplesheet file; the samplesheet drives both
  # the sample order and the per-sample custom thresholds.
  if (is.null(samplesheet)) {
    stop("Please provide a samplesheet file ..")
  }
  annot <- read.table(file = samplesheet, header = TRUE, stringsAsFactors = FALSE)
  annot <- subset(x = annot, SampleID %in% sampleids)
  print(sampleids)
  sampleids <- annot$SampleID  # reorder samples as in the samplesheet
  print(sampleids)

  #### PRIVATE HELPERS ####

  # (Re)compute the per-sample rank/abundance metrics in place.
  # Assumes `dt` holds ONE sample already sorted by decreasing Counts,
  # as the input files are.
  add_rank_metrics <- function(dt) {
    dt$Rank <- as.numeric(seq_len(nrow(dt)))
    dt$RankNorm <- dt$Rank / nrow(dt)
    dt$Percentage <- dt$Counts / sum(dt$Counts) * 100
    dt$Saturation <- cumsum(dt$Percentage)
    dt$logCPM <- log1p(dt$Counts / sum(dt$Counts) * 1000000)
    dt
  }

  # Read one sample's barcode table and annotate it with metrics.
  read_sample <- function(id) {
    indata <- fread(input = paste0(bseq.fold, "/", id, "/", id,
                                   "_edit_", ed, ".barcode.mc", mc, ".tsv"))
    indata$SampleID <- id
    colnames(indata) <- c("Barcode", "Counts", "SaturationOri", "SampleID")
    add_rank_metrics(indata)
  }

  # Side-by-side Rank-vs-logCPM / Rank-vs-Counts plots with a vertical line
  # at the rank of the last retained barcode (`cut_idx`).
  plot_cut <- function(t, sample, cut_idx) {
    p1 <- ggplot(t, aes(x = Rank, y = logCPM)) + theme_bw() +
      ggtitle(paste0("Rank vs logCPM: ", sample)) +
      geom_point() + geom_line() +
      geom_vline(xintercept = t$Rank[cut_idx], color = "#3a93bd", show.legend = TRUE)
    p2 <- ggplot(t, aes(x = Rank, y = Counts)) + theme_bw() +
      ggtitle(paste0("Rank vs Counts: ", sample)) +
      geom_point() + geom_line() +
      geom_vline(xintercept = t$Rank[cut_idx], color = "#3a93bd", show.legend = TRUE)
    print(grid.arrange(p1, p2, ncol = 2))
  }

  # Produce distribution charts, barcode-count summaries and the
  # count/CPM/abundance matrices for the current state of the data.
  # Returns the matrices as a named list (Raw/Binary/CPM/logCPM/pct_abund).
  datacharts <- function(indata, prefix = "raw", plotdir,
                         projectid = "project", mc, hs, tabledir) {
    fulldata_tmp <- indata

    #### Count distribution pdf ####
    pdf(file = paste0(plotdir, projectid, "_", prefix, "_distribution_counts.pdf"),
        width = 12, height = 9)
    plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM, color = SampleID)) +
           geom_line() + ggtitle("Rank by SampleID"))
    plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM, color = SampleID)) +
           geom_line() + facet_wrap(. ~ SampleID) +
           ggtitle("Rank by SampleID - split"))
    plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM, color = SampleID)) +
           geom_line() + ggtitle("Normalized Rank by SampleID"))
    plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM, color = SampleID)) +
           geom_line() + facet_wrap(. ~ SampleID) +
           ggtitle("Normalized by SampleID - split"))
    if (length(unique(fulldata_tmp$Group)) > 1) {
      plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line() + ggtitle("Rank by Group"))
      plot(ggplot(fulldata_tmp, aes(x = Rank, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) + facet_wrap(. ~ Group) +
             ggtitle("Rank by Group - split"))
      plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) +
             ggtitle("Normalized Rank by Group"))
      plot(ggplot(fulldata_tmp, aes(x = RankNorm, y = logCPM,
                                    group = SampleID, color = Group)) +
             geom_line(aes(color = Group)) + facet_wrap(. ~ Group) +
             ggtitle("Normalized by Group - split"))
    }
    dev.off()

    #### Collect raw stats: N barcodes per sample at increasing coverage ####
    BarcodeCounts <- fulldata_tmp[Counts >= mc, .(nBarcodes = .N), by = SampleID]
    colnames(BarcodeCounts) <- c("SampleID", "nBarcodes")
    pdf(paste0(plotdir, projectid, "_", prefix, "_counts_by_coverage.pdf"),
        width = 12, height = 9)
    plot(ggplot(BarcodeCounts, aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
           geom_bar(stat = "identity") +
           ggtitle(paste0("Minimun read counts: ", mc)) +
           theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5)))
    colnames(BarcodeCounts) <- c("SampleID", paste0("nBarcodes_", mc))
    for (i in c(5, 10, 20, 50, 80, 100, 200, 500, 1000,
                1500, 2000, 3000, 5000, 10000)) {
      tmp <- fulldata_tmp[Counts >= i, .(nBarcodes = .N), by = SampleID]
      colnames(tmp) <- c("SampleID", "nBarcodes")
      plot(ggplot(tmp, aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
             geom_bar(stat = "identity") +
             ggtitle(paste0("Minimun read counts: ", i)) +
             theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5)))
      colnames(tmp) <- c("SampleID", paste0("nBarcodes_", i))
      BarcodeCounts <- merge(BarcodeCounts, tmp, by = "SampleID")
    }
    dev.off()

    # Transpose to a MinCount x SampleID table for the xlsx output.
    BarcodeCounts <- as.data.frame(t(BarcodeCounts))
    colnames(BarcodeCounts) <- BarcodeCounts[1, ]
    cols_back <- colnames(BarcodeCounts)
    rows_back <- rownames(BarcodeCounts)
    BarcodeCounts <- as.data.frame(BarcodeCounts[2:nrow(BarcodeCounts), ])
    colnames(BarcodeCounts) <- cols_back
    rownames(BarcodeCounts) <- rows_back[2:length(rows_back)]
    BarcodeCounts$MinCount <- rownames(BarcodeCounts)
    BarcodeCounts <- BarcodeCounts[, c("MinCount",
                                       colnames(BarcodeCounts)[1:(ncol(BarcodeCounts) - 1)])]
    write.xlsx(x = BarcodeCounts,
               file = paste0(tabledir, projectid, "_", prefix,
                             "_N.barcodes_by_min_counts.xlsx"),
               asTable = TRUE, colWidths = "auto", firstRow = TRUE,
               rowNames = FALSE, headerStyle = hs)

    # Cast to a barcode x sample count matrix (absent barcode = 0 counts).
    all_counts_casted <- dcast(fulldata_tmp, formula = Barcode ~ SampleID,
                               value.var = "Counts")
    all_counts_casted <- as.data.frame(all_counts_casted)
    all_counts_casted[is.na(all_counts_casted)] <- 0
    rownames(all_counts_casted) <- all_counts_casted$Barcode
    all_counts_casted$Barcode <- NULL

    # Presence/absence version.
    all_counts_casted_binary <- all_counts_casted
    all_counts_casted_binary[all_counts_casted_binary > 0] <- 1
    all_counts_casted_binary <- as.matrix(all_counts_casted_binary)

    # Percent abundance, CPM and logCPM matrices.
    percent_abundance <- prop.table(x = as.matrix(all_counts_casted), margin = 2) * 100
    percent_abundance <- as.data.frame(percent_abundance)
    CPM <- as.data.frame(cpm(y = all_counts_casted))
    logCPM <- log1p(CPM)

    #### Save matrices and the long-format table ####
    write.xlsx(x = list(Raw = as.data.frame(all_counts_casted),
                        Binary = as.data.frame(all_counts_casted_binary),
                        CPM = as.data.frame(CPM),
                        logCPM = as.data.frame(logCPM),
                        pct_abund = percent_abundance),
               file = paste0(tabledir, projectid, "_", prefix, "_counts_matrices.xlsx"),
               asTable = TRUE, overwrite = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, firstCol = TRUE, headerStyle = hs)
    write.xlsx(x = list(full = as.data.frame(fulldata_tmp)),
               file = paste0(tabledir, projectid, "_", prefix, "_fulldata.xlsx"),
               overwrite = TRUE, asTable = TRUE, colWidths = "auto",
               firstRow = TRUE, rowNames = FALSE, headerStyle = hs)

    list(Raw = as.data.frame(all_counts_casted),
         Binary = as.data.frame(all_counts_casted_binary),
         CPM = as.data.frame(CPM),
         logCPM = as.data.frame(logCPM),
         pct_abund = percent_abundance)
  }

  #### IMPORTING RAW DATA ####
  # FIX: the original looped over sampleids[2:length(sampleids)], which
  # produced an NA index when only one sample was supplied.
  fulldata <- rbindlist(lapply(sampleids, read_sample))

  # Adding info from samplesheet.
  fulldata <- data.table(merge.data.frame(x = fulldata, y = annot,
                                          by = "SampleID", all.x = TRUE, sort = FALSE))

  # Percentage corrected by engraftment and NGFR positivity.
  fulldata$Pct_x_EngraftWBM <- fulldata$Percentage *
    (fulldata$Engraftment / 100) * (fulldata$PctNGFR / 100)

  raw_objects <- datacharts(indata = fulldata, prefix = "raw",
                            plotdir = plotdir, projectid = projectid,
                            mc = mc, hs = hs, tabledir = tabledir)
  raw_objects$fulldata <- fulldata

  #### DERIVATIVES: automatic knee-point threshold per sample ####
  # The cut rank is where the second derivative of logCounts vs logRank is
  # most negative (steepest drop of the rank curve).
  fulldata_clean_list <- list()
  deriv <- function(x, y) diff(y) / diff(x)
  cut_thresh <- list()
  for (sample in unique(fulldata$SampleID)) {
    t <- subset(fulldata, SampleID == sample)
    d1log <- deriv(log10(t$Rank), log10(t$Counts))
    sd1_log <- which.min(diff(d1log)) + 1
    cut_thresh[[sample]] <- t$Rank[sd1_log]
  }

  #### PERFORM CUT ####
  pdf(file = paste0(plotdir, projectid, "_raw_barcode_distribution_cut_offs.pdf"),
      width = 12, height = 6)
  for (sample in annot$SampleID) {
    t <- subset(fulldata, SampleID == sample)
    # Threshold source: samplesheet Cutoff column, or the automatic knee.
    mythresh <- if (isTRUE(custom.cut)) {
      annot[annot$SampleID == sample, ]$Cutoff
    } else {
      cut_thresh[[sample]]
    }
    if (isTRUE(do.filter)) {
      if (mythresh > 1) {
        # Threshold > 1: interpreted as a logCPM floor or a rank cut.
        if (isTRUE(MinlogCPMFilter)) {
          print("Going with MinCPM filter ...")
          t_filt <- t[t$logCPM > mythresh, ]
          plot_cut(t, sample, nrow(t_filt))
        } else {
          print("Going with rank based filter or auto according to custom.cut parameter...")
          t_filt <- t[t$Rank < mythresh, ]
          plot_cut(t, sample, mythresh)
        }
      } else {
        # Threshold <= 1: interpreted as a % (MinPct) or fraction (MaxSat).
        if (method == "MinPct") {
          print("Going with percentage filter ...")
          t_filt <- subset(t, Percentage > mythresh)
          plot_cut(t, sample, nrow(t_filt))
        } else if (method == "MaxSat") {
          print("Going with saturation filter ...")
          t_filt <- t[t$Saturation < (mythresh * 100), ]
          plot_cut(t, sample, nrow(t_filt))
        } else {
          stop("Wrong method !!")
        }
      }
    } else {
      t_filt <- t
    }
    # Removing barcodes containing Ns, then recompute all metrics.
    t_filt <- t_filt[grep(x = t_filt$Barcode, pattern = "N", invert = TRUE), ]
    t_filt <- add_rank_metrics(t_filt)
    fulldata_clean_list[[sample]] <- t_filt
  }
  dev.off()

  # Re-create fulldata from the per-sample filtered tables.
  # FIX: same single-sample NA-index bug as in the import loop.
  fulldata <- rbindlist(fulldata_clean_list[sampleids])
  saveRDS(fulldata, paste0(outfolder, "fulldata_post_curve_filt_objects.rds"))

  #### RUN FILTERING ACCORDING TO SHARING WITHIN GROUP ####
  if (isTRUE(removeMaxN)) {
    bc_by_group <- fulldata[, .(cnt = .N), by = c("Barcode", "Group")]
    bcode2remove <- NULL
    for (g in unique(annot$Group)) {
      maxn <- max(annot[annot$Group == g, ]$MaxN)
      print(paste0("Collecting barcodes present in more than ", maxn,
                   " samples in group ", g, " ..."))
      bcode2remove <- c(bcode2remove,
                        bc_by_group[bc_by_group$Group == g &
                                      bc_by_group$cnt > maxn, ]$Barcode)
      print(paste0("Number of barcodes identified: ", length(bcode2remove)))
    }
    print(paste0("Total number of barcodes to remove from analysis: ",
                 length(unique(bcode2remove))))
    removed_data <- fulldata[fulldata$Barcode %in% unique(bcode2remove), ]
    fulldata <- fulldata[!fulldata$Barcode %in% unique(bcode2remove), ]
    print(x = paste0("Prima stampa"))
    print(head(fulldata))
    # Recompute the per-sample metrics after removal.
    fulldata <- as.data.table(fulldata %>%
                                group_by(SampleID) %>%
                                mutate(Percentage = Counts / sum(Counts) * 100) %>%
                                mutate(Saturation = cumsum(Percentage)) %>%
                                mutate(Rank = row_number()) %>%
                                mutate(RankNorm = Rank / max(Rank)))
    print(paste0("Seconda stampa .. post-ricalcolo percentuali"))
    print(head(fulldata))
    write.xlsx(x = list(full = as.data.frame(removed_data)),
               file = paste0(tabledir, projectid, "_removed_data.xlsx"),
               overwrite = TRUE, asTable = TRUE, colWidths = "auto",
               firstRow = TRUE, rowNames = FALSE, headerStyle = hs)
  }

  #### Optional re-assignment of barcodes to their most expressing sample ####
  if (!is.null(bcodes2reassign)) {
    tmpdf_reass <- as.data.frame(bcodes2reassign)
    colnames(tmpdf_reass) <- c("bcodes2reassign")
    tmpdf_reass$SampleID <- names(bcodes2reassign)
    tmpdf_reass$SampleBC <- paste0(tmpdf_reass$SampleID, "_",
                                   tmpdf_reass$bcodes2reassign)
    fulldata$SampleBC <- paste0(fulldata$SampleID, "_", fulldata$Barcode)
    # Rows untouched by the reassignment ...
    fulldata_good <- fulldata[!fulldata$Barcode %in% bcodes2reassign, ]
    print("fulldata_good print - elements shared within groups or not shared between groups")
    print(dim(fulldata_good))
    # ... plus each reassigned barcode kept only in its designated sample.
    fulldata_reassigned <- fulldata[fulldata$SampleBC %in% tmpdf_reass$SampleBC, ]
    print("fulldata_reassigned print - elements to reassing")
    print(dim(fulldata_reassigned))
    fulldata2 <- rbind(fulldata_good, fulldata_reassigned)
    print(dim(fulldata2))
    fulldata2 <- fulldata2[order(factor(fulldata2$SampleID, levels = rev(sampleids)),
                                 fulldata2$Counts, decreasing = TRUE), ]
    fulldata2$SampleBC <- NULL
    fulldata2 <- as.data.table(fulldata2 %>%
                                 group_by(SampleID) %>%
                                 mutate(Percentage = Counts / sum(Counts) * 100) %>%
                                 mutate(Saturation = cumsum(Percentage)) %>%
                                 mutate(Rank = row_number()) %>%
                                 mutate(RankNorm = Rank / max(Rank)))
    print("Fulldata 2 print .....")
    fulldata <- fulldata2
    print(dim(fulldata))
    rm(fulldata2)
  }
  print(dim(fulldata))

  #### Collect data and stats after filtering ####
  filtered_objects <- datacharts(indata = fulldata, prefix = "filtered",
                                 plotdir = plotdir, projectid = projectid,
                                 mc = mc, hs = hs, tabledir = tabledir)

  #### Pairwise Jaccard / intersection / sharing matrices ####
  jaccard <- function(a, b) {
    intersection <- length(intersect(a, b))
    union <- length(a) + length(b) - intersection
    intersection / union
  }
  ids <- unique(fulldata$SampleID)
  jacc_df <- as.data.frame(matrix(0, nrow = length(ids), ncol = length(ids),
                                  dimnames = list(ids, ids)))
  intersect_df <- jacc_df
  sharing_df <- jacc_df
  for (k in ids) {
    bc_k <- fulldata[fulldata$SampleID == k]$Barcode
    for (j in ids) {
      bc_j <- fulldata[fulldata$SampleID == j]$Barcode
      jacc_df[k, j] <- jaccard(a = bc_k, b = bc_j)
      intersect_df[k, j] <- length(intersect(bc_k, bc_j))
      # % of k's (unique) barcodes found in j.
      sharing_df[k, j] <- length(intersect(bc_k, bc_j)) / length(unique(bc_k)) * 100
    }
  }
  all_counts_casted_binary <- as.matrix(filtered_objects$Binary)
  jacc_mt <- as.matrix(jacc_df)
  intersect_mt <- as.matrix(intersect_df)
  sharing_mt <- as.matrix(sharing_df)

  # Unique vs shared barcode counts per sample.
  # FIX: the original used nrow(as.data.frame(binary[which(...), ])) which
  # returns the number of SAMPLES (not 1) when exactly one barcode matches,
  # because single-row matrix indexing drops to a vector.
  unique_shared <- as.data.frame(matrix(nrow = length(sampleids), ncol = 4))
  colnames(unique_shared) <- c("Total", "Unique", "Shared", "Shared_Pct")
  rownames(unique_shared) <- colnames(all_counts_casted_binary)
  totals <- colSums(all_counts_casted_binary)
  for (i in seq_len(nrow(unique_shared))) {
    in_sample <- all_counts_casted_binary[, i] == 1
    in_others <- rowSums(all_counts_casted_binary[, -i, drop = FALSE]) > 0
    unique_shared$Total[i] <- totals[i]
    unique_shared$Unique[i] <- sum(in_sample & !in_others)
    unique_shared$Shared[i] <- sum(in_sample & in_others)
    unique_shared$Shared_Pct[i] <- unique_shared$Shared[i] / unique_shared$Total[i] * 100
  }

  # Heatmaps of the pairwise matrices.
  numsize <- if (nrow(sharing_mt) > 15) 4 else 10
  pheatmap::pheatmap(mat = intersect_mt, main = "Barcodes intersection",
                     display_numbers = TRUE, number_format = "%.0f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Intersection_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = sharing_mt, main = "Percent of sharing",
                     display_numbers = TRUE, number_format = "%.1f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Sharing_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
                     display_numbers = TRUE, number_format = "%.2f",
                     cluster_cols = FALSE, cluster_rows = FALSE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Jaccard_pairwise.pdf"),
                     width = 12, height = 12)
  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
                     display_numbers = TRUE, number_format = "%.2f",
                     cluster_cols = TRUE, cluster_rows = TRUE,
                     fontsize_number = numsize,
                     color = rev(brewer.pal(n = 11, name = "Spectral")),
                     number_color = "black",
                     filename = paste0(plotdir, projectid, "_Jaccard_pairwise_auto.pdf"),
                     width = 12, height = 12)

  #### Sharing between groups ####
  intersect_vectors <- list()
  intersect_vectors_ori <- list()
  sum.in.group <- fulldata[, .(cnt = .N), by = c("Barcode", "Group")]
  sum.in.group.melt <- as.data.frame(dcast(data = sum.in.group,
                                           formula = Barcode ~ Group,
                                           value.var = "cnt"))
  sum.in.group.melt[is.na(sum.in.group.melt)] <- 0
  rownames(sum.in.group.melt) <- sum.in.group.melt$Barcode
  sum.in.group.melt$Barcode <- NULL

  if (length(unique(sum.in.group$Group)) > 1) {
    # Build the design matrix of all group combinations involving >= 2 groups.
    l <- list()
    for (gr in unique(sum.in.group$Group)) {
      l[[gr]] <- c(0, 1)
    }
    j <- expand.grid(l)
    if (ncol(j) == 2) {
      j <- matrix(j[rowSums(j) == 2, ], nrow = 1)
    } else {
      j <- as.matrix(j[rowSums(j) > 1, ])
    }
    colnames(j) <- unique(sum.in.group$Group)
    for (i in seq_len(nrow(j))) {
      rownames(j)[i] <- paste(colnames(j)[which(j[i, ] == 1)], collapse = "_vs_")
    }

    splitdata <- split(sum.in.group, sum.in.group$Group)
    for (i in names(splitdata)) {
      splitdata[[i]] <- splitdata[[i]]$Barcode
    }

    ###### UPSET ######
    upsetr.input <- fromList(splitdata)
    rownames(upsetr.input) <- unique(unlist(splitdata))
    upsetr.input <- upsetr.input[, colnames(j)]

    # For each combination, collect the barcodes whose presence/absence
    # pattern matches that combination exactly.
    for (comparison in rownames(j)) {
      matching.vect <- NULL
      for (n in seq_len(nrow(upsetr.input))) {
        if (identical(as.character(upsetr.input[n, ]),
                      as.character(j[comparison, ]))) {
          matching.vect <- c(matching.vect, n)
        }
      }
      bcodes <- rownames(upsetr.input[matching.vect, ])
      intersect_vectors[[comparison]] <- sum.in.group.melt[bcodes, ]
    }

    # Barcode counts per sample, by group.
    # FIX: the original emitted this pdf twice with the same filename; only
    # the second (fill + points) version survived, so only that one is kept.
    counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
    counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot,
                                               sort = FALSE, all.x = TRUE)
    counts_by_sample_annot$Cutoff <- NULL
    pdf(file = paste0(plotdir, projectid, "_BoxPlot_counts.pdf"),
        width = 12, height = 9)
    print(ggplot(counts_by_sample_annot,
                 aes(x = Group, y = count, fill = Group)) +
            geom_boxplot() + geom_point())
    dev.off()

    # UpSet plot (only meaningful with >= 2 groups).
    pdf(file = paste0(plotdir, projectid, "_UpSetR_by_groups.pdf"),
        width = 18, height = 9)
    print(upset(upsetr.input, order.by = "freq",
                sets = rev(colnames(upsetr.input)),
                keep.order = TRUE, nintersects = NA))
    dev.off()

    # Excel sheet names are limited to 31 chars: long comparison labels are
    # reversed and truncated to 30 chars (reversal keeps the distinctive
    # suffix of the label).
    intersect_vectors_ori <- intersect_vectors
    for (i in seq_along(intersect_vectors)) {
      if (nchar(names(intersect_vectors)[i]) > 30) {
        names(intersect_vectors)[i] <-
          str_sub(string = stri_reverse(names(intersect_vectors)[i]),
                  start = 1, end = 30)
      }
    }
    # FIX: isTRUE(duplicated(...)) was always FALSE for >1 element, so the
    # duplicate-sheet guard never fired.
    if (any(duplicated(names(intersect_vectors)))) {
      stop("Duplicated sheet names after truncating group-comparison labels")
    }
    write.xlsx(x = intersect_vectors,
               file = paste0(tabledir, projectid, "_SharingByGroup.xlsx"),
               overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, headerStyle = hs)
    write.xlsx(x = sum.in.group.melt,
               file = paste0(tabledir, projectid, "_AggregationByGroup.xlsx"),
               overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
               colWidths = "auto", firstRow = TRUE, headerStyle = hs)
  }

  write.xlsx(x = list(Jaccard = as.data.frame(jacc_mt),
                      Intersection = as.data.frame(intersect_mt),
                      Sharing = as.data.frame(sharing_mt),
                      Unique_and_shared = as.data.frame(unique_shared)),
             file = paste0(tabledir, projectid, "_Sharing.xlsx"),
             overwrite = TRUE, asTable = TRUE, rowNames = TRUE,
             colWidths = "auto", firstRow = TRUE, headerStyle = hs)

  #### Shared barcodes within the group (only if N groups == 1) ####
  if (length(unique(fulldata$Group)) == 1) {
    shared_barcodes_list <- list()
    sharing_file <- paste0(tabledir, "Sharing_", projectid, ".txt")
    # FIX: guard file.remove to avoid a warning when the file does not exist.
    if (file.exists(sharing_file)) {
      file.remove(sharing_file)
    }
    for (i in unique(fulldata$SampleID)) {
      shared_barcodes_list[[i]] <- intersect(fulldata[fulldata$SampleID == i, ]$Barcode,
                                             fulldata[fulldata$SampleID != i, ]$Barcode)
      cur.pct <- length(shared_barcodes_list[[i]]) /
        length(fulldata[fulldata$SampleID == i, ]$Barcode)
      write(x = paste0("Sharing ", i, " vs all: ", round(cur.pct * 100, 2), "%"),
            file = sharing_file, append = TRUE)
    }
    shared_barcodes <- unique(unlist(shared_barcodes_list))
    cur.pct <- length(shared_barcodes) / length(unique(fulldata$Barcode))
    write(x = paste0("Sharing ", "within ", " group: ",
                     round(cur.pct * 100, 2), "%"),
          file = sharing_file, append = TRUE)
  }

  #### Circos plot of pairwise sharing (needs > 2 samples) ####
  circosdata <- NULL
  if (length(sampleids) > 2) {
    circosdata <- as.data.frame(sharing_mt)
    diag(circosdata) <- 0  # suppress self-sharing links
    circosdata$SampleID <- rownames(circosdata)
    mydata_long <- melt(circosdata, id.vars = "SampleID")
    colnames(mydata_long) <- c("rowname", "key", "value")
    pdf(file = paste0(plotdir, projectid, "_Circos_by_sampleIDs.pdf"),
        width = 16, height = 16)
    chordDiagram(mydata_long,
                 grid.col = colorRampPalette(brewer.pal(12, "Paired"))(
                   length(unique(mydata_long$rowname))))
    dev.off()
  }

  #### Final bundle of all objects ####
  saveRDS(object = list(fulldata = fulldata,
                        counts_abund_matrices = filtered_objects,
                        intersect_mt = intersect_mt,
                        sharing_mt = sharing_mt,
                        jacc_mt = jacc_mt,
                        aggregationbygroup = sum.in.group,
                        aggregationbygroup_melt = sum.in.group.melt,
                        sharingbygroup = intersect_vectors_ori,
                        sampleinfo = annot,
                        raw_objects = raw_objects,
                        circosdata = circosdata),
          file = paste0(outfolder, projectid, "_objects.rds"))
}