Script barcoding analysis

0cb7bdea · Matteo Barcella · 6d9d4857 · 0cb7bdea · 0cb7bdea · 0cb7bdea
Commit 0cb7bdea authored Aug 05, 2025 by Matteo Barcella
--- a/ChartingBCs_Light.R
+++ b/ChartingBCs_Light.R
+
+chartingBCs_light <- function(bseq.fold, 
+                        sampleids, # SampleIDs vector to include in the analysis
+                        ed = 2, # edit distance
+                        mc = 3, # minimum counts
+                        bcodes2reassign = NULL, # vector of barcodes to be removed [is applied to each sample] : if null no barcodes are removed at all
+                        samplesheet = NULL, # columns: sampleid,group,custom_threshold (rank to cut)
+                        do.filter = TRUE, # TRUE/FALSE to apply filtering or not
+                        projectid = "Exp2020_06_MMs", # prefix to files in output folder
+                        outfolder, # Output folder
+                        counts_vector = c(10,20,30,50,100,200), # vector with counts threshold for heatmap (all barcodes with n > cut-off in at least 1 sample)
+                        pcts_vector = c(0.001,0.01,0.1,1,2,3,5,10,20), # vector with pct threshold for heatmap (all barcodes with % > cut-off in at least 1 sample)
+                        custom.cut = FALSE, # TRUE/FALSE to apply filtering using custom_theshold / automatic
+                        method = "MaxSat", # Possible values: MaxSat / MinPct 
+                        MinlogCPMFilter = FALSE, # with custom.cut TRUE and CutOff > 0;  if FALSE: filter rank based, if TRUE: logCPM filter based
+                        removeMaxN = TRUE # Performing removal of barcodes present in at least 2 samples within group
+                        ){
+  
+  # Script for analyzing barcodes
+  
+  # AIM:
+
+  # retrieve raw stats
+  # import data and calculate CPMs for each sample + other stats
+  # define threshold for each sampleID (with all barcodes including the ones with Ns)
+  # subset barcodes for each sample according to sd1_log rank value or custom specified thresholds
+  # rerun calculation of CPMs and abundances
+  # create plots including:
+  # 
+  # circosplots
+  # Boxplots
+  
+  library(data.table)
+  library(reshape2)
+  library(openxlsx)
+  library(ComplexHeatmap)
+  library(gridExtra)
+  library(stringr)
+  library(dplyr)
+  library(RColorBrewer)
+  library(edgeR)
+  library(ggplot2)
+  library(ggalluvial)
+  library(circlize)
+  library(chorddiag)
+  library(htmlwidgets)
+  library(UpSetR)
+  library(stringi)
+  
+#### GLOBAL SETTINGS ####
+ 
+  #ht_opt(RESET = TRUE)
+ 
+  # Header style for excel outputs
+  
+  hs <- createStyle(
+    textDecoration = "BOLD", fgFill = "#1D8348",
+  )
+  
+  # Setting scientific option
+  
+  options(scipen = 999)
+  
+  
+  # Output folder
+  if(!is.null(outfolder)){
+    outfolder <- paste0(outfolder, "/")
+    dir.create(outfolder, recursive = T, showWarnings = F)
+  }else{
+    stop("Please set the output folder ..")
+  }
+  
+  tabledir <- paste(outfolder,"Tables/", sep = "/")
+  dir.create(tabledir, recursive = T, showWarnings = F)
+  plotdir <- paste(outfolder,"Plots/", sep = "/")
+  dir.create(plotdir, recursive = T, showWarnings = F)
+ 
+  # Retrieve group info from samplesheet file
+  
+  annot <- read.table(file = samplesheet, header = T, stringsAsFactors = F)
+  annot <- subset(x = annot, SampleID %in% sampleids)
+  
+  print(sampleids)
+  
+  sampleids <- annot$SampleID
+  
+  print(sampleids)
+
+
+#### FUNCTIONS ####
+  
+  datacharts <- function(indata, prefix = "raw", plotdir, projectid = "project", mc, hs, tabledir){
+    
+    fulldata_tmp <- indata
+    
+    #### Count distribution pdf file creation ####
+    
+    pdf(file = paste0(plotdir, projectid,"_",prefix,"_distribution_counts.pdf"), width = 12, height = 9)
+    
+    myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, color = SampleID)) + 
+      geom_line() + ggtitle("Rank by SampleID")
+    plot(myplot)
+    
+    myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, color = SampleID)) + 
+      geom_line() + facet_wrap( . ~ SampleID) + ggtitle("Rank by SampleID - split")
+    plot(myplot)
+    
+    myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, color = SampleID)) + 
+      geom_line() + ggtitle("Normalized Rank by SampleID") 
+    plot(myplot)
+    
+    myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, color = SampleID)) + 
+      geom_line() + facet_wrap( . ~ SampleID)  + ggtitle("Normalized by SampleID - split")
+    plot(myplot)
+    
+    if(length(unique(fulldata_tmp$Group)) > 1){
+      myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, group = SampleID, color = Group)) + 
+        geom_line() + ggtitle("Rank by Group")
+      plot(myplot)
+      
+      myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, group = SampleID, color = Group)) + 
+        geom_line(aes(color = Group)) + facet_wrap( . ~ Group) + ggtitle("Rank by Group - split")
+      plot(myplot)
+      
+      myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, group = SampleID, color = Group)) + 
+        geom_line(aes(color = Group)) + ggtitle("Normalized Rank by Group") 
+      plot(myplot)
+      
+      myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, group = SampleID, color = Group)) + 
+        geom_line(aes(color = Group)) + facet_wrap( . ~ Group) + ggtitle("Normalized by Group - split")
+      plot(myplot)
+    }
+    
+    dev.off()
+    
+    # REMOVING temporal data
+    
+ #   rm(indata)
+    
+    #### Collect raw stats ####
+    
+    # filter fulldata according to min.counts (mc) and collecting number of barcodes by sampleID
+    
+    BarcodeCounts <- fulldata_tmp[Counts >= mc, .(nBarcodes = .N), by = SampleID ]
+    colnames(BarcodeCounts) <- c("SampleID","nBarcodes")
+    
+    pdf(paste0(plotdir, projectid,"_",prefix,"_counts_by_coverage.pdf"), width = 12, height = 9)
+    plot(ggplot(data = BarcodeCounts, mapping = aes(x = SampleID, y = nBarcodes, fill = SampleID)) + 
+           geom_bar(stat = "identity") + ggtitle(paste0("Minimun read counts: ", mc)) + 
+           theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
+    )
+    colnames(BarcodeCounts) <- c("SampleID",paste0("nBarcodes_",mc))
+    
+    for(i in c(5,10,20,50,80,100,200,500,1000,1500,2000,3000,5000,10000)){
+      tmp <- fulldata_tmp[Counts >= i, .(nBarcodes = .N), by = SampleID ]
+      colnames(tmp) <- c("SampleID","nBarcodes")
+      plot(ggplot(data = tmp, mapping = aes(x = SampleID, y = nBarcodes, fill = SampleID)) + 
+             geom_bar(stat = "identity") + ggtitle(paste0("Minimun read counts: ", i)) +
+             theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) 
+      )
+      colnames(tmp) <- c("SampleID",paste0("nBarcodes_",i))
+      BarcodeCounts <- merge(BarcodeCounts,tmp, by = "SampleID")
+    }
+    
+    dev.off() # closing multichart
+    
+    BarcodeCounts <- as.data.frame(t(BarcodeCounts))
+    colnames(BarcodeCounts) <- BarcodeCounts[1,]
+    cols_back <- colnames(BarcodeCounts)
+    rows_back <- rownames(BarcodeCounts)
+    BarcodeCounts <- as.data.frame(BarcodeCounts[2:nrow(BarcodeCounts),])
+    colnames(BarcodeCounts) <- cols_back
+    rownames(BarcodeCounts) <- rows_back[2:length(rows_back)]
+    BarcodeCounts$MinCount <- rownames(BarcodeCounts)
+    BarcodeCounts <- BarcodeCounts[,c("MinCount",colnames(BarcodeCounts)[1:(ncol(BarcodeCounts)-1)])]
+    
+    write.xlsx(x = BarcodeCounts, file = paste0(tabledir, projectid, "_",prefix,"_N.barcodes_by_min_counts.xlsx"), 
+               asTable = T, colWidths = "auto", firstRow = T, rowNames = F, headerStyle = hs)
+    
+    # Casting and transform to barcode x sampleid matrix
+    
+    all_counts_casted <- dcast(fulldata_tmp, formula = Barcode ~ SampleID, value.var = 'Counts')
+    all_counts_casted <- as.data.frame(all_counts_casted)
+    all_counts_casted[is.na(all_counts_casted)] <- 0 # replace NAs with 0
+    rownames(all_counts_casted) <- all_counts_casted$Barcode
+    all_counts_casted$Barcode <- NULL
+    
+    # Create binary version
+    all_counts_casted_binary <- all_counts_casted
+    all_counts_casted_binary[all_counts_casted_binary > 0] <- 1
+    all_counts_casted_binary <- as.matrix(all_counts_casted_binary)
+    
+    # matrix of abundances in pct
+    percent_abundance <- prop.table(x = as.matrix(all_counts_casted), margin = 2)*100 # create % abundances matrix
+    percent_abundance_melt <- melt(percent_abundance)
+    percent_abundance <- as.data.frame(percent_abundance)
+    
+    # matrix in CPM
+    CPM <- as.data.frame(cpm(y = all_counts_casted))
+    
+    # matrix in logCPM
+    logCPM <- log1p(CPM)
+    
+    #### save list of raw matrices in xlsx format ####
+    
+    write.xlsx(x = list(Raw = as.data.frame(all_counts_casted),
+                        Binary = as.data.frame(all_counts_casted_binary),
+                        CPM = as.data.frame(CPM), 
+                        logCPM = as.data.frame(logCPM),
+                        pct_abund = percent_abundance),
+               file = paste0(tabledir,projectid,"_",prefix,"_counts_matrices.xlsx"), asTable = T,overwrite = T,
+               rowNames = T, colWidths = "auto", firstRow = T, firstCol = T, headerStyle = hs)
+    
+    write.xlsx(x = list(full = as.data.frame(fulldata_tmp)), file = paste0(tabledir,projectid,"_",prefix,"_fulldata.xlsx"),
+               overwrite = T,
+               asTable = T, colWidths = "auto", 
+               firstRow = T, rowNames = F,  headerStyle = hs)
+    
+    return(list(Raw = as.data.frame(all_counts_casted),
+                Binary = as.data.frame(all_counts_casted_binary),
+                CPM = as.data.frame(CPM), 
+                logCPM = as.data.frame(logCPM),
+                pct_abund = percent_abundance))
+  }
+
+#### IMPORTING RAW DATA ####
+  
+  fulldata <- fread(input = paste0(bseq.fold,"/",sampleids[1],"/",sampleids[1],"_edit_",ed,".barcode.mc",mc,".tsv"))
+  #print(paste0("Before removal of unwanted barcodes : ", nrow(fulldata)))
+  fulldata$SampleID <- sampleids[1]
+  colnames(fulldata) <- c("Barcode","Counts","SaturationOri","SampleID")
+  fulldata$Rank <- as.numeric(rownames(fulldata))
+  fulldata$RankNorm <- fulldata$Rank / nrow(fulldata)
+  fulldata$Percentage <- fulldata$Counts / sum(fulldata$Counts) * 100
+  fulldata$Saturation <- cumsum(fulldata$Percentage)
+  fulldata$logCPM <- log1p(fulldata$Counts / sum(fulldata$Counts)*1000000)
+  
+  for(id in sampleids[2:length(sampleids)]){
+    indata <- fread(input = paste0(bseq.fold,"/",id,"/",id,"_edit_",ed,".barcode.mc",mc,".tsv"))
+    indata$SampleID <- id
+    colnames(indata) <- c("Barcode","Counts","SaturationOri","SampleID")
+    indata$Rank <- as.numeric(rownames(indata))
+    indata$RankNorm <- indata$Rank / nrow(indata)
+    indata$Percentage <- indata$Counts / sum(indata$Counts) * 100
+    indata$Saturation <- cumsum(indata$Percentage)
+    indata$logCPM <- log1p(indata$Counts / sum(indata$Counts)*1000000)
+    fulldata <- rbind(fulldata, indata)
+  }
+
+  
+  # Adding info from samplesheet
+  fulldata <- data.table(merge.data.frame(x = fulldata, y = annot, by = "SampleID", all.x = T, sort = F))
+  
+  ## Adding new columns to final full data including:
+  # Pct_x_VCN = Percentage * VCN
+  # Pct_x_Engraft
+  
+  fulldata$Pct_x_EngraftWBM <- fulldata$Percentage * (fulldata$Engraftment/100) * (fulldata$PctNGFR/100)
+  
+  raw_objects <- datacharts(indata = fulldata, prefix = "raw", plotdir = plotdir, projectid = projectid, mc = mc , hs = hs, tabledir = tabledir)
+  raw_objects$fulldata <- fulldata
+  
+  
+#### DERIVATES ####
+
+  fulldata_clean_list <- list()
+  
+  deriv <- function(x, y) diff(y) / diff(x)
+  middle_pts <- function(x) x[-1] - diff(x) / 2
+  
+  combined_plots_p1 <- list()
+  combined_plots_p2 <- list()
+  cut_thresh <- list()
+  
+  for (sample in unique(fulldata$SampleID)) {
+
+    t <- subset(fulldata, SampleID == sample) # subsetting specific sample
+    d1log <- deriv(log10(t$Rank), log10(t$Counts))
+    sd1_log <- which.min(diff(d1log)) + 1
+    
+    d1logCPM <- deriv(t$Rank, t$logCPM)
+    sd1_logCPM <- which.min(diff(d1logCPM)) + 1
+    
+    prova <- deriv(log10(t$Rank), t$logCPM)
+    sd1_prova <- which.min(diff(prova)) + 1
+    
+    cut_thresh[[sample]] <-  t$Rank[sd1_log]
+    
+  }
+
+
+#### PERFORM CUT ####    
+  
+  pdf(file = paste0(plotdir, projectid,"_raw_barcode_distribution_cut_offs.pdf"), width = 12, height = 6)
+  
+#  for (sample in unique(fulldata$SampleID)) {
+   for (sample in annot$SampleID) {
+        
+    t <- subset(fulldata, SampleID == sample)
+    
+    # removing barcodes ACCORDING TO threshold
+      
+    mythresh <- ifelse(isTRUE(custom.cut), yes = annot[annot$SampleID == sample,]$Cutoff, no = cut_thresh[[sample]])
+      
+    if(isTRUE(do.filter)){
+      if(mythresh > 1){
+        if(isTRUE(MinlogCPMFilter)){
+          print("Going with MinCPM filter ...")
+          t_filt <-  t[t$logCPM > mythresh,]
+          
+          p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs logCPM: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$logCPM > mythresh,])], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p1[[sample]] <- p1 
+          
+          p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs Counts: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$logCPM > mythresh,])], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p2[[sample]] <- p2
+          print(grid.arrange(p1, p2, ncol = 2))
+          
+        }else{
+          print("Going with rank based filter or auto according to custom.cut parameter...")
+          t_filt <-  t[t$Rank < mythresh,]
+          
+          p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs logCPM: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[mythresh], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p1[[sample]] <- p1 
+          
+          p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs Counts: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[mythresh], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p2[[sample]] <- p2
+          print(grid.arrange(p1, p2, ncol = 2))
+        }
+      }else{
+        if(method == "MinPct"){
+          print("Going with percentage filter ...")
+          #t_filt <- t[t$Percentage > mythresh,]
+          t_filt <- subset(t, Percentage > mythresh)
+          
+          p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs logCPM: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$Percentage > mythresh,])], color = "#3a93bd",show.legend = T)
+          
+          
+          combined_plots_p1[[sample]] <- p1 
+          
+          p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs Counts: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$Percentage > mythresh,])], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p2[[sample]] <- p2
+          print(grid.arrange(p1, p2, ncol = 2))
+          
+        }
+        else if(method == "MaxSat"){
+          print("Going with saturation filter ...")
+          t_filt <- t[t$Saturation < (mythresh*100),]
+          
+          #t_filt <- subset(t, subset = NewAbund < (mythresh * 100))
+          
+          p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs logCPM: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$Saturation < (mythresh*100),])], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p1[[sample]] <- p1 
+          
+          p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
+            theme_bw() +
+            theme() +
+            ggtitle(paste0("Rank vs Counts: ", sample)) +
+            geom_point() +
+            geom_line() +
+            geom_vline(xintercept = t$Rank[nrow(t[t$Saturation < mythresh*100,])], color = "#3a93bd",show.legend = T)
+          
+          combined_plots_p2[[sample]] <- p2
+          print(grid.arrange(p1, p2, ncol = 2))
+          
+        }
+        else{
+          stop("Wrong method !!")
+        }
+      }
+    }else{
+      t_filt <- t
+    }
+      
+    # removing barcodes with Ns 
+      
+    t_filt <- t_filt[grep(x = t_filt$Barcode, pattern = "N", invert = T),]
+
+    # Re-calculate metrics
+      
+    t_filt$Rank <- as.numeric(rownames(t_filt))
+    t_filt$RankNorm <- t_filt$Rank / nrow(t_filt)
+    t_filt$Percentage <- t_filt$Counts / sum(t_filt$Counts) * 100
+    t_filt$Saturation <- cumsum(t_filt$Percentage)
+    t_filt$logCPM <- log1p(t_filt$Counts / sum(t_filt$Counts)*1000000)
+      
+    fulldata_clean_list[[sample]] <- t_filt
+      
+  }
+  
+  dev.off()
+
+  # Re-create fulldata and run the charts
+  
+  fulldata <- NULL
+  fulldata <- fulldata_clean_list[[sampleids[1]]]
+  for(id in sampleids[2:length(sampleids)]){
+    fulldata <- rbind(fulldata, fulldata_clean_list[[id]])
+  }
+  
+  saveRDS(fulldata, paste0(outfolder, "fulldata_post_curve_filt_objects.rds"))
+  
+  #### RUN FILTERING ACCORDING TO SHARING WITHIN GROUP ####
+  
+  if(isTRUE(removeMaxN)){
+    bc_by_group <- fulldata[, .(cnt=.N) , by = c("Barcode","Group")] 
+    
+    bcode2remove <- NULL
+    
+    for (g in unique(annot$Group)) {
+      print(paste0("Collecting barcodes present in more than ", max(annot[annot$Group == g,]$MaxN), " samples in group ", g, " ..."))
+      bcode2remove <- c(bcode2remove,bc_by_group[bc_by_group$Group == g & bc_by_group$cnt > max(annot[annot$Group == g,]$MaxN),]$Barcode)
+      print(paste0("Number of barcodes identified:  ", length(bcode2remove)))
+    }
+    
+    print(paste0("Total number of barcodes to remove from analysis: ", length(unique(bcode2remove))))
+    removed_data <- fulldata[fulldata$Barcode %in% unique(bcode2remove),]
+    fulldata <- fulldata[!fulldata$Barcode %in% unique(bcode2remove),]
+    
+    print(x = paste0("Prima stampa"))
+    print(head(fulldata))
+    
+    fulldata <- as.data.table(fulldata %>% group_by(SampleID) %>% 
+                                mutate(Percentage = Counts / sum(Counts) * 100) %>%
+                                mutate(Saturation = cumsum(Percentage)) %>% 
+                                mutate(Rank = row_number()) %>% 
+                                mutate(RankNorm = Rank/max(Rank)))
+    
+    print(paste0("Seconda stampa .. post-ricalcolo percentuali"))
+    print(head(fulldata))
+    
+    write.xlsx(x = list(full = as.data.frame(removed_data)), file = paste0(tabledir,projectid,"_removed_data.xlsx"),
+               overwrite = T,
+               asTable = T, colWidths = "auto", firstRow = T, rowNames = F,  headerStyle = hs)
+  }
+
+    # Apply eventual re-assignment of barcodes to most expressing sample - according to the bcodes2reassign vector #
+  
+  #saveRDS(fulldata, file = paste0(projectid,"_fulldata_tmp.rds"))
+  
+  if(!is.null(bcodes2reassign)){
+    
+    tmpdf_reass <- as.data.frame(bcodes2reassign)
+    colnames(tmpdf_reass) <- c("bcodes2reassign")
+    tmpdf_reass$SampleID <- names(bcodes2reassign)
+    tmpdf_reass$SampleBC <- paste0(tmpdf_reass$SampleID,"_", tmpdf_reass$bcodes2reassign)
+    
+    fulldata$SampleBC <- paste0(fulldata$SampleID, "_", fulldata$Barcode)
+    fulldata_good <- fulldata[!fulldata$Barcode %in% bcodes2reassign, ]
+    print("fulldata_good print - elements shared within groups or not shared between groups")
+    print(dim(fulldata_good))
+    fulldata_reassigned <- fulldata[fulldata$SampleBC %in% tmpdf_reass$SampleBC, ]
+    print("fulldata_reassigned print - elements to reassing")
+    print(dim(fulldata_reassigned))
+    fulldata2 <- rbind(fulldata_good, fulldata_reassigned)
+    print(dim(fulldata2))
+    fulldata2 <- fulldata2[order(factor(fulldata2$SampleID, levels=rev(sampleids)), 
+                    fulldata2$Counts, decreasing = T
+                    ),]
+    fulldata2$SampleBC <- NULL
+    
+    fulldata2 <- as.data.table(fulldata2 %>% group_by(SampleID) %>% 
+                                mutate(Percentage = Counts / sum(Counts) * 100) %>% 
+                                mutate(Saturation = cumsum(Percentage)) %>% 
+                                mutate(Rank = row_number()) %>% 
+                                mutate(RankNorm = Rank/max(Rank)))
+    
+    print("Fulldata 2 print .....")
+    #head(fulldata2)
+    
+    # fulldata2 <- as.data.frame(fulldata2 %>% group_by(SampleID) %>% 
+    #                              mutate(Percentage_new = Counts / sum(Counts) * 100) %>% 
+    #                              mutate(Saturation_new = cumsum(Percentage_new))) %>% 
+    #   mutate(Rank_new = row_number()) %>% 
+    #   mutate(RankNorm_new = Rank_new/max(Rank_new))
+    dim(fulldata2)
+    fulldata <- fulldata2
+    print(dim(fulldata))
+    rm(fulldata2)
+  }
+  
+  print(dim(fulldata))
+  
+  #### Collect data and stats after filtering ####
+  
+  filtered_objects <- datacharts(indata = fulldata, prefix = "filtered", plotdir = plotdir, projectid = projectid, mc = mc, hs = hs, tabledir = tabledir)
+  
+  #saveRDS(fulldata, file = paste0(projectid,"_fulldata_tmp2.rds"))
+  
+   # Creating jaccard matrix
+  
+  jaccard <- function(a, b) {
+      intersection = length(intersect(a, b))
+      union = length(a) + length(b) - intersection
+      return (intersection/union)
+  }  
+  
+  jacc_mt <- matrix(0, nrow = length(unique(fulldata$SampleID)),
+                     ncol = length(unique(fulldata$SampleID)))
+  
+  rownames(jacc_mt) <- unique(fulldata$SampleID)
+  colnames(jacc_mt) <- rownames(jacc_mt)
+  jacc_df <- as.data.frame(jacc_mt)
+  intersect_df <- jacc_df
+  sharing_df <- jacc_df
+  
+  for (k in unique(fulldata$SampleID)) {
+    for (j in unique(fulldata$SampleID)){
+      jacc_df[k,j] <- jaccard(a = fulldata[fulldata$SampleID == k]$Barcode, b = fulldata[fulldata$SampleID == j]$Barcode)
+      intersect_df[k,j] <- length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == j]$Barcode))
+      sharing_df[k,j] <- length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == j]$Barcode)) / length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == k]$Barcode)) * 100
+    }
+  }
+  
+  all_counts_casted_binary <- as.matrix(filtered_objects$Binary)
+  #print(head(all_counts_casted_binary))
+
+  jacc_mt <- as.matrix(jacc_df)
+  intersect_mt <- as.matrix(intersect_df)
+  sharing_mt <- as.matrix(sharing_df)
+
+  unique_shared <- as.data.frame(matrix(nrow=length(sampleids),ncol=4))
+  colnames(unique_shared) <- c("Total","Unique","Shared","Shared_Pct")
+  rownames(unique_shared) <- colnames(x = all_counts_casted_binary)
+  
+  for(i in 1:nrow(unique_shared)){
+    
+    unique_shared$Total[i] <- colSums(all_counts_casted_binary)[i]
+    a <- nrow(as.data.frame(all_counts_casted_binary[which(all_counts_casted_binary[,i] == 1 & rowSums(as.data.frame(all_counts_casted_binary[,colnames(all_counts_casted_binary)[-i]])) == 0),]))
+   # print(a)
+    unique_shared$Unique[i] <- a
+    b <- nrow(as.data.frame(all_counts_casted_binary[which(all_counts_casted_binary[,i] == 1 & rowSums(as.data.frame(all_counts_casted_binary[,colnames(all_counts_casted_binary)[-i]])) > 0),]))
+  #  print(b)
+    unique_shared$Shared[i] <- b
+    unique_shared$Shared_Pct[i] <- unique_shared$Shared[i]/unique_shared$Total[i] * 100
+  }
+  
+  pheatmap::pheatmap(mat = intersect_mt, main = "Barcodes intersection",
+                     display_numbers = T,number_format = "%.0f",
+                     cluster_cols = F, cluster_rows = F, 
+                     color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
+                     filename = paste0(plotdir, projectid,"_Intersection_pairwise.pdf"), width = 12, height = 12)
+  
+  numsize = ifelse(nrow(sharing_mt) > 15, 4, 10)
+  
+  pheatmap::pheatmap(mat = sharing_mt, main = "Percent of sharing",
+                     display_numbers = T,number_format = "%.1f",
+                     cluster_cols = F, cluster_rows = F, fontsize_number = numsize,
+                     color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
+                     filename = paste0(plotdir, projectid,"_Sharing_pairwise.pdf"), 
+                     width = 12, height = 12)
+  
+  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
+                     display_numbers = T,number_format = "%.2f",
+                     cluster_cols = F, cluster_rows = F, fontsize_number = numsize,
+                     color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
+                     filename = paste0(plotdir, projectid,"_Jaccard_pairwise.pdf"), 
+                     width = 12, height = 12)
+  
+  pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
+                     display_numbers = T,number_format = "%.2f",
+                     cluster_cols = T, cluster_rows = T, fontsize_number = numsize,
+                     color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
+                     filename = paste0(plotdir, projectid,"_Jaccard_pairwise_auto.pdf"), 
+                     width = 12, height = 12)
+  
+
+  # Sharing between groups present in the matrix
+
+  intersect_vectors <- list()
+  intersect_vectors_ori <- list()
+  #saveRDS(fulldata, file = "Fulldata.rds")
+  
+  sum.in.group <- fulldata[, .(cnt=.N) , by = c("Barcode","Group")] 
+  
+  sum.in.group.melt <- as.data.frame(dcast(data = sum.in.group, formula = Barcode ~ Group, value.var = "cnt"))
+  sum.in.group.melt[is.na(sum.in.group.melt)] <- 0
+  rownames(sum.in.group.melt) <- sum.in.group.melt$Barcode
+  sum.in.group.melt$Barcode <- NULL
+  
+  if(length(unique(sum.in.group$Group)) > 1){
+  
+  l <- list()
+  
+  for (gr in unique(sum.in.group$Group)) {
+    l[[gr]] <- c(0,1)
+  }
+  
+  j <- expand.grid(l)
+  if(ncol(j) == 2){
+    j <- matrix(j[rowSums(j) == 2,], nrow=1)
+  }else{
+    j <- as.matrix(j[rowSums(j) > 1,])
+  }
+  
+  colnames(j) <- unique(sum.in.group$Group)
+  
+  for(i in 1:nrow(j)){
+    rownames(j)[i] <- paste(x = colnames(j)[which(j[i,] == 1)],collapse = "_vs_")
+  }
+
+  splitdata <- split(sum.in.group, sum.in.group$Group)
+  for(i in names(splitdata)){
+    splitdata[[i]] <- splitdata[[i]]$Barcode
+  }
+  
+  ###### UPSET ######
+  
+  upsetr.input <- fromList(splitdata)
+  rownames(upsetr.input) <- unique(unlist(splitdata))
+  upsetr.input <- upsetr.input[,colnames(j)]
+  
+  for(comparison in rownames(j)){
+    matching.vect <- NULL
+    for(n in 1:nrow(upsetr.input)){
+      
+      a <- as.character(upsetr.input[n,])
+      b <- as.character(j[comparison,])
+      
+      if(identical(a,b)){
+        matching.vect <- c(matching.vect , n)
+      }
+    }
+    bcodes <- rownames(upsetr.input[matching.vect,])
+    intersect_vectors[[comparison]] <- sum.in.group.melt[bcodes,]
+  }
+  
+  counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
+  counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot, sort = F, all.x = T)
+  counts_by_sample_annot$Cutoff <- NULL
+  
+  pdf(file = paste0(plotdir, projectid,"_BoxPlot_counts.pdf"), width = 12, height = 9)
+  print(ggplot(data = counts_by_sample_annot, mapping = aes(x = Group, y = count)) + geom_boxplot())
+  dev.off()
+  
+  counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
+  counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot, sort = F, all.x = T)
+  counts_by_sample_annot$Cutoff <- NULL
+  
+  pdf(file = paste0(plotdir, projectid,"_BoxPlot_counts.pdf"), width = 12, height = 9)
+  print(ggplot(data = counts_by_sample_annot, mapping = aes(x = Group, y = count, fill = Group)) + geom_boxplot() + geom_point())
+  dev.off()
+  
+  # UpSet R (only if groups >= 2)
+  
+  pdf(file = paste0(plotdir, projectid,"_UpSetR_by_groups.pdf"), width = 18, height = 9)
+  print(upset(upsetr.input, order.by = "freq", sets = rev(colnames(upsetr.input)), keep.order = TRUE, nintersects = NA))
+  dev.off()
+  
+  #saveRDS(intersect_vectors, "intersect_vectors.rds")
+  intersect_vectors_ori <- intersect_vectors
+  
+  for(i in 1:length(names(intersect_vectors))){
+    if(nchar(names(intersect_vectors)[i]) > 30){
+      newname <- str_sub(string = stri_reverse(names(intersect_vectors)[i]), start = 1, end = 30)
+      names(intersect_vectors)[i] <- newname
+    }
+    if(isTRUE(duplicated(names(intersect_vectors)))){
+      stop()
+    }
+  }
+  
+  write.xlsx(x = intersect_vectors, file = paste0(tabledir, projectid,"_SharingByGroup.xlsx"), overwrite = T,
+             asTable = T, rowNames = T, colWidths = "auto", firstRow = T,  headerStyle = hs)
+  
+  write.xlsx(x = sum.in.group.melt, file = paste0(tabledir, projectid,"_AggregationByGroup.xlsx"), overwrite = T,
+  asTable = T, rowNames = T, colWidths = "auto", firstRow = T,  headerStyle = hs)
+  
+  }
+  
+  write.xlsx(x = list(Jaccard = as.data.frame(jacc_mt),
+                      Intersection = as.data.frame(intersect_mt), 
+                      Sharing = as.data.frame(sharing_mt),
+                      Unique_and_shared = as.data.frame(unique_shared)
+                      ), 
+             file = paste0(tabledir, projectid,"_Sharing.xlsx"), overwrite = T,
+             asTable = T, rowNames = T, colWidths = "auto", firstRow = T,  headerStyle = hs)
+
+  # Shared barcodes in the group (only if N° groups == 1)
+  
+  if(length(unique(fulldata$Group)) == 1){
+    
+    shared_barcodes_list <- list()
+    file.remove(paste0(tabledir, "/","Sharing_", projectid,".txt"))
+    
+    for(i in unique(fulldata$SampleID)){
+      shared_barcodes_list[[i]] <- intersect(fulldata[fulldata$SampleID == i,]$Barcode, 
+                                             fulldata[fulldata$SampleID != i,]$Barcode)
+      cur.pct <- length(shared_barcodes_list[[i]]) / length(fulldata[fulldata$SampleID == i,]$Barcode)
+      write(x = paste0("Sharing ", i, " vs all: ", round(cur.pct * 100,2) , "%"),
+            file = paste0(tabledir, "/","Sharing_", projectid,".txt"), append = T)
+    }
+    
+    shared_barcodes <- unique(unlist(shared_barcodes_list))
+    cur.pct <- length(shared_barcodes) / length(unique(fulldata$Barcode))
+    write(x = paste0("Sharing ", "within ", " group: ", round(cur.pct * 100,2) , "%"),
+          file = paste0(tabledir, "/","Sharing_", projectid,".txt"), append = T)
+    
+  }
+  
+
+  circosdata <- NULL
+  
+  if(length(sampleids) > 2){
+
+    circosdata <- as.data.frame(sharing_mt)
+    diag(circosdata) <- 0
+    
+    circosdata$SampleID <- rownames(circosdata)
+    
+    mydata_long <- melt(circosdata, id.vars = "SampleID")
+    colnames(mydata_long) <- c("rowname","key","value")
+    
+    pdf(file= paste0(plotdir, projectid,"_Circos_by_sampleIDs.pdf"), 
+        width = 16, height = 16)
+      chordDiagram(mydata_long,
+                 grid.col = colorRampPalette(brewer.pal(12, "Paired"))(length(unique(mydata_long$rowname))))
+    dev.off()
+  }
+  
+  saveRDS(object = list(
+              fulldata = fulldata,
+              counts_abund_matrices = filtered_objects,
+              intersect_mt = intersect_mt,
+              sharing_mt = sharing_mt,
+              jacc_mt = jacc_mt,
+              aggregationbygroup = sum.in.group,
+              aggregationbygroup_melt = sum.in.group.melt,
+              sharingbygroup = intersect_vectors_ori,
+              #sorted_barcodes_all = sorted_barcodes_all,
+              #sorted_barcodes_shared = sorted_barcodes_shared,
+              sampleinfo = annot,
+              raw_objects = raw_objects,
+              circosdata = circosdata
+              ), file = paste0(outfolder, projectid, "_objects.rds"))
+}
--- a/DP-like_EX-U/DP-like_EX-U_analysis.R
+++ b/DP-like_EX-U/DP-like_EX-U_analysis.R
+# Run Analysis (MM samples + Expanded)
+
+library(vegan)
+library(dplyr)
+source(file = "../ChartingBCs_Light.R")
+library(RColorBrewer)
+library(ggrepel)
+library(openxlsx)
+
+paperdata <- "output/"
+dir.create(path = paperdata)
+
+rootdir <- "./"
+bseqdir <- paste0(rootdir,"Input/") # output from barseq
+
+samplesheets <- list(MinPct = paste0(rootdir,"Input/DP-like_EX-U_samplesheet.txt"))
+
+# SampleIDs 
+# MMs :  MM1 MM3 MM4 (Minimally manipulated - DP-like group)
+# Ehigh: E09 E10 E11 E12 E13 E14
+# Elow:  E16 E17 E18 E19 E20 E21
+
+MM = c("MM1","MM3","MM4")
+Ehigh = c("E09", "E10", "E11", "E12", "E13", "E14")
+Elow = c( "E16", "E17", "E18", "E19", "E20", "E21")
+
+SampleIDs <- list(MM = MM, 
+                  Ehigh = Ehigh,
+                  Elow = Elow,
+                  MM_E = c(MM,Ehigh,Elow),
+                  MM_Elow = c(MM,Elow),
+                  MM_Ehigh = c(MM,Ehigh),
+)
+
+#### Basic analysis ####
+
+for(setting in c("MinPct"))
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs)) {
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting,
+                      bcodes2reassign = NULL,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+# Remove barcodes that are shared between MM vs Es (Elow + Ehigh)
+# loading dataframe with all counts passing threshold (0.05) including all replicates and after removal of 
+# barcodes that are present in more than N replicates
+
+MM_E_replicates <- readRDS(file = paste0(rootdir,"MinPct/MM_E_MaxN_removal_seqtk/MinPct_MM_E_MaxN_removal_objects.rds"))
+
+# Findout shared barcodes between MM and Es and assign to the higher replicate
+# data input: MM_E_replicates$sharingbygroup$MM_vs_E_highdose
+
+shared_MM_and_Es_with_replicates <- unique(c(rownames(MM_E_replicates$sharingbygroup$MM_vs_E_highdose),
+                                          rownames(MM_E_replicates$sharingbygroup$MM_vs_E_lowdose),
+                                          rownames(MM_E_replicates$sharingbygroup$MM_vs_E_highdose_vs_E_lowdose)))
+
+saveRDS(object = shared_MM_and_Es_with_replicates, paste0(paperdata,"/shared_MM_and_Es_with_replicates_step01.rds"))
+
+shared_MM_and_Es_with_replicates_mt <- MM_E_replicates$counts_abund_matrices$logCPM[shared_MM_and_Es_with_replicates,c(MM,Ehigh,Elow)]
+shared_MM_and_Es_with_replicates_mt <- as.matrix(arrange_all(shared_MM_and_Es_with_replicates_mt,desc))
+
+shared_MM_and_Es_with_replicates_mt_pcts <- MM_E_replicates$counts_abund_matrices$pct_abund[shared_MM_and_Es_with_replicates,c(MM,Ehigh,Elow)]
+shared_MM_and_Es_with_replicates_mt_pcts <- as.matrix(arrange_all(shared_MM_and_Es_with_replicates_mt_pcts,desc))
+
+# retain barcodes that are shared within dataset ####
+
+MM_obj <- readRDS("MinPct/MM_MaxN_removal_seqtk/MinPct_MM_MaxN_removal_objects.rds")
+Ehigh_obj <- readRDS("MinPct/Ehigh_MaxN_removal_seqtk/MinPct_Ehigh_MaxN_removal_objects.rds")
+Elow_obj <- readRDS("MinPct/Elow_MaxN_removal_seqtk/MinPct_Elow_MaxN_removal_objects.rds")
+
+shared_within_MM <- rownames(subset(MM_obj$aggregationbygroup_melt, subset = MM > 1))
+shared_within_Ehigh <- rownames(subset(Ehigh_obj$aggregationbygroup_melt, subset = E_highdose > 1))
+shared_within_Elow <- rownames(subset(Elow_obj$aggregationbygroup_melt, subset = E_lowdose > 1))
+
+writeLines(text = shared_within_MM, con = paste0(paperdata,"/Shared_barcodes_within_MM.txt"))
+writeLines(text = shared_within_Ehigh, con = paste0(paperdata,"/Shared_barcodes_within_Ehigh.txt")
+writeLines(text = shared_within_Elow, con = paste0(paperdata,"/Shared_barcodes_within_Elow.txt")
+
+dim(shared_MM_and_Es_with_replicates_mt_pcts)
+
+index_to_retain <- which(!rownames(shared_MM_and_Es_with_replicates_mt_pcts) %in% unique(c(shared_within_MM, shared_within_Ehigh, shared_within_Elow)))
+shared_MM_and_Es_with_replicates_mt_pcts <- shared_MM_and_Es_with_replicates_mt_pcts[index_to_retain,]
+shared_MM_and_Es_with_replicates_mt <- shared_MM_and_Es_with_replicates_mt[index_to_retain,]
+
+sds <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
+maxs <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = max)
+second <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]}) 
+means <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = mean)
+
+ratio_max_to_2nd <- apply(shared_MM_and_Es_with_replicates_mt_pcts, 
+                               MARGIN = 1, FUN = max) / apply(shared_MM_and_Es_with_replicates_mt_pcts, 
+                                                              MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
+mydf_weightedratio <- as.data.frame(ratio_max_to_2nd)
+colnames(mydf_weightedratio) <- "ratioMax2Second"
+mydf_weightedratio$weighted <- mydf_weightedratio$ratioMax2Second * means
+mydf_weightedratio$logged <- log10(mydf_weightedratio$weighted)
+
+reassignment_rule <- ifelse(mydf_weightedratio$logged > -1, 1, 0)
+names(reassignment_rule) <- rownames(mydf_weightedratio)
+
+saveRDS(object = ratio_max_to_2nd, file = paste0(paperdata,"/ratio_max_to_2nd_pcts_nofilt_within.rds"))
+
+png(paste0(paperdata,"Baplot_ratio_max_2nd.png"), width = 9, height = 6, units = "in", res = 200)
+barplot(log10(ratio_max_to_2nd[order(ratio_max_to_2nd, decreasing = T)]), xaxt='n',
+        xlab = "Shared barcodes - MM vs EX450K EX150K (replicates)",
+        main = "log10 ratio max value vs 2nd")
+abline(h = 1, col = "red")
+dev.off()
+
+barcodes_to_reassign_A <- names(ratio_max_to_2nd[ratio_max_to_2nd > 10]) # filtering 
+barcodes_to_reassign_A_names <- colnames(MM_E_replicates$counts_abund_matrices$pct_abund[barcodes_to_reassign_A,])[apply(X = MM_E_replicates$counts_abund_matrices$pct_abund[barcodes_to_reassign_A,], 
+                                                                                                                         MARGIN = 1, FUN = function(x){which.max(x)})]
+names(barcodes_to_reassign_A) <- barcodes_to_reassign_A_names
+
+saveRDS(object = barcodes_to_reassign_A, file = paste0(paperdata,"/barcodes_to_reassign_A.rds"))
+
+pheatmap::pheatmap(mat = shared_MM_and_Es_with_replicates_mt, cluster_cols = F, cluster_rows = F,
+         scale = "none", filename = paste0(paperdata, "Shared_barcodes_MM_Es_logCPM_before_reassignment.png"), 
+         main = paste0("Shared barcodes (",length(shared_MM_and_Es_with_replicates),") between MM and Expanded", "\n", 
+                       "logCPM counts"),
+         show_rownames = F,
+         width = 9, height = 6, color = rev(brewer.pal(n = 9, "Spectral")))
+
+shared_MM_and_Es_with_replicates_mt_melt <- as.data.frame(melt(shared_MM_and_Es_with_replicates_mt))
+
+png(filename = paste0(paperdata, "Shared_barcodes_MM_Es_logCPM_ggplot_before_reassignment.png"), 
+    width = 9, height = 6, units = "in", res = 300)
+ggplot(data = shared_MM_and_Es_with_replicates_mt_melt, 
+       mapping = aes(x = Var2, y = Var1, fill = value)) + 
+  geom_tile() + xlab("Samples") + ylab("Barcodes") + 
+  ggtitle(paste0("Shared barcodes (",length(shared_MM_and_Es_with_replicates),") between MM and Expanded"), subtitle = "logCPM counts") +
+  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), 
+        axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
+  scale_fill_distiller(palette = "Spectral", direction = -1)
+dev.off()
+
+# re-running with re-assigning of shared barcodes between MM and Es ##
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs)) {
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets_replicates[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting, bcodes2reassign = barcodes_to_reassign_A,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal_reassignA"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassignA"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+### Create final plots ####
+
+MM_E_replicates_reassigned <- readRDS(file = paste0(rootdir,"MinPct/MM_E_MaxN_removal_seqtk_reassignA/MinPct_MM_E_MaxN_removal_reassignA_objects.rds"))
+
+## Adding new column to 
+
+mydata <- as.data.frame(table(MM_E_replicates_reassigned$fulldata$SampleID))
+colnames(mydata) <- c("SampleID","nBarcodes")
+groupannot <- MM_E_replicates_reassigned$sampleinfo[,c("SampleID","Group")]
+mydata <- merge.data.frame(mydata, groupannot, by = "SampleID", all.x = T,sort = F)
+
+mydata$Group <- gsub(x = mydata$Group, pattern = "E_lowdose", "EX-U150K")
+mydata$Group <- gsub(x = mydata$Group, pattern = "E_highdose", "EX-U450K")
+mydata$Group <- factor(mydata$Group, levels = c("MM","EX-U450K","EX-U150K"))
+
+png(filename = paste0(paperdata,"/Number_of_unique_barcodes_MM_Es_post_reassignment.png"), 
+    width = 4, height = 6, units = "in", res = 300)
+ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
+                                    label = paste0("(",nBarcodes,")"))) + 
+  geom_boxplot(outlier.shape = NA, fatten = 1, lwd=1) +
+  geom_point(position=position_jitterdodge(seed = 500)) +
+  geom_text_repel(show_legend = FALSE, size = 3, seed = 500) + 
+  scale_y_continuous(trans = "log10") +
+  ggtitle(label = "Unique barcodes MM and EXs", subtitle = "filter: % > 0.005") + 
+  #theme_bw() +
+  theme(legend.position = "none", axis.title = element_blank(),
+        panel.background = element_rect(fill = "white"),
+        plot.title = element_text(hjust = 0.5),
+        plot.subtitle = element_text(hjust = 0.5),
+        axis.line = element_line(linetype = 1))
+dev.off()
+
+# AIM: plotting BC sharing within group (MM, Elow and Ehigh) - boxplot
+
+library(openxlsx)
+library(ggplot2)
+library(ggrepel)
+
+# load input 
+
+Ehigh <- readRDS("MinPct/Ehigh_MaxN_removal_seqtk_reassignA/MinPct_Ehigh_MaxN_removal_reassignA_objects.rds")
+Elow <- readRDS("MinPct/Elow_MaxN_removal_seqtk_reassignA/MinPct_Elow_MaxN_removal_reassignA_objects.rds")
+MM <- readRDS("MinPct/MM_MaxN_removal_seqtk_reassignA/MinPct_MM_MaxN_removal_reassignA_objects.rds")
+
+# Sharing intra group 
+
+sharing_intra <- list()
+sharing_intra[["MM"]] <- read.table(file = "MinPct/MM_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_MM_MaxN_removal_reassignA.txt", 
+                                    nrows = 3)
+sharing_intra[["Ehigh"]] <- read.table(file = "MinPct/Ehigh_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_Ehigh_MaxN_removal_reassignA.txt",
+                                       nrows = 6)
+sharing_intra[["Elow"]] <- read.table(file = "MinPct/Elow_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_Elow_MaxN_removal_reassignA.txt",
+                                      nrows = 6)
+
+for(j in names(sharing_intra)){
+  sharing_intra[[j]] <- subset(sharing_intra[[j]], select = c("V2","V5")) 
+  sharing_intra[[j]]$Group <- j
+  colnames(sharing_intra[[j]]) <- c("SampleID","pct.share_1vsAll","Group")
+  sharing_intra[[j]]$pct.share_1vsAll <- gsub(x = sharing_intra[[j]]$pct.share_1vsAll, pattern = "%", replacement = "")
+}
+
+df <- rbind(sharing_intra[["MM"]], sharing_intra[["Ehigh"]])
+df <- rbind(df, sharing_intra[["Elow"]])
+df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
+df$Group <- gsub(x = df$Group, pattern = "Elow", "EX-U150K")
+df$Group <- gsub(x = df$Group, pattern = "Ehigh", "EX-U450K")
+df$Group <- factor(df$Group, levels = c("MM","EX-U450K","EX-U150K"))
+
+png("PaperData/Pct_sharing_1_vs_All_intragroup_MM_Es.png", 
+    width = 4, height = 4, units = "in", res = 300)
+ggplot(df, mapping = aes(x = Group, y = pct.share_1vsAll, color = Group, label = SampleID)) + 
+  geom_boxplot(outlier.shape = NA) + geom_jitter(position = position_jitter(seed = 6), show.legend = F) +
+  geom_text_repel(position = position_jitter(seed = 6), show.legend = F) +
+  theme_bw() + ylab("% sharing") +
+  ggtitle(label = "Barcoding sharing intra-group", subtitle = "One vs All") +
+  theme(axis.title.x = element_blank(), 
+        #legend.position = c(0.25, 0.8),
+        legend.position = "none",
+        plot.title = element_text(hjust = 0.5), 
+        plot.subtitle = element_text(hjust = 0.5))
+dev.off()
+
+write.xlsx(df, file = "PaperData/Pct_sharing_1_vs_All_intragroup_MM_Es.xlsx", asTable = T)
+
+### Entropy measures #
+
+MM_E_reassigned <- readRDS(file = paste0(rootdir, "MinPct/MM_E_MaxN_removal_seqtk_reassignA/MinPct_MM_E_MaxN_removal_reassignA_objects.rds"))
+
+shannon_MM_E <- diversity(MM_E_reassigned$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
+shannon_MM_E_df <- as.data.frame(shannon_MM_E)
+colnames(shannon_MM_E_df) <- "Shannon"
+
+shannon <- shannon_MM_E_df
+
+saveRDS(shannon, file = "PaperData/Shannon_diversity.rds")
+
--- a/DP-like_EX-U/Input/DP-like_EX-U_samplesheet.txt
+++ b/DP-like_EX-U/Input/DP-like_EX-U_samplesheet.txt
+SampleID  Group Cutoff  MaxN  VCN Engraftment PctNGFR
+MM1 MM 0.005 2  2.84  25.5  66.39
+MM3 MM 0.005 2  1.61  52.77 95.53
+MM4 MM 0.005 2  2.11  26.56 88.79
+E09 E_highdose 0.005 3  1.86  36.76 81.17
+E10 E_highdose 0.005 3  1.77  21.19 73.39
+E11 E_highdose 0.005 3  2.07  36.88 92.76
+E12 E_highdose 0.005 3  1.85  20.83 91.43
+E13 E_highdose 0.005 3  2.01  43.92 97.08
+E14 E_highdose 0.005 3  1.35  24.26 89.71
+E16 E_lowdose 0.005 3 1.76  12.11 51.44
+E17 E_lowdose 0.005 3 1.87  4.37  28.89
+E18 E_lowdose 0.005 3 1.73  6.05  48.25
+E19 E_lowdose 0.005 3 1.33  23.58 73.31
+E20 E_lowdose 0.005 3 1.53  1.21  43.94
+E21 E_lowdose 0.005 3 1.74  6.65  67.06
+E09E14  E_highdose  0.005 4 1.86  30.51 90.57
+E16E21 E_lowdose  0.005 4 1.73  6.35  49.845
+MM  MM  0.005 4 2.19 34.94  88.79
\ No newline at end of file
--- a/SCGM/Input/SCGM_samplesheet.txt
+++ b/SCGM/Input/SCGM_samplesheet.txt
+SampleID  Group Cutoff  MaxN  VCN Engraftment PctNGFR
+CG_UM_01 CG_UM  0.005 4 1.244  49.27  56.56
+CG_UM_02 CG_UM  0.005 4 0.862 39.51 48.46
+CG_UM_03 CG_UM  0.005 4 1.13  24.89 48.29
+CG_UM_04 CG_UM  0.005 4 0.779 49.62 38.2
+CG_UM_05 CG_UM  0.005 4 0.96  22  48.44
+CG_UM_06 CG_UM  0.005 4 0.844 57.05 50.48
+CG_UM_07 CG_UM  0.005 4 1.215 71.27 63.59
+CG_UM_08 CG_UM  0.005 4 0.711 49.7  41.95
+CG_UM_2nd_01 CG_UM_2nd  0 4 1 4.51  50.29
+CG_UM_2nd_02 CG_UM_2nd  0 4 1 7.56  15.25
+CG_UM_2nd_03 CG_UM_2nd  0 4 1 1.41  42.31
+CG_UM_2nd_04 CG_UM_2nd  0 4 1 1.56  63.85
+CG_UMSR_12 CG_UMSR  0.005 4 0.769 47.88 46.08
+CG_UMSR_13 CG_UMSR  0.005 4 0.709 51.83 44.16
+CG_UMSR_14 CG_UMSR  0.005 4 0.803 24.77 53.88
+CG_UMSR_15 CG_UMSR  0.005 4 0.53  40.5  32.81
+CG_UMSR_16 CG_UMSR  0.005 4 0.463 63.83 32.13
+CG_UMSR_17 CG_UMSR  0.005 4 0.613 33.51 34.26
+CG_UMSR_18 CG_UMSR  0.005 4 0.506 67.05 35.42
+CG_UMSR_19 CG_UMSR  0.005 4 0.563 61.64 34.02
+CG_UMSR_20 CG_UMSR  0.005 4 0.595 42.11 33.91
+CG_UMSR_21 CG_UMSR  0.005 4 0.428 65.74 29.73
+CG_UMSR_2nd_05 CG_UMSR_2nd  0 4 1 5.62  44.82
+CG_UMSR_2nd_06 CG_UMSR_2nd  0 4 1 2.58  23.98
+CG_UMSR_2nd_07 CG_UMSR_2nd  0 4 1 16.01 25.11
+CG_UMSR_2nd_08 CG_UMSR_2nd  0 4 1 11.19 89.78
--- a/SCGM/Input/SCGM_samplesheet_1st_2nd.txt
+++ b/SCGM/Input/SCGM_samplesheet_1st_2nd.txt
+SampleID  Group Cutoff  MaxN  VCN Engraftment PctNGFR
+CG_UM_01 CG_UM  0 4 1.244  49.27  56.56
+CG_UM_02 CG_UM  0 4 0.862 39.51 48.46
+CG_UM_03 CG_UM  0 4 1.13  24.89 48.29
+CG_UM_04 CG_UM  0 4 0.779 49.62 38.2
+CG_UM_05 CG_UM  0 4 0.96  22  48.44
+CG_UM_06 CG_UM  0 4 0.844 57.05 50.48
+CG_UM_07 CG_UM  0 4 1.215 71.27 63.59
+CG_UM_08 CG_UM  0 4 0.711 49.7  41.95
+CG_UM_2nd_01 CG_UM_2nd  0 4 1 4.51  50.29
+CG_UM_2nd_02 CG_UM_2nd  0 4 1 7.56  15.25
+CG_UM_2nd_03 CG_UM_2nd  0 4 1 1.41  42.31
+CG_UM_2nd_04 CG_UM_2nd  0 4 1 1.56  63.85
+CG_UMSR_12 CG_UMSR  0 4 0.769 47.88 46.08
+CG_UMSR_13 CG_UMSR  0 4 0.709 51.83 44.16
+CG_UMSR_14 CG_UMSR  0 4 0.803 24.77 53.88
+CG_UMSR_15 CG_UMSR  0 4 0.53  40.5  32.81
+CG_UMSR_16 CG_UMSR  0 4 0.463 63.83 32.13
+CG_UMSR_17 CG_UMSR  0 4 0.613 33.51 34.26
+CG_UMSR_18 CG_UMSR  0 4 0.506 67.05 35.42
+CG_UMSR_19 CG_UMSR  0 4 0.563 61.64 34.02
+CG_UMSR_20 CG_UMSR  0 4 0.595 42.11 33.91
+CG_UMSR_21 CG_UMSR  0 4 0.428 65.74 29.73
+CG_UMSR_2nd_05 CG_UMSR_2nd  0 4 1 5.62  44.82
+CG_UMSR_2nd_06 CG_UMSR_2nd  0 4 1 2.58  23.98
+CG_UMSR_2nd_07 CG_UMSR_2nd  0 4 1 16.01 25.11
+CG_UMSR_2nd_08 CG_UMSR_2nd  0 4 1 11.19 89.78
--- a/SCGM/SCGM_primary_analysis.R
+++ b/SCGM/SCGM_primary_analysis.R
+# Run Analysis CG (Cell growth) samples 
+
+library(vegan)
+library(dplyr)
+source(file = "../ChartingBCs_Light.R")
+library(RColorBrewer)
+library(ggrepel)
+
+paperdata <- "PaperData/"
+dir.create(path = paperdata)
+
+rootdir <- "./"
+bseqdir <- paste0(rootdir,"Input/")
+# using only the minpct version - here renamed simply to samplesheet
+samplesheets <- list(MinPct = paste0(bseqdir,"SCGM_samplesheet.txt"))
+
+# SampleIDs / groups present in the paper
+
+# CG_UMSR     : CG_UMSR_12-21
+# CG_UM       : CG_UM_01-08
+# CG_UM_2nd   : CG_UM_2nd_01-04
+# CG_UMSR_2nd   : CG_UMSR_2nd_05-08
+
+CG_UMSR = c("CG_UMSR_12","CG_UMSR_13","CG_UMSR_14","CG_UMSR_15","CG_UMSR_16","CG_UMSR_17",
+            "CG_UMSR_18","CG_UMSR_19","CG_UMSR_20","CG_UMSR_21")
+CG_UM = c("CG_UM_01","CG_UM_02","CG_UM_03","CG_UM_04",
+          "CG_UM_05","CG_UM_06","CG_UM_07","CG_UM_08")
+
+CG_UM_2nd = c("CG_UM_2nd_01","CG_UM_2nd_02","CG_UM_2nd_03","CG_UM_2nd_04")
+CG_UMSR_2nd = c("CG_UMSR_2nd_05","CG_UMSR_2nd_06","CG_UMSR_2nd_07","CG_UMSR_2nd_08")
+
+SampleIDs <- list(CG_UMSR = CG_UMSR, 
+                  CG_UM = CG_UM,
+                  CG_UMSR_2nd = CG_UMSR_2nd,
+                  CG_UM_2nd = CG_UM_2nd,
+                  UM = c(CG_UM,CG_UM_2nd),
+                  UMSR = c(CG_UMSR,CG_UMSR_2nd),
+                  UM_UMSR_1st = c(CG_UMSR,CG_UM),
+                  UM_UMSR_2nd = c(CG_UMSR_2nd,CG_UM_2nd)
+)
+
+#### Basic analysis ####
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs)) {
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+# Defining Barcodes to re-assign using the 2 group dataset with CG (UM and UMSR) ####
+
+MinPct_UM_UMSR_1st_MaxN_removal_objects <- readRDS("MinPct/UM_UMSR_1st_MaxN_removal_seqtk/MinPct_UM_UMSR_1st_MaxN_removal_objects.rds")
+tmp.obj <- MinPct_UM_UMSR_1st_MaxN_removal_objects
+
+# potentially to re-assign
+shared_CG_UM_UMSR_first <- rownames(tmp.obj$sharingbygroup$CG_UM_vs_CG_UMSR)
+saveRDS(object = shared_CG_UM_UMSR_first, paste0(paperdata,"PaperData/shared_CG_UM_UMSR_first_step01.rds"))
+
+shared_CG_UM_UMSR_first_mt <- tmp.obj$counts_abund_matrices$logCPM[shared_CG_UM_UMSR_first,c(CG_UM,CG_UMSR)]
+shared_CG_UM_UMSR_first_mt <- as.matrix(arrange_all(shared_CG_UM_UMSR_first_mt,desc))
+
+shared_CG_UM_UMSR_first_mt_pcts <- tmp.obj$counts_abund_matrices$pct_abund[shared_CG_UM_UMSR_first,c(CG_UM,CG_UMSR)]
+shared_CG_UM_UMSR_first_mt_pcts <- as.matrix(arrange_all(shared_CG_UM_UMSR_first_mt_pcts,desc))
+
+# exclude BCs that are shared within group from the list of BCs to be re-assigned ####
+
+shared_within_CG_UM <- rownames(subset(tmp.obj$aggregationbygroup_melt, subset = CG_UM > 1))
+shared_within_CG_UMSR <- rownames(subset(tmp.obj$aggregationbygroup_melt, subset = CG_UMSR > 1))
+
+writeLines(text = shared_within_CG_UM, con = paste0(paperdata,"/Shared_barcodes_within_CG_UM.txt"))
+writeLines(text = shared_within_CG_UMSR, con = paste0(paperdata,"/Shared_barcodes_within_CG_UMSR.txt"))
+
+# from the potentially BCs to be re-assigned exclude the one that are also shared within group
+index_to_retain <- which(!rownames(shared_CG_UM_UMSR_first_mt_pcts) %in% union(shared_within_CG_UMSR, shared_within_CG_UM)) # if not intragroup sharing is affected
+shared_CG_UM_UMSR_first_mt_pcts <- shared_CG_UM_UMSR_first_mt_pcts[index_to_retain,]
+shared_CG_UM_UMSR_first_mt <- shared_CG_UM_UMSR_first_mt[index_to_retain,]
+
+# adding new method of re-assignment based on std dev
+
+sds <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
+maxs <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = max)
+second <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]}) 
+means <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = mean)
+
+ratio_max_to_2nd <- apply(shared_CG_UM_UMSR_first_mt_pcts, 
+                          MARGIN = 1, FUN = max) / apply(shared_CG_UM_UMSR_first_mt_pcts, 
+                                                         MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
+
+mydf_weightedratio <- as.data.frame(ratio_max_to_2nd)
+colnames(mydf_weightedratio) <- "ratioMax2Second"
+mydf_weightedratio$weighted <- mydf_weightedratio$ratioMax2Second * means
+mydf_weightedratio$logged <- log10(mydf_weightedratio$weighted)
+
+reassignment_rule <- ifelse(mydf_weightedratio$logged > -1, 1, 0)
+names(reassignment_rule) <- rownames(mydf_weightedratio)
+
+saveRDS(object = ratio_max_to_2nd, file =  paste0(paperdata,"/ratio_max_to_2nd_CG_UM_UMSR_first.rds"))
+saveRDS(object = mydf_weightedratio, file =  paste0(paperdata,"/mydf_weightedratio_CG_UM_UMSR_first.rds"))
+saveRDS(object = means, file =  paste0(paperdata,"/means_CG_UM_UMSR_first.rds"))
+saveRDS(object = reassignment_rule, file =  paste0(paperdata,"/reassignment_rule_CG_UM_UMSR_first.rds"))
+
+####
+
+barcodes_to_reassign <- names(ratio_max_to_2nd[ratio_max_to_2nd > 10]) # filtering
+barcodes_to_reassign_names <- colnames(tmp.obj$counts_abund_matrices$pct_abund[barcodes_to_reassign,])[apply(X = tmp.obj$counts_abund_matrices$pct_abund[barcodes_to_reassign,], MARGIN = 1, FUN = function(x){which.max(x)})]
+names(barcodes_to_reassign) <- barcodes_to_reassign_names
+saveRDS(object = barcodes_to_reassign, file = "PaperData/barcodes_to_reassign_CG_UM_UMSR_first.rds")
+
+# re-running with re-assigning of shared barcodes between CG_UM and CG_UMSR ##
+# Note: barcodes shared also within condition are removed from re-assignment ####
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs[c(1,2,5,6,7)])) {  # pickup only groups that contain 1st transplant samples, exclude only 2nd 
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting, bcodes2reassign = barcodes_to_reassign,
+                      removeMaxN = FALSE, MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[c(1,2,5,6,7)][[SampleGroup]], 
+                      ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup), 
+                      outfolder = paste(setting, paste0(SampleGroup,"_seqtk_reassign"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+  
+  for (SampleGroup in names(SampleIDs[c(1,2,5,6,7)])) { # pickup only groups that contain 1st transplant samples
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting, bcodes2reassign = barcodes_to_reassign,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[c(1,2,5,6,7)][[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassign"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+### Create final plots ####
+
+UM_UMSR_1st_reassigned <- readRDS(file = paste0(rootdir,"MinPct/UM_UMSR_1st_MaxN_removal_seqtk_reassign/MinPct_UM_UMSR_1st_MaxN_removal_objects.rds"))
+
+# look at numbers #
+
+table(UM_UMSR_1st_reassigned$fulldata$Group)
+
+# CG_UM CG_UMSR 
+# 4466    5127 
+
+## Adding new column to 
+
+mydata <- as.data.frame(table(UM_UMSR_1st_reassigned$fulldata$SampleID))
+colnames(mydata) <- c("SampleID","nBarcodes")
+mydata$Group <- c(rep(x = "CG_UM", 8), rep("CG_UMSR",10)) 
+
+png(filename = "PaperData/Number_of_unique_barcodes_CG_UM_UMSR_post_reassignment.png", 
+    width = 9, height = 6, units = "in", res = 300)
+ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
+                                    label = paste0(SampleID,"_",nBarcodes))) + 
+  geom_point() + geom_label_repel(show_guide = FALSE) + 
+  ggtitle(label = "Unique barcodes CG_UM and CG_UMSR", subtitle = "filter: % > 0.005") + 
+  theme(legend.position = "none", axis.title = element_blank(), 
+        plot.title = element_text(hjust = 0.5), 
+        plot.subtitle = element_text(hjust = 0.5))
+dev.off()
+
+
+# shared barcodes after reassignment ####
+
+UM_UMSR_1st_reassigned_shared <- rownames(UM_UMSR_1st_reassigned$sharingbygroup$CG_UM_vs_CG_UMSR)
+
+UM_UMSR_1st_reassigned_shared_mt <- UM_UMSR_1st_reassigned$counts_abund_matrices$logCPM[UM_UMSR_1st_reassigned_shared,c(CG_UM,CG_UMSR)]
+UM_UMSR_1st_reassigned_shared_mt <- as.matrix(arrange_all(UM_UMSR_1st_reassigned_shared_mt))
+UM_UMSR_1st_reassigned_shared_mt_melt <- as.data.frame(melt(UM_UMSR_1st_reassigned_shared_mt))
+colnames(UM_UMSR_1st_reassigned_shared_mt_melt) <- c("Barcode","SampleID","logCPM")
+fulldata_subset <- subset(UM_UMSR_1st_reassigned$fulldata, select = c("Barcode", "SampleID", "Rank"), 
+                          subset =  Barcode %in% UM_UMSR_1st_reassigned_shared_mt_melt$Barcode)
+UM_UMSR_1st_reassigned_shared_mt_melt_rank <- merge.data.frame(UM_UMSR_1st_reassigned_shared_mt_melt, fulldata_subset, sort = F)
+
+png(filename = paste0(paperdata, "Shared_barcodes_UM_UMSR_1st_logCPM_ggplot_post_reassignment.png"), 
+    width = 6, height = 6, units = "in", res = 300)
+ggplot(data = UM_UMSR_1st_reassigned_shared_mt_melt_rank, 
+       mapping = aes(x = SampleID, y = Barcode, fill = logCPM, label = Rank)) + 
+  geom_tile() + xlab("Samples") + ylab("Barcodes") + geom_text(size = 3) +
+  #scale_x_discrete(guide = guide_axis(n.dodge=3)) +
+  ggtitle(paste0("Shared barcodes (",length(UM_UMSR_1st_reassigned_shared),") between CG_UM and CG_UMSR"), subtitle = "logCPM counts (rank)") +
+  theme_minimal() + 
+  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), 
+        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 
+        axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
+  scale_fill_distiller(palette = "Spectral", direction = -1) 
+dev.off()
+
+# shared barcodes after reassignment ####
+tmp2_data <- rownames(do.call(what = rbind, UM_UMSR_1st_reassigned$sharingbygroup))
+CG_UM_UMSR_1st_reassigned_shared <- unique(gsub(x = tmp2_data, pattern = ".*\\.", replacement = ""))
+
+CG_UM_UMSR_1st_reassigned_shared_mt <- UM_UMSR_1st_reassigned$counts_abund_matrices$logCPM[CG_UM_UMSR_1st_reassigned_shared,c(CG_UM,CG_UMSR)]
+CG_UM_UMSR_1st_reassigned_shared_mt <- as.matrix(arrange_all(CG_UM_UMSR_1st_reassigned_shared_mt))
+CG_UM_UMSR_1st_reassigned_shared_mt_melt <- as.data.frame(melt(CG_UM_UMSR_1st_reassigned_shared_mt))
+colnames(CG_UM_UMSR_1st_reassigned_shared_mt_melt) <- c("Barcode","SampleID","logCPM")
+fulldata_subset <- subset(UM_UMSR_1st_reassigned$fulldata, select = c("Barcode", "SampleID", "Rank"), 
+                          subset =  Barcode %in% CG_UM_UMSR_1st_reassigned_shared_mt_melt$Barcode)
+CG_UM_UMSR_1st_reassigned_shared_mt_melt_rank <- merge.data.frame(CG_UM_UMSR_1st_reassigned_shared_mt_melt, fulldata_subset, sort = F)
+
+png(filename = paste0(paperdata, "Shared_barcodes_CG_UM_UMSR_1st_logCPM_ggplot_post_reassignment.png"), 
+    width = 6, height = 6, units = "in", res = 300)
+ggplot(data = CG_UM_UMSR_1st_reassigned_shared_mt_melt_rank, 
+       mapping = aes(x = SampleID, y = Barcode, fill = logCPM, label = Rank)) + 
+  geom_tile() + xlab("Samples") + ylab("Barcodes") + geom_text(size = 3) +
+  ggtitle(paste0("Shared barcodes (",length(CG_UM_UMSR_1st_reassigned_shared),") between CG UM and UMSR"), subtitle = "logCPM counts (rank)") +
+  theme_minimal() + 
+  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), 
+        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), 
+        axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
+  scale_fill_distiller(palette = "Spectral", direction = -1) 
+dev.off()
+
+library(openxlsx)
+library(ggplot2)
+library(ggrepel)
+
+# sharing within groups
+
+sharing_intra <- list()
+sharing_intra[["CG_UM"]] <- read.table(file = "MinPct/CG_UM_MaxN_removal_seqtk_reassign/Tables/Sharing_MinPct_CG_UM_MaxN_removal.txt", 
+                                       nrows = 8)
+sharing_intra[["CG_UMSR"]] <- read.table(file = "MinPct/CG_UMSR_MaxN_removal_seqtk_reassign/Tables/Sharing_MinPct_CG_UMSR_MaxN_removal.txt",
+                                         nrows = 10)
+
+for(j in names(sharing_intra)){
+  sharing_intra[[j]] <- subset(sharing_intra[[j]], select = c("V2","V5")) 
+  sharing_intra[[j]]$Group <- j
+  colnames(sharing_intra[[j]]) <- c("SampleID","pct.share_1vsAll","Group")
+  sharing_intra[[j]]$pct.share_1vsAll <- gsub(x = sharing_intra[[j]]$pct.share_1vsAll, pattern = "%", replacement = "")
+}
+
+df <- rbind(sharing_intra[["CG_UM"]], sharing_intra[["CG_UMSR"]])
+df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
+df$Group <- gsub(x = df$Group, pattern = "UM",replacement = "U")
+df$Group <- gsub(x = df$Group, pattern = "SR",replacement = "S")
+df$Group <- factor(df$Group, levels = c("CG_U", "CG_US"))
+
+write.xlsx(df, file = paste0(paperdata,"/Pct_sharing_1_vs_All_intragroup_CG_UM_and_UMSR.xlsx"), 
+           asTable = T)
+
+### Entropy measures #
+
+shannon_ALL <- diversity(UM_UMSR_1st_reassigned$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
+shannon_ALL_df <- as.data.frame(shannon_ALL)
+colnames(shannon_ALL_df) <- "Shannon"
+saveRDS(shannon_ALL_df, paste0(paperdata, "Shannons.rds"))
+
--- a/SCGM/SCGM_primary_vs_secondary_analysis.R
+++ b/SCGM/SCGM_primary_vs_secondary_analysis.R
+library(ggplot2)
+library(vegan)
+library(pheatmap)
+library(vegan)
+library(dplyr)
+source(file = "../ChartingBCs_Light.R")
+library(RColorBrewer)
+library(ggrepel)
+library(ggalluvial)
+
+paperdata <- "PaperData/"
+dir.create(path = paperdata)
+
+rootdir <- "./"
+bseqdir <- paste0(rootdir,"Input/")
+# using only the minpct version - here renamed simply to samplesheet
+samplesheets <- list(MinPct = paste0(bseqdir,"SCGM_samplesheet_1st_2nd.txt"))
+
+# SampleIDs / groups present in the paper
+
+# CG_UMSR     : CG_UMSR_12-21
+# CG_UM       : CG_UM_01-08
+# CG_UM_2nd   : CG_UM_2nd_01-04
+# CG_UMSR_2nd   : CG_UMSR_2nd_05-08
+
+CG_UMSR = c("CG_UMSR_12","CG_UMSR_13","CG_UMSR_14","CG_UMSR_15","CG_UMSR_16","CG_UMSR_17",
+            "CG_UMSR_18","CG_UMSR_19","CG_UMSR_20","CG_UMSR_21")
+CG_UM = c("CG_UM_01","CG_UM_02","CG_UM_03","CG_UM_04",
+          "CG_UM_05","CG_UM_06","CG_UM_07","CG_UM_08")
+
+CG_UM_2nd = c("CG_UM_2nd_01","CG_UM_2nd_02","CG_UM_2nd_03","CG_UM_2nd_04")
+CG_UMSR_2nd = c("CG_UMSR_2nd_05","CG_UMSR_2nd_06","CG_UMSR_2nd_07","CG_UMSR_2nd_08")
+
+SampleIDs <- list(UM = c(CG_UM,CG_UM_2nd),
+                  UMSR = c(CG_UMSR,CG_UMSR_2nd)
+)
+
+#### Basic analysis ####
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs)) { 
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting,
+                      removeMaxN = FALSE, MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], 
+                      ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_noFILT_",SampleGroup), 
+                      outfolder = paste(paste0(setting,"_noFILT/"), paste0(SampleGroup,"_seqtk"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+  
+  for (SampleGroup in names(SampleIDs)) {
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_noFILT_",SampleGroup,"_MaxN_removal"), 
+                      outfolder = paste(paste0(setting,"_noFILT/"), paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+### Plotting Sharing 1st and 2nd ####
+
+CG_UM_unfiltered <- readRDS(file = paste0(rootdir,"MinPct_noFILT/UM_MaxN_removal_seqtk/MinPct_noFILT_UM_MaxN_removal_objects.rds"))
+shared_elements <- CG_UM_unfiltered$counts_abund_matrices$pct_abund[rownames(CG_UM_unfiltered$sharingbygroup$CG_UM_vs_CG_UM_2nd),]
+
+shared_elements$Barcode <- rownames(shared_elements)
+shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
+shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
+                                               CG_UM_unfiltered$sampleinfo, 
+                                               by.x = "variable", by.y = "SampleID", 
+                                               all.x = T, sort = F)
+shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = "CG_UM_", replacement = "")
+shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA 
+
+shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "UM",replacement = "U")
+shared_elements_melt_annot$Group <- factor(shared_elements_melt_annot$Group, levels = c("CG_U", "CG_U_2nd"))  
+
+png(filename = paste0(paperdata,"CG_UM_1st_2nd_shared_heatmap_full.png"),width = 8, height = 6, units = "in", res = 300)
+ggplot(data = shared_elements_melt_annot, mapping = aes(y = Barcode, x = variable, fill = value, label = round(value,4))) + 
+  geom_tile()  + 
+  geom_text(size = 2, color = "black") + 
+  facet_grid( . ~ Group, scale = "free", space = "free")  + 
+  scale_x_discrete(guide = guide_axis(n.dodge=2)) +
+  scale_fill_distiller(palette = "Spectral", direction = -1, na.value = "white") +
+  theme_bw() +
+  theme(axis.title = element_blank(), axis.text = element_text(size = 10), 
+        legend.position = "none",
+        strip.text = element_text(size = 10, face = "bold"), 
+        legend.title = element_blank(),
+        plot.title = element_text(hjust = 0.5),
+        plot.subtitle = element_text(hjust = 0.5)
+        ) + 
+  ggtitle(label = "Shared barcodes between 1st and 2nd transplant", subtitle = "CellGro Cell Culture Medium") +
+  geom_rect(data = data.frame(Group = 'CG_U'), 
+              aes(xmin=c(0.5), xmax=c(1.5), ymin=c(24), ymax=c(26)), 
+              colour="red", fill="transparent", size=1, inherit.aes = FALSE)
+dev.off()
+
+
+CG_UMSR_unfiltered <- readRDS(file = paste0(rootdir,"MinPct_noFILT/UMSR_MaxN_removal_seqtk/MinPct_noFILT_UMSR_MaxN_removal_objects.rds"))
+shared_elements <- CG_UMSR_unfiltered$counts_abund_matrices$pct_abund[rownames(CG_UMSR_unfiltered$sharingbygroup$CG_UMSR_vs_CG_UMSR_2nd),]
+
+shared_elements$Barcode <- rownames(shared_elements)
+shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
+shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
+                                               CG_UMSR_unfiltered$sampleinfo, 
+                                               by.x = "variable", by.y = "SampleID", 
+                                               all.x = T, sort = F)
+shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = "CG_UMSR_", replacement = "")
+shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA 
+
+shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "UM",replacement = "U")
+shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "SR",replacement = "S")
+shared_elements_melt_annot$Group <- factor(shared_elements_melt_annot$Group, levels = c("CG_US", "CG_US_2nd"))  
+
+png(filename = paste0(paperdata,"CG_UMSR_1st_2nd_shared_heatmap_full.png"),width = 8, height = 6, units = "in", res = 300)
+ggplot(data = shared_elements_melt_annot, mapping = aes(y = Barcode, x = variable, fill = value, label = round(value,4))) + 
+  geom_tile()  + 
+  geom_text(size = 2, color = "black") + 
+  facet_grid( . ~ Group, scale = "free", space = "free")  + 
+  scale_x_discrete(guide = guide_axis(n.dodge=2)) +
+  scale_fill_distiller(palette = "Spectral", direction = -1, na.value = "white") +
+  #scale_fill_viridis_c(alpha = 0.9,option = "A", 
+  #                     breaks = c(0,0.01, 0.05, 0.1, 0.5, 1,2,5,10,20,30,50,80,100), direction = -1)
+  theme_bw() +
+  theme(axis.title = element_blank(), axis.text = element_text(size = 8), 
+        legend.position = "none",
+        strip.text = element_text(size = 10, face = "bold"), 
+        legend.title = element_blank(),
+        plot.title = element_text(hjust = 0.5),
+        plot.subtitle = element_text(hjust = 0.5)
+  ) + 
+  ggtitle(label = "Shared barcodes between 1st and 2nd transplant", subtitle = "CellGro Cell Culture MediUMSR") +
+  geom_rect(data = data.frame(Group = 'CG_US'), 
+            aes(xmin=c(6.5), xmax=c(7.5), ymin=c(0.5), ymax=c(1.5)), 
+            colour="red", fill="transparent", size=1, inherit.aes = FALSE)
+dev.off()
+
+
+# Evaluate N° of clones in 1st and 2nd transplant ####
+
+mydataU <- as.data.frame(table(CG_UM_unfiltered$fulldata$SampleID))
+colnames(mydataU) <- c("SampleID","nBarcodes")
+groupannot <- CG_UM_unfiltered$sampleinfo[,c("SampleID","Group")]
+mydataU <- merge.data.frame(mydataU, groupannot, by = "SampleID", all.x = T,sort = F)
+
+mydataUS <- as.data.frame(table(CG_UMSR_unfiltered$fulldata$SampleID))
+colnames(mydataUS) <- c("SampleID","nBarcodes")
+groupannot <- CG_UMSR_unfiltered$sampleinfo[,c("SampleID","Group")]
+mydataUS <- merge.data.frame(mydataUS, groupannot, by = "SampleID", all.x = T,sort = F)
+
+mydata <- rbind(mydataU, mydataUS)
+mydata$Group <- gsub(x = mydata$Group, pattern = "UM",replacement = "U")
+mydata$Group <- gsub(x = mydata$Group, pattern = "SR",replacement = "S")
+mydata$Group <- factor(x = mydata$Group, levels = c("CG_U","CG_US","CG_U_2nd","CG_US_2nd"))
+
+png(filename = "PaperData/Number_of_unique_barcodes_CG_1st_2nd.png", 
+    width = 3, height = 4, units = "in", res = 300)
+ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
+                                    label = paste0("(",nBarcodes,")"))) + 
+  geom_boxplot(outlier.shape = NA, fatten = 0.4, lwd=0.8) +
+  scale_x_discrete(guide = guide_axis(n.dodge=2)) +
+  geom_point(position=position_jitterdodge(seed = 500, jitter.width = 0.8)) +
+  geom_text_repel(show_legend = FALSE, size = 3, seed = 500) + 
+  ggtitle(label = "n° barcodes (1st and 2nd)", subtitle = "no filtering") + 
+  theme(legend.position = "none", axis.title = element_blank(),
+        panel.background = element_rect(fill = "white"),
+        plot.title = element_text(hjust = 0.5),
+        plot.subtitle = element_text(hjust = 0.5),
+        axis.line = element_line(linetype = 1))
+dev.off()
+
+png(filename = "PaperData/Number_of_unique_barcodes_CG_2nd.png", 
+    width = 2, height = 4, units = "in", res = 300)
+ggplot(data = subset(mydata, subset = Group %in% c("CG_U_2nd","CG_US_2nd")), 
+       mapping = aes(Group, y = nBarcodes, color = Group,
+                                    label = paste0("(",nBarcodes,")"))) + 
+  scale_fill_manual(values = c("#357BCF","#8DD3FB")) + 
+  scale_color_manual(values = c("#357BCF","#8DD3FB")) + 
+  geom_boxplot(outlier.shape = NA, fatten = 0.8, lwd=0.8) +
+  scale_x_discrete(guide = guide_axis(n.dodge=2)) +
+  geom_point(position=position_jitterdodge(seed = 500, jitter.width = 0.8)) +
+  ggtitle(label = "n° barcodes 2nd", subtitle = "No filtering") + 
+  theme(panel.background = element_rect(fill = "white"), 
+        plot.title = element_text(hjust = 0.5),
+        plot.subtitle = element_text(hjust = 0.5),
+        axis.line = element_line(linetype = 1),
+        legend.position = "none",
+        legend.box.background = element_blank(),
+        legend.key = element_rect(fill = "white"),
+        legend.text = element_text(size = 8),
+        legend.title = element_blank(),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.text.y = element_text(size = 12,  color = "black"),
+        axis.text.x = element_text(angle = 0, size = 12, color = "black"), 
+        strip.text = element_text(size = 8))
+dev.off()
+
+
+##### Creating plots ofr showing the clonal structure in 1st vs 2nd ###
+# Idea: subdivide in bins of abundance and plots classes in 1st and second on shared elements only
+
+library(ggalluvial)
+first_second_sharing_fold <- paste0(paperdata, "1st_vs_2nd_sharing/")
+dir.create(first_second_sharing_fold)
+
+clonal_structure_1st_2nd <- list(objects = list(), 
+                                 shared_elements = list(),
+                                 shared_elements_grouped = list())
+
+for(group in c("UM","UMSR")){
+  
+  clonal_structure_1st_2nd[["objects"]][[group]] <- readRDS(file = paste0(rootdir,"MinPct_noFILT/",group,"_MaxN_removal_seqtk/MinPct_noFILT_",group,"_MaxN_removal_objects.rds"))
+  shared_elements <- clonal_structure_1st_2nd[["objects"]][[group]]$counts_abund_matrices$pct_abund[rownames(clonal_structure_1st_2nd[["objects"]][[group]]$sharingbygroup[[paste0("CG_",group,"_vs_CG_",group,"_2nd")]]),]
+  
+  shared_elements$Barcode <- rownames(shared_elements)
+  shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
+  shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
+                                                 clonal_structure_1st_2nd[["objects"]][[group]]$sampleinfo, 
+                                                 by.x = "variable", by.y = "SampleID", 
+                                                 all.x = T, sort = F)
+  shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = paste0("CG_",group,"_"), replacement = "")
+  shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA 
+  clonal_structure_1st_2nd[["shared_elements"]][[group]] <- shared_elements_melt_annot
+  clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]] <- clonal_structure_1st_2nd[["shared_elements"]][[group]] %>% 
+    group_by(Group, Barcode) %>% 
+    summarise(mean_val = mean(value, na.rm = T), 
+              mean_VCN = mean(VCN, na.rm = T),
+              mean_Engraftment = mean(Engraftment, na.rm = T),
+              mean_PctNGFR = mean(PctNGFR, na.rm = T))
+  
+  clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$Category <- cut(x = clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$mean_val, 
+                                                                                 breaks = c(0,0.1,1,100),
+                                                                                 labels = c("rare (0 - 0.1%)",
+                                                                                            "medium (0.1 - 1%)",
+                                                                                            "expanded ( > 1%)"))
+  
+  png(filename = paste0(first_second_sharing_fold,"clonal_structure_1st_vs_2nd_",group,"_grouped_no0s.png"), 
+      width = 6, height = 6, units = "in", res = 300)
+  print(ggplot(clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]], 
+               mapping = aes(Group)) + 
+          geom_bar(mapping = aes(fill = Category), position = "fill") + 
+          ggtitle("Shared barcoded 1st vs 2nd", 
+                  subtitle = paste0("Abundance levels (CG-",group,") - grouped mean no 0s")) +
+          theme(plot.title = element_text(hjust = 0.5),
+                plot.subtitle = element_text(hjust = 0.5)))
+  dev.off()
+  
+  # Create an alluvial plot of this aggregated barcodes abundances between 1st and 2nd
+  
+  allbcs <- clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$Barcode
+  colpalette <- colorRampPalette(RColorBrewer::brewer.pal(12, name = "Paired"))(length(unique(allbcs)))
+  
+  png(filename = paste0(first_second_sharing_fold, "clonal_structure_1st_vs_2nd_",group,"_grouped_alluvium_track_no0s.png"), 
+      width = 6, height = 6, units = "in", res = 300)
+  print(ggplot(data = clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]], 
+               aes(x = Group, y = mean_val, alluvium = Barcode, label = Barcode)) +
+          ylab("Mean abundance %") + 
+          geom_alluvium(aes(fill = Barcode),
+                        alpha = .75, decreasing = FALSE, width = 1/4) +
+          geom_stratum(aes(stratum = Barcode, fill = Barcode), alpha = .50, decreasing = FALSE, width = 1/4) + 
+          scale_fill_manual(values = alpha(colour = colpalette, alpha = 0.50)) + 
+          theme_bw() + 
+          ggtitle(label = "Shared barcodes tracking", subtitle = paste0("CG-", group, " aggregation by group - mean no 0s")) +
+          theme(legend.position = "none", plot.title = element_text(hjust = 0.5), 
+                plot.subtitle = element_text(hjust = 0.5)))
+  dev.off()
+  
+}
+
+saveRDS(object = clonal_structure_1st_2nd, file = paste0(paperdata,"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_mean_no0s.rds")
+
+library(openxlsx)
+
+write.xlsx(x = clonal_structure_1st_2nd$shared_elements_grouped, 
+           file = paste0(paperdata"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_grouped_sharing_mean_no0s.xlsx"), asTable = T)
+
+# Combine image - larger size ####
+
+DataAnalysis <- readRDS(paste0(paperdata,"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_mean_no0s.rds"))
+for(i in names(DataAnalysis$shared_elements_grouped)){
+  DataAnalysis$shared_elements_grouped[[i]]$Condition <- i
+}
+
+mydata <- rbind.data.frame(DataAnalysis$shared_elements_grouped$UM, DataAnalysis$shared_elements_grouped$UMSR)
+mydata$Transplant <- "Primary"
+mydata$Transplant[grep(pattern = "2nd", x = mydata$Group)] <- "Secondary"
+mydata$Transplant <- gsub(x = mydata$Transplant, pattern = "Primary", replacement = "1st")
+mydata$Transplant <- gsub(x = mydata$Transplant, pattern = "Secondary", replacement = "2nd")
+
+allbcs <- mydata$Barcode
+colpalette <- colorRampPalette(RColorBrewer::brewer.pal(12, name = "Paired"))(length(unique(allbcs)))
+
+png(filename = paste0(paperdata,"Combo_clonal_structure_1st_vs_2nd_alluvium_track.png"), 
+    width = 6, height = 6, units = "in", res = 300)
+print(ggplot(data = mydata, 
+             aes(x = Transplant, y = mean_val, alluvium = Barcode, label = Barcode)) +
+        ylab("Mean abundance % - cumulative") + 
+        geom_alluvium(aes(fill = Barcode),
+                      alpha = .75, decreasing = FALSE, width = 1/4) +
+        geom_stratum(aes(stratum = Barcode, fill = Barcode), alpha = .50, decreasing = FALSE, width = 1/4) + 
+        scale_fill_manual(values = alpha(colour = colpalette, alpha = 0.50)) + 
+        facet_grid(. ~ Condition,scales = "free") +
+        ggtitle(label = "Shared BC tracking", subtitle = paste0("aggregation by group - mean (na.rm)")) +
+        theme(legend.position = "none", 
+              axis.text = element_text(size = 20, color = "black"),
+              axis.title = element_blank(),
+              axis.text.y.left = element_text(size = 20, angle = -270, hjust = 0.5, color = "black"),
+              panel.background = element_rect(fill = "white"),
+              strip.text.x = element_text(size = 20),
+              axis.line = element_line(linetype = 1),
+              strip.background.x = element_rect(fill = "white"),
+              plot.subtitle = element_blank(),
+              plot.title = element_blank(),
+      ))
+dev.off()
+
--- a/TD_Timing/Input/TD_Timing_samplesheet.txt
+++ b/TD_Timing/Input/TD_Timing_samplesheet.txt
+SampleID Group Cutoff MaxN VCN Engraftment PctNGFR
+ms01 DPlike 0.005 3 0.3164 52 23.8
+ms02 DPlike 0.005 3 0.3096 47.82 18.87
+ms03 DPlike 0.005 3 0.3579 44.54 29.45
+ms04 DPlike 0.005 3 0.5771 27.97 29.82
+ms05 DPlike 0.005 3 0.3527 34.71 18.85
+ms06 DPlike 0.005 3 0.4915 35.2 28.92
+ms09 EXU1 0.005 4 0.533 31.16 33.22
+ms10 EXU1 0.005 4 0.268 24.41 18.49
+ms11 EXU1 0.005 4 0.211 30.44 16.98
+ms12 EXU1 0.005 4 0.26	32.4 23.76
+ms13 EXU1 0.005 4 0.16	38.08 17.94
+ms14 EXU1 0.005 4 0.263 62.23 28.24
+ms15 EXU1 0.005 4 0.457 57.21 25.87
+ms16 EXU1 0.005 4 0.174 79.3	17.35
+ms17 EXU3 0.005 4 3.005 9.22	94.96
+ms18 EXU3 0.005 4 1.256 7.16	40.59
+ms19 EXU3 0.005 4 2.849 1.64 83.84
+ms20 EXU3 0.005 4 2.086 3.19	46.81
+ms21 EXU3 0.005 4 1.638 15.03 76.04
+ms22 EXU3 0.005 4 0.339 13.35 24.1
+ms23 EXU3 0.005 4 1.802 4.04 74.94
+ms24 EXU3 0.005 4 0.252 47.76 14.87
+ms25 EXU2 0.005 3 1.028 37.38 54.43
+ms26 EXU2 0.005 3 0.807 68.93 49.12
+ms27 EXU2 0.005 3 0.649 59.51 40.94
+ms29 EXU2 0.005 3 0.664 35.69 41.24
+ms30 EXU2 0.005 3 0.588 53.67 37.27
+ms31 EXU2 0.005 3 0.623 39.7 36.04
+ms32 EXU2 0.005 3 0.718 62.29 41.74
--- a/TD_Timing/TD_Timing_analysis.R
+++ b/TD_Timing/TD_Timing_analysis.R
+# Run Analysis RI (DP-like) and TI (EXU1-3) samples 
+
+library(vegan)
+library(dplyr)
+source(file = "../ChartingBCs_Light.R")
+library(RColorBrewer)
+library(ggplot2)
+
+paperdata <- "PaperData/"
+dir.create(path = paperdata)
+
+rootdir <- "./"
+bseqdir <- paste0(rootdir,"Input/")
+samplesheets <- list(MinPct = paste0(bseqdir,"TD_Timing_samplesheet.txt"))
+
+# SampleIDs / groups present in the paper
+
+DPlike_Exp2022_05 <- c("ms01","ms02","ms03","ms04","ms05","ms06")
+EXU1_Exp2022_05 = c("ms09","ms10","ms11","ms12","ms13","ms14","ms15","ms16")
+EXU3_Exp2022_05 = c("ms17","ms18","ms19","ms20","ms22","ms23","ms24")
+EXU2_Exp2022_05 = c("ms25","ms26","ms27","ms29","ms30","ms31","ms32")
+
+SampleIDs <- list(
+                  DPlike_Exp2022_05 = DPlike_Exp2022_05,
+                  EXU1_Exp2022_05 = EXU1_Exp2022_05,
+                  EXU2_Exp2022_05 = EXU2_Exp2022_05,
+                  EXU3_Exp2022_05 = EXU3_Exp2022_05,
+		  RITIs_Exp2022_05 = c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)
+                  )
+
+#### Basic analysis ####
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in names(SampleIDs)) {
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting,
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+MinPct_RITIs_Exp2022_05_MaxN_removal_objects <- readRDS("MinPct_RITIs_Exp2022_05_MaxN_removal_seqtk/MinPct_RITIs_Exp2022_05_MaxN_removal_objects.rds")
+  
+mylist <- list(Exp2022_05 = MinPct_RITIs_Exp2022_05_MaxN_removal_objects)
+
+shared_RITIs_list <- list()
+
+for(j in names(mylist)){
+  tmp.obj <- mylist[[j]]
+  for(comp in names(tmp.obj$sharingbygroup)){
+    else{
+      shared_RITIs_list[[paste0(j,"_",comp)]] <- rownames(tmp.obj$sharingbygroup[[comp]])  
+    }
+  }  
+}
+
+shared_RITIs <- list(Exp2022_05 = unique(unlist(shared_RITIs_list[grep(pattern = "Exp2022_05", value = T, x = names(shared_RITIs_list))])))
+saveRDS(object = shared_RITIs, "PaperData/shared_RITIs_step01.rds")
+
+library(dplyr)
+
+shared_RITIs_mt <- list()
+shared_RITIs_mt_pcts <- list()
+
+for(j in names(mylist)){
+  tmp.obj <- mylist[[j]]
+  shared_RITIs_mt[[j]] <- tmp.obj$counts_abund_matrices$logCPM[shared_RITIs[[j]],c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)]
+  shared_RITIs_mt[[j]] <- as.matrix(arrange_all(shared_RITIs_mt[[j]],desc))
+  shared_RITIs_mt_pcts[[j]] <- tmp.obj$counts_abund_matrices$pct_abund[shared_RITIs[[j]],c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)]
+  shared_RITIs_mt_pcts[[j]] <- as.matrix(arrange_all(shared_RITIs_mt_pcts[[j]],desc))
+}
+
+# exclude BCs that are shared within group from the list of BCs to be re-assigned ####
+
+within_objs <- list()
+
+for(j in names(mylist)){
+  ids <- NULL
+  if(j == "Exp2022_05"){
+    ids <- c("DPlike_Exp2022_05","EXU1_Exp2022_05","EXU2_Exp2022_05","EXU3_Exp2022_05")
+  }
+  for(id in ids){
+    within_objs[[j]][[id]] <- readRDS(paste0("MinPct/",id,"_MaxN_removal_seqtk/MinPct_",id,"_MaxN_removal_objects.rds"))
+    tmpdf <- as.data.frame(within_objs[[j]][[id]]$aggregationbygroup_melt)
+    print(head(rownames(tmpdf)))
+    within_objs[[j]][[id]] <- rownames(tmpdf)[which(tmpdf[[id]] > 1)]
+    writeLines(text = within_objs[[j]][[id]] , con = paste0(paperdata,"/Shared_barcodes_within_",id,".txt"))
+  }
+}
+
+# the barcodes above (within_objs lists) should be discarded from collision rule  - fetching their index ####
+
+index_to_retain <- list()
+ratio_max_to_2nd <- list()
+reassignment_rule <- list()
+mydf_weightedratio <- list()
+means <- list()
+
+for(j in names(mylist)){
+  index_to_retain[[j]] <- which(!rownames(shared_RITIs_mt_pcts[[j]]) %in% unique(unlist(within_objs[[j]])))
+  barcodes2exclude <- intersect(rownames(shared_RITIs_mt_pcts[[j]]), unique(unlist(within_objs[[j]])))
+  writeLines(text = barcodes2exclude, 
+             con = paste0("PaperData/barcodes_shared_both_within_excluded_from_collision_",j,".txt"))
+  
+  shared_RITIs_mt_pcts[[j]] <- shared_RITIs_mt_pcts[[j]][index_to_retain[[j]],]
+  shared_RITIs_mt[[j]] <- shared_RITIs_mt[[j]][index_to_retain[[j]],]
+  
+  sds <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
+  maxs <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = max)
+  means[[j]] <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = mean)
+  second <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]}) 
+  #reassignment_rule[[j]] <- ifelse((maxs - sds) > second, 1, 0)
+  #print(reassignment_rule[[j]])
+  
+  ratio_max_to_2nd[[j]] <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = max) / apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
+  mydf_weightedratio[[j]] <- as.data.frame(ratio_max_to_2nd[[j]])
+  colnames(mydf_weightedratio[[j]]) <- "ratioMax2Second"
+  mydf_weightedratio[[j]]$weighted <- mydf_weightedratio[[j]]$ratioMax2Second * means[[j]]
+  mydf_weightedratio[[j]]$logged <- log10(mydf_weightedratio[[j]]$weighted)
+  
+  reassignment_rule[[j]] <- ifelse(mydf_weightedratio[[j]]$logged > -1, 1, 0)
+  names(reassignment_rule[[j]]) <- rownames(mydf_weightedratio[[j]])
+  
+  saveRDS(object = ratio_max_to_2nd[[j]], file = paste0("PaperData/ratio_max_to_2nd_pcts_",j,"_nofilt_within.rds"))
+
+  png(paste0("PaperData/",j,"_Baplot_ratio_max_2nd.png"), width = 9, height = 6, units = "in", res = 200)
+  barplot(log10(ratio_max_to_2nd[[j]][order(ratio_max_to_2nd[[j]], decreasing = T)]), xaxt='n',
+          xlab = "Shared barcodes",
+          main = "log10 ratio max value vs 2nd")
+  abline(h = 1, col = "red")
+  dev.off()
+}
+
+saveRDS(object = mydf_weightedratio, file = paste0(paperdata,"/mydf_weightedratio.rds"))
+saveRDS(object = ratio_max_to_2nd, file = paste0(paperdata,"/ratio_max_to_2nd.rds"))
+saveRDS(object = reassignment_rule, file = paste0(paperdata,"/reassignment_rule.rds"))
+saveRDS(object = means, file = paste0(paperdata,"/means.rds"))
+
+library(reshape2)
+
+shared_RITIs_mt_melt <- list()
+
+barcodes_to_reassign_A <- list()
+barcodes_to_reassign_A_names <- list()
+
+for(j in names(mylist)){
+  barcodes_to_reassign_A[[j]] <- names(ratio_max_to_2nd[[j]][ratio_max_to_2nd[[j]] > 10]) # filtering 
+  barcodes_to_reassign_A_names[[j]] <- colnames(mylist[[j]]$counts_abund_matrices$pct_abund[barcodes_to_reassign_A[[j]],])[apply(X = mylist[[j]]$counts_abund_matrices$pct_abund[barcodes_to_reassign_A[[j]],], MARGIN = 1, FUN = function(x){which.max(x)})]
+  names(barcodes_to_reassign_A[[j]]) <- barcodes_to_reassign_A_names[[j]]
+  
+  saveRDS(object = barcodes_to_reassign_A[[j]], file = paste0("PaperData/barcodes_to_reassign_A_",j,".rds"))
+  
+  pheatmap::pheatmap(mat = shared_RITIs_mt[[j]], cluster_cols = F, cluster_rows = F,
+                     scale = "none", filename = paste0(paperdata, "Shared_barcodes_RITIs_logCPM_before_reassignment_",j,".png"), 
+                     main = paste0("Shared barcodes (",nrow(shared_RITIs_mt[[j]]),") between TIs vs TIs vs RI (replicates)", "\n", 
+                                   "logCPM counts"),
+                     show_rownames = F,
+                     width = 9, height = 6, color = rev(brewer.pal(n = 9, "Spectral")))
+  
+  shared_RITIs_mt_melt[[j]] <- as.data.frame(melt(shared_RITIs_mt[[j]]))
+  
+  png(filename = paste0(paperdata, "Shared_barcodes_RITIs_logCPM_ggplot_before_reassignment_",j,".png"), 
+      width = 9, height = 6, units = "in", res = 300)
+  print(ggplot(data = shared_RITIs_mt_melt[[j]], 
+         mapping = aes(x = Var2, y = Var1, fill = value)) + 
+    geom_tile() + xlab("Samples") + ylab("Barcodes") + 
+   # scale_x_discrete(guide = guide_axis(n.dodge=2)) +
+    ggtitle(paste0("Shared barcodes (",nrow(shared_RITIs_mt[[j]]),") between TIs vs TIs vs RI"), subtitle = "logCPM counts") +
+    theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), 
+          axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank(),
+          axis.text.x = element_text(angle = 90, vjust = 0.5)) +
+    scale_fill_distiller(palette = "Spectral", direction = -1))
+  dev.off()
+}
+
+
+# Note: barcodes shared also within condition are removed from re-assignment ####
+
+for(setting in c("MinPct")){
+  
+  dir.create(path = setting, showWarnings = F)
+  
+  if(setting  == "Auto"){
+    custom.cut.param <- FALSE
+  }else{
+    custom.cut.param <- TRUE
+  }
+  
+  if(setting  == "MinCPM"){
+    MinlogCPMFilter.param <- TRUE
+  }else{
+    MinlogCPMFilter.param <- FALSE
+  }
+  
+  for (SampleGroup in c("DPlike_Exp2022_05","EXU1_Exp2022_05","EXU2_Exp2022_05","EXU3_Exp2022_05","RITIs_Exp2022_05")) { # pickup only groups that contain 1st transplant samples
+    
+    chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], 
+                      custom.cut = custom.cut.param, do.filter = T, 
+                      method = setting, bcodes2reassign =  barcodes_to_reassign_A[["Exp2022_05"]],
+                      removeMaxN = TRUE, 
+                      MinlogCPMFilter = MinlogCPMFilter.param, 
+                      sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3, 
+                      projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal_reassignedA"), 
+                      outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassignA"),sep = "/"),
+                      counts_vector = c(5,20,50,100,200,500,5000,20000), 
+                      pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
+  }
+}
+
+#### sharing intra condition #####
+
+library(openxlsx)
+library(ggplot2)
+library(ggrepel)
+
+# load input for each batch ####
+
+processed_data_A <- list(Exp2022_05 = list())
+
+for(batch in names(processed_data_A)){
+  processed_data_A[[batch]][["DPlike"]] <- readRDS(paste0("MinPct/DPlike_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_DPlike_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
+  processed_data_A[[batch]][["EXU1"]] <- readRDS(paste0("MinPct/EXU1_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU1_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
+  processed_data_A[[batch]][["EXU2"]] <- readRDS(paste0("MinPct/EXU2_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU2_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
+  processed_data_A[[batch]][["EXU3"]] <- readRDS(paste0("MinPct/EXU3_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU3_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
+}
+
+# Sharing intra group 
+
+sharing_intra_A <- list(Exp2022_05 = list()) # strategy A - final choice
+
+for(batch in names(sharing_intra_A)){
+  
+  for(k in c("DPlike","EXU1","EXU2","EXU3")){
+    sharing_intra_A[[batch]][[k]] <- read.table(file = paste0("MinPct/",k,"_",batch,"_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_",k,"_",batch,"_MaxN_removal_reassignedA.txt"),
+                                                   nrow = length(SampleIDs[[paste0(k,"_",batch)]]))
+  }
+}
+
+### Plotting intragroup sharing boxplots #####
+
+for(j in names(sharing_intra_A)){
+  for(d in names(sharing_intra_A[[j]])){
+    # A type reassignment
+    sharing_intra_A[[j]][[d]] <- subset(sharing_intra_A[[j]][[d]], select = c("V2","V5")) 
+    sharing_intra_A[[j]][[d]]$Group <- d
+    colnames(sharing_intra_A[[j]][[d]]) <- c("SampleID","pct.share_1vsAll","Group")
+    sharing_intra_A[[j]][[d]]$pct.share_1vsAll <- gsub(x = sharing_intra_A[[j]][[d]]$pct.share_1vsAll, pattern = "%", replacement = "")
+  }
+}
+
+
+colorvector <- c("#D6D6D6","#EA4125","#DCFA8A","#194375")
+names(colorvector) <- c("DPlike","EXU1","EXU2","EXU3")
+
+## Plotting 
+
+for(j in names(sharing_intra_A)){
+  df <- rbind(sharing_intra_A[[j]][["DPlike"]], sharing_intra_A[[j]][["EXU1"]])
+  df <- rbind(df, sharing_intra_A[[j]][["EXU2"]])
+  df <- rbind(df, sharing_intra_A[[j]][["EXU3"]])
+  df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
+  df$Group <- factor(df$Group, levels = c("DPlike", "EXU1", "EXU2","EXU3"))
+  
+  write.xlsx(df, file = paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_",j,"_RITIs_reassignmentA.xlsx"), asTable = T)
+  
+  png(paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_",j,"_RITIs_reassignmentA.png"), width = 6, height = 6, units = "in", res = 300)
+  print(ggplot(df, mapping = aes(x = Group, y = pct.share_1vsAll, color = Group, label = SampleID)) + 
+    geom_boxplot(outlier.shape = NA) + geom_jitter(position = position_jitter(seed = 6), show.legend = F) +
+    #geom_text_repel(position = position_jitter(seed = 6), show.legend = F) +
+    theme_bw() + ylab("% sharing") +
+    ggtitle(label = "Barcoding sharing intra-group", subtitle = paste0( j, "- One vs All - reassign A")) +
+    theme(axis.title.x = element_blank(), 
+          legend.position = c(0.15, 0.8),
+          plot.title = element_text(hjust = 0.5), 
+          plot.subtitle = element_text(hjust = 0.5)))
+  dev.off()
+}
+
+## fix colors and labels for pubblication ####
+
+intra_batch_exp2022_05 <- read.xlsx(paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_Exp2022_05_RITIs_reassignmentA.xlsx"))
+
+# Shannons
+
+shannons_list_A <- list()
+
+for(batch in names(processed_data_A)){
+  for(k in names(processed_data_A[[batch]])){
+    shannon <- diversity(processed_data_A[[batch]][[k]]$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
+    shannon_df <- as.data.frame(shannon)
+    colnames(shannon_df) <- "Shannon"
+    shannons_list_A[[batch]][[k]] <- shannon_df
+  }
+}
+
+shannon_Exp2022_05_A <- rbind(shannons_list_A$Exp2022_05$DPlike, shannons_list_A$Exp2022_05$EXU1) 
+shannon_Exp2022_05_A <- rbind(shannon_Exp2022_05_A, shannons_list_A$Exp2022_05$EXU2)
+shannon_Exp2022_05_A <- rbind(shannon_Exp2022_05_A, shannons_list_A$Exp2022_05$EXU3)
+
+saveRDS(shannon_Exp2022_05_A, paste0(paperdata,"Shannons.rds"))
+