Commit 0cb7bdea authored by Matteo Barcella's avatar Matteo Barcella
Browse files

Script barcoding analysis

parent 6d9d4857
chartingBCs_light <- function(bseq.fold,
sampleids, # SampleIDs vector to include in the analysis
ed = 2, # edit distance
mc = 3, # minimum counts
bcodes2reassign = NULL, # vector of barcodes to be removed [is applied to each sample] : if null no barcodes are removed at all
samplesheet = NULL, # columns: sampleid,group,custom_threshold (rank to cut)
do.filter = TRUE, # TRUE/FALSE to apply filtering or not
projectid = "Exp2020_06_MMs", # prefix to files in output folder
outfolder, # Output folder
counts_vector = c(10,20,30,50,100,200), # vector with counts threshold for heatmap (all barcodes with n > cut-off in at least 1 sample)
pcts_vector = c(0.001,0.01,0.1,1,2,3,5,10,20), # vector with pct threshold for heatmap (all barcodes with % > cut-off in at least 1 sample)
custom.cut = FALSE, # TRUE/FALSE to apply filtering using custom_theshold / automatic
method = "MaxSat", # Possible values: MaxSat / MinPct
MinlogCPMFilter = FALSE, # with custom.cut TRUE and CutOff > 0; if FALSE: filter rank based, if TRUE: logCPM filter based
removeMaxN = TRUE # Performing removal of barcodes present in at least 2 samples within group
){
# Script for analyzing barcodes
# AIM:
# retrieve raw stats
# import data and calculate CPMs for each sample + other stats
# define threshold for each sampleID (with all barcodes including the ones with Ns)
# subset barcodes for each sample according to sd1_log rank value or custom specified thresholds
# rerun calculation of CPMs and abundances
# create plots including:
#
# circosplots
# Boxplots
library(data.table)
library(reshape2)
library(openxlsx)
library(ComplexHeatmap)
library(gridExtra)
library(stringr)
library(dplyr)
library(RColorBrewer)
library(edgeR)
library(ggplot2)
library(ggalluvial)
library(circlize)
library(chorddiag)
library(htmlwidgets)
library(UpSetR)
library(stringi)
#### GLOBAL SETTINGS ####
#ht_opt(RESET = TRUE)
# Header style for excel outputs
hs <- createStyle(
textDecoration = "BOLD", fgFill = "#1D8348",
)
# Setting scientific option
options(scipen = 999)
# Output folder
if(!is.null(outfolder)){
outfolder <- paste0(outfolder, "/")
dir.create(outfolder, recursive = T, showWarnings = F)
}else{
stop("Please set the output folder ..")
}
tabledir <- paste(outfolder,"Tables/", sep = "/")
dir.create(tabledir, recursive = T, showWarnings = F)
plotdir <- paste(outfolder,"Plots/", sep = "/")
dir.create(plotdir, recursive = T, showWarnings = F)
# Retrieve group info from samplesheet file
annot <- read.table(file = samplesheet, header = T, stringsAsFactors = F)
annot <- subset(x = annot, SampleID %in% sampleids)
print(sampleids)
sampleids <- annot$SampleID
print(sampleids)
#### FUNCTIONS ####
datacharts <- function(indata, prefix = "raw", plotdir, projectid = "project", mc, hs, tabledir){
fulldata_tmp <- indata
#### Count distribution pdf file creation ####
pdf(file = paste0(plotdir, projectid,"_",prefix,"_distribution_counts.pdf"), width = 12, height = 9)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, color = SampleID)) +
geom_line() + ggtitle("Rank by SampleID")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, color = SampleID)) +
geom_line() + facet_wrap( . ~ SampleID) + ggtitle("Rank by SampleID - split")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, color = SampleID)) +
geom_line() + ggtitle("Normalized Rank by SampleID")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, color = SampleID)) +
geom_line() + facet_wrap( . ~ SampleID) + ggtitle("Normalized by SampleID - split")
plot(myplot)
if(length(unique(fulldata_tmp$Group)) > 1){
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, group = SampleID, color = Group)) +
geom_line() + ggtitle("Rank by Group")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = Rank, y = logCPM, group = SampleID, color = Group)) +
geom_line(aes(color = Group)) + facet_wrap( . ~ Group) + ggtitle("Rank by Group - split")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, group = SampleID, color = Group)) +
geom_line(aes(color = Group)) + ggtitle("Normalized Rank by Group")
plot(myplot)
myplot <- ggplot(data = fulldata_tmp, mapping = aes(x = RankNorm, y = logCPM, group = SampleID, color = Group)) +
geom_line(aes(color = Group)) + facet_wrap( . ~ Group) + ggtitle("Normalized by Group - split")
plot(myplot)
}
dev.off()
# REMOVING temporal data
# rm(indata)
#### Collect raw stats ####
# filter fulldata according to min.counts (mc) and collecting number of barcodes by sampleID
BarcodeCounts <- fulldata_tmp[Counts >= mc, .(nBarcodes = .N), by = SampleID ]
colnames(BarcodeCounts) <- c("SampleID","nBarcodes")
pdf(paste0(plotdir, projectid,"_",prefix,"_counts_by_coverage.pdf"), width = 12, height = 9)
plot(ggplot(data = BarcodeCounts, mapping = aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
geom_bar(stat = "identity") + ggtitle(paste0("Minimun read counts: ", mc)) +
theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
)
colnames(BarcodeCounts) <- c("SampleID",paste0("nBarcodes_",mc))
for(i in c(5,10,20,50,80,100,200,500,1000,1500,2000,3000,5000,10000)){
tmp <- fulldata_tmp[Counts >= i, .(nBarcodes = .N), by = SampleID ]
colnames(tmp) <- c("SampleID","nBarcodes")
plot(ggplot(data = tmp, mapping = aes(x = SampleID, y = nBarcodes, fill = SampleID)) +
geom_bar(stat = "identity") + ggtitle(paste0("Minimun read counts: ", i)) +
theme(axis.text = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
)
colnames(tmp) <- c("SampleID",paste0("nBarcodes_",i))
BarcodeCounts <- merge(BarcodeCounts,tmp, by = "SampleID")
}
dev.off() # closing multichart
BarcodeCounts <- as.data.frame(t(BarcodeCounts))
colnames(BarcodeCounts) <- BarcodeCounts[1,]
cols_back <- colnames(BarcodeCounts)
rows_back <- rownames(BarcodeCounts)
BarcodeCounts <- as.data.frame(BarcodeCounts[2:nrow(BarcodeCounts),])
colnames(BarcodeCounts) <- cols_back
rownames(BarcodeCounts) <- rows_back[2:length(rows_back)]
BarcodeCounts$MinCount <- rownames(BarcodeCounts)
BarcodeCounts <- BarcodeCounts[,c("MinCount",colnames(BarcodeCounts)[1:(ncol(BarcodeCounts)-1)])]
write.xlsx(x = BarcodeCounts, file = paste0(tabledir, projectid, "_",prefix,"_N.barcodes_by_min_counts.xlsx"),
asTable = T, colWidths = "auto", firstRow = T, rowNames = F, headerStyle = hs)
# Casting and transform to barcode x sampleid matrix
all_counts_casted <- dcast(fulldata_tmp, formula = Barcode ~ SampleID, value.var = 'Counts')
all_counts_casted <- as.data.frame(all_counts_casted)
all_counts_casted[is.na(all_counts_casted)] <- 0 # replace NAs with 0
rownames(all_counts_casted) <- all_counts_casted$Barcode
all_counts_casted$Barcode <- NULL
# Create binary version
all_counts_casted_binary <- all_counts_casted
all_counts_casted_binary[all_counts_casted_binary > 0] <- 1
all_counts_casted_binary <- as.matrix(all_counts_casted_binary)
# matrix of abundances in pct
percent_abundance <- prop.table(x = as.matrix(all_counts_casted), margin = 2)*100 # create % abundances matrix
percent_abundance_melt <- melt(percent_abundance)
percent_abundance <- as.data.frame(percent_abundance)
# matrix in CPM
CPM <- as.data.frame(cpm(y = all_counts_casted))
# matrix in logCPM
logCPM <- log1p(CPM)
#### save list of raw matrices in xlsx format ####
write.xlsx(x = list(Raw = as.data.frame(all_counts_casted),
Binary = as.data.frame(all_counts_casted_binary),
CPM = as.data.frame(CPM),
logCPM = as.data.frame(logCPM),
pct_abund = percent_abundance),
file = paste0(tabledir,projectid,"_",prefix,"_counts_matrices.xlsx"), asTable = T,overwrite = T,
rowNames = T, colWidths = "auto", firstRow = T, firstCol = T, headerStyle = hs)
write.xlsx(x = list(full = as.data.frame(fulldata_tmp)), file = paste0(tabledir,projectid,"_",prefix,"_fulldata.xlsx"),
overwrite = T,
asTable = T, colWidths = "auto",
firstRow = T, rowNames = F, headerStyle = hs)
return(list(Raw = as.data.frame(all_counts_casted),
Binary = as.data.frame(all_counts_casted_binary),
CPM = as.data.frame(CPM),
logCPM = as.data.frame(logCPM),
pct_abund = percent_abundance))
}
#### IMPORTING RAW DATA ####
fulldata <- fread(input = paste0(bseq.fold,"/",sampleids[1],"/",sampleids[1],"_edit_",ed,".barcode.mc",mc,".tsv"))
#print(paste0("Before removal of unwanted barcodes : ", nrow(fulldata)))
fulldata$SampleID <- sampleids[1]
colnames(fulldata) <- c("Barcode","Counts","SaturationOri","SampleID")
fulldata$Rank <- as.numeric(rownames(fulldata))
fulldata$RankNorm <- fulldata$Rank / nrow(fulldata)
fulldata$Percentage <- fulldata$Counts / sum(fulldata$Counts) * 100
fulldata$Saturation <- cumsum(fulldata$Percentage)
fulldata$logCPM <- log1p(fulldata$Counts / sum(fulldata$Counts)*1000000)
for(id in sampleids[2:length(sampleids)]){
indata <- fread(input = paste0(bseq.fold,"/",id,"/",id,"_edit_",ed,".barcode.mc",mc,".tsv"))
indata$SampleID <- id
colnames(indata) <- c("Barcode","Counts","SaturationOri","SampleID")
indata$Rank <- as.numeric(rownames(indata))
indata$RankNorm <- indata$Rank / nrow(indata)
indata$Percentage <- indata$Counts / sum(indata$Counts) * 100
indata$Saturation <- cumsum(indata$Percentage)
indata$logCPM <- log1p(indata$Counts / sum(indata$Counts)*1000000)
fulldata <- rbind(fulldata, indata)
}
# Adding info from samplesheet
fulldata <- data.table(merge.data.frame(x = fulldata, y = annot, by = "SampleID", all.x = T, sort = F))
## Adding new columns to final full data including:
# Pct_x_VCN = Percentage * VCN
# Pct_x_Engraft
fulldata$Pct_x_EngraftWBM <- fulldata$Percentage * (fulldata$Engraftment/100) * (fulldata$PctNGFR/100)
raw_objects <- datacharts(indata = fulldata, prefix = "raw", plotdir = plotdir, projectid = projectid, mc = mc , hs = hs, tabledir = tabledir)
raw_objects$fulldata <- fulldata
#### DERIVATES ####
fulldata_clean_list <- list()
deriv <- function(x, y) diff(y) / diff(x)
middle_pts <- function(x) x[-1] - diff(x) / 2
combined_plots_p1 <- list()
combined_plots_p2 <- list()
cut_thresh <- list()
for (sample in unique(fulldata$SampleID)) {
t <- subset(fulldata, SampleID == sample) # subsetting specific sample
d1log <- deriv(log10(t$Rank), log10(t$Counts))
sd1_log <- which.min(diff(d1log)) + 1
d1logCPM <- deriv(t$Rank, t$logCPM)
sd1_logCPM <- which.min(diff(d1logCPM)) + 1
prova <- deriv(log10(t$Rank), t$logCPM)
sd1_prova <- which.min(diff(prova)) + 1
cut_thresh[[sample]] <- t$Rank[sd1_log]
}
#### PERFORM CUT ####
pdf(file = paste0(plotdir, projectid,"_raw_barcode_distribution_cut_offs.pdf"), width = 12, height = 6)
# for (sample in unique(fulldata$SampleID)) {
for (sample in annot$SampleID) {
t <- subset(fulldata, SampleID == sample)
# removing barcodes ACCORDING TO threshold
mythresh <- ifelse(isTRUE(custom.cut), yes = annot[annot$SampleID == sample,]$Cutoff, no = cut_thresh[[sample]])
if(isTRUE(do.filter)){
if(mythresh > 1){
if(isTRUE(MinlogCPMFilter)){
print("Going with MinCPM filter ...")
t_filt <- t[t$logCPM > mythresh,]
p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs logCPM: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$logCPM > mythresh,])], color = "#3a93bd",show.legend = T)
combined_plots_p1[[sample]] <- p1
p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs Counts: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$logCPM > mythresh,])], color = "#3a93bd",show.legend = T)
combined_plots_p2[[sample]] <- p2
print(grid.arrange(p1, p2, ncol = 2))
}else{
print("Going with rank based filter or auto according to custom.cut parameter...")
t_filt <- t[t$Rank < mythresh,]
p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs logCPM: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[mythresh], color = "#3a93bd",show.legend = T)
combined_plots_p1[[sample]] <- p1
p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs Counts: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[mythresh], color = "#3a93bd",show.legend = T)
combined_plots_p2[[sample]] <- p2
print(grid.arrange(p1, p2, ncol = 2))
}
}else{
if(method == "MinPct"){
print("Going with percentage filter ...")
#t_filt <- t[t$Percentage > mythresh,]
t_filt <- subset(t, Percentage > mythresh)
p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs logCPM: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$Percentage > mythresh,])], color = "#3a93bd",show.legend = T)
combined_plots_p1[[sample]] <- p1
p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs Counts: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$Percentage > mythresh,])], color = "#3a93bd",show.legend = T)
combined_plots_p2[[sample]] <- p2
print(grid.arrange(p1, p2, ncol = 2))
}
else if(method == "MaxSat"){
print("Going with saturation filter ...")
t_filt <- t[t$Saturation < (mythresh*100),]
#t_filt <- subset(t, subset = NewAbund < (mythresh * 100))
p1 <- ggplot(t, aes(x = Rank, y = logCPM)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs logCPM: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$Saturation < (mythresh*100),])], color = "#3a93bd",show.legend = T)
combined_plots_p1[[sample]] <- p1
p2 <- ggplot(t, aes(x = Rank, y = Counts)) +
theme_bw() +
theme() +
ggtitle(paste0("Rank vs Counts: ", sample)) +
geom_point() +
geom_line() +
geom_vline(xintercept = t$Rank[nrow(t[t$Saturation < mythresh*100,])], color = "#3a93bd",show.legend = T)
combined_plots_p2[[sample]] <- p2
print(grid.arrange(p1, p2, ncol = 2))
}
else{
stop("Wrong method !!")
}
}
}else{
t_filt <- t
}
# removing barcodes with Ns
t_filt <- t_filt[grep(x = t_filt$Barcode, pattern = "N", invert = T),]
# Re-calculate metrics
t_filt$Rank <- as.numeric(rownames(t_filt))
t_filt$RankNorm <- t_filt$Rank / nrow(t_filt)
t_filt$Percentage <- t_filt$Counts / sum(t_filt$Counts) * 100
t_filt$Saturation <- cumsum(t_filt$Percentage)
t_filt$logCPM <- log1p(t_filt$Counts / sum(t_filt$Counts)*1000000)
fulldata_clean_list[[sample]] <- t_filt
}
dev.off()
# Re-create fulldata and run the charts
fulldata <- NULL
fulldata <- fulldata_clean_list[[sampleids[1]]]
for(id in sampleids[2:length(sampleids)]){
fulldata <- rbind(fulldata, fulldata_clean_list[[id]])
}
saveRDS(fulldata, paste0(outfolder, "fulldata_post_curve_filt_objects.rds"))
#### RUN FILTERING ACCORDING TO SHARING WITHIN GROUP ####
if(isTRUE(removeMaxN)){
bc_by_group <- fulldata[, .(cnt=.N) , by = c("Barcode","Group")]
bcode2remove <- NULL
for (g in unique(annot$Group)) {
print(paste0("Collecting barcodes present in more than ", max(annot[annot$Group == g,]$MaxN), " samples in group ", g, " ..."))
bcode2remove <- c(bcode2remove,bc_by_group[bc_by_group$Group == g & bc_by_group$cnt > max(annot[annot$Group == g,]$MaxN),]$Barcode)
print(paste0("Number of barcodes identified: ", length(bcode2remove)))
}
print(paste0("Total number of barcodes to remove from analysis: ", length(unique(bcode2remove))))
removed_data <- fulldata[fulldata$Barcode %in% unique(bcode2remove),]
fulldata <- fulldata[!fulldata$Barcode %in% unique(bcode2remove),]
print(x = paste0("Prima stampa"))
print(head(fulldata))
fulldata <- as.data.table(fulldata %>% group_by(SampleID) %>%
mutate(Percentage = Counts / sum(Counts) * 100) %>%
mutate(Saturation = cumsum(Percentage)) %>%
mutate(Rank = row_number()) %>%
mutate(RankNorm = Rank/max(Rank)))
print(paste0("Seconda stampa .. post-ricalcolo percentuali"))
print(head(fulldata))
write.xlsx(x = list(full = as.data.frame(removed_data)), file = paste0(tabledir,projectid,"_removed_data.xlsx"),
overwrite = T,
asTable = T, colWidths = "auto", firstRow = T, rowNames = F, headerStyle = hs)
}
# Apply eventual re-assignment of barcodes to most expressing sample - according to the bcodes2reassign vector #
#saveRDS(fulldata, file = paste0(projectid,"_fulldata_tmp.rds"))
if(!is.null(bcodes2reassign)){
tmpdf_reass <- as.data.frame(bcodes2reassign)
colnames(tmpdf_reass) <- c("bcodes2reassign")
tmpdf_reass$SampleID <- names(bcodes2reassign)
tmpdf_reass$SampleBC <- paste0(tmpdf_reass$SampleID,"_", tmpdf_reass$bcodes2reassign)
fulldata$SampleBC <- paste0(fulldata$SampleID, "_", fulldata$Barcode)
fulldata_good <- fulldata[!fulldata$Barcode %in% bcodes2reassign, ]
print("fulldata_good print - elements shared within groups or not shared between groups")
print(dim(fulldata_good))
fulldata_reassigned <- fulldata[fulldata$SampleBC %in% tmpdf_reass$SampleBC, ]
print("fulldata_reassigned print - elements to reassing")
print(dim(fulldata_reassigned))
fulldata2 <- rbind(fulldata_good, fulldata_reassigned)
print(dim(fulldata2))
fulldata2 <- fulldata2[order(factor(fulldata2$SampleID, levels=rev(sampleids)),
fulldata2$Counts, decreasing = T
),]
fulldata2$SampleBC <- NULL
fulldata2 <- as.data.table(fulldata2 %>% group_by(SampleID) %>%
mutate(Percentage = Counts / sum(Counts) * 100) %>%
mutate(Saturation = cumsum(Percentage)) %>%
mutate(Rank = row_number()) %>%
mutate(RankNorm = Rank/max(Rank)))
print("Fulldata 2 print .....")
#head(fulldata2)
# fulldata2 <- as.data.frame(fulldata2 %>% group_by(SampleID) %>%
# mutate(Percentage_new = Counts / sum(Counts) * 100) %>%
# mutate(Saturation_new = cumsum(Percentage_new))) %>%
# mutate(Rank_new = row_number()) %>%
# mutate(RankNorm_new = Rank_new/max(Rank_new))
dim(fulldata2)
fulldata <- fulldata2
print(dim(fulldata))
rm(fulldata2)
}
print(dim(fulldata))
#### Collect data and stats after filtering ####
filtered_objects <- datacharts(indata = fulldata, prefix = "filtered", plotdir = plotdir, projectid = projectid, mc = mc, hs = hs, tabledir = tabledir)
#saveRDS(fulldata, file = paste0(projectid,"_fulldata_tmp2.rds"))
# Creating jaccard matrix
jaccard <- function(a, b) {
intersection = length(intersect(a, b))
union = length(a) + length(b) - intersection
return (intersection/union)
}
jacc_mt <- matrix(0, nrow = length(unique(fulldata$SampleID)),
ncol = length(unique(fulldata$SampleID)))
rownames(jacc_mt) <- unique(fulldata$SampleID)
colnames(jacc_mt) <- rownames(jacc_mt)
jacc_df <- as.data.frame(jacc_mt)
intersect_df <- jacc_df
sharing_df <- jacc_df
for (k in unique(fulldata$SampleID)) {
for (j in unique(fulldata$SampleID)){
jacc_df[k,j] <- jaccard(a = fulldata[fulldata$SampleID == k]$Barcode, b = fulldata[fulldata$SampleID == j]$Barcode)
intersect_df[k,j] <- length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == j]$Barcode))
sharing_df[k,j] <- length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == j]$Barcode)) / length(intersect(fulldata[fulldata$SampleID == k]$Barcode, fulldata[fulldata$SampleID == k]$Barcode)) * 100
}
}
all_counts_casted_binary <- as.matrix(filtered_objects$Binary)
#print(head(all_counts_casted_binary))
jacc_mt <- as.matrix(jacc_df)
intersect_mt <- as.matrix(intersect_df)
sharing_mt <- as.matrix(sharing_df)
unique_shared <- as.data.frame(matrix(nrow=length(sampleids),ncol=4))
colnames(unique_shared) <- c("Total","Unique","Shared","Shared_Pct")
rownames(unique_shared) <- colnames(x = all_counts_casted_binary)
for(i in 1:nrow(unique_shared)){
unique_shared$Total[i] <- colSums(all_counts_casted_binary)[i]
a <- nrow(as.data.frame(all_counts_casted_binary[which(all_counts_casted_binary[,i] == 1 & rowSums(as.data.frame(all_counts_casted_binary[,colnames(all_counts_casted_binary)[-i]])) == 0),]))
# print(a)
unique_shared$Unique[i] <- a
b <- nrow(as.data.frame(all_counts_casted_binary[which(all_counts_casted_binary[,i] == 1 & rowSums(as.data.frame(all_counts_casted_binary[,colnames(all_counts_casted_binary)[-i]])) > 0),]))
# print(b)
unique_shared$Shared[i] <- b
unique_shared$Shared_Pct[i] <- unique_shared$Shared[i]/unique_shared$Total[i] * 100
}
pheatmap::pheatmap(mat = intersect_mt, main = "Barcodes intersection",
display_numbers = T,number_format = "%.0f",
cluster_cols = F, cluster_rows = F,
color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
filename = paste0(plotdir, projectid,"_Intersection_pairwise.pdf"), width = 12, height = 12)
numsize = ifelse(nrow(sharing_mt) > 15, 4, 10)
pheatmap::pheatmap(mat = sharing_mt, main = "Percent of sharing",
display_numbers = T,number_format = "%.1f",
cluster_cols = F, cluster_rows = F, fontsize_number = numsize,
color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
filename = paste0(plotdir, projectid,"_Sharing_pairwise.pdf"),
width = 12, height = 12)
pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
display_numbers = T,number_format = "%.2f",
cluster_cols = F, cluster_rows = F, fontsize_number = numsize,
color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
filename = paste0(plotdir, projectid,"_Jaccard_pairwise.pdf"),
width = 12, height = 12)
pheatmap::pheatmap(mat = jacc_mt, main = "Jaccard similarity",
display_numbers = T,number_format = "%.2f",
cluster_cols = T, cluster_rows = T, fontsize_number = numsize,
color = rev(brewer.pal(n = 11, name = "Spectral")), number_color = "black",
filename = paste0(plotdir, projectid,"_Jaccard_pairwise_auto.pdf"),
width = 12, height = 12)
# Sharing between groups present in the matrix
intersect_vectors <- list()
intersect_vectors_ori <- list()
#saveRDS(fulldata, file = "Fulldata.rds")
sum.in.group <- fulldata[, .(cnt=.N) , by = c("Barcode","Group")]
sum.in.group.melt <- as.data.frame(dcast(data = sum.in.group, formula = Barcode ~ Group, value.var = "cnt"))
sum.in.group.melt[is.na(sum.in.group.melt)] <- 0
rownames(sum.in.group.melt) <- sum.in.group.melt$Barcode
sum.in.group.melt$Barcode <- NULL
if(length(unique(sum.in.group$Group)) > 1){
l <- list()
for (gr in unique(sum.in.group$Group)) {
l[[gr]] <- c(0,1)
}
j <- expand.grid(l)
if(ncol(j) == 2){
j <- matrix(j[rowSums(j) == 2,], nrow=1)
}else{
j <- as.matrix(j[rowSums(j) > 1,])
}
colnames(j) <- unique(sum.in.group$Group)
for(i in 1:nrow(j)){
rownames(j)[i] <- paste(x = colnames(j)[which(j[i,] == 1)],collapse = "_vs_")
}
splitdata <- split(sum.in.group, sum.in.group$Group)
for(i in names(splitdata)){
splitdata[[i]] <- splitdata[[i]]$Barcode
}
###### UPSET ######
upsetr.input <- fromList(splitdata)
rownames(upsetr.input) <- unique(unlist(splitdata))
upsetr.input <- upsetr.input[,colnames(j)]
for(comparison in rownames(j)){
matching.vect <- NULL
for(n in 1:nrow(upsetr.input)){
a <- as.character(upsetr.input[n,])
b <- as.character(j[comparison,])
if(identical(a,b)){
matching.vect <- c(matching.vect , n)
}
}
bcodes <- rownames(upsetr.input[matching.vect,])
intersect_vectors[[comparison]] <- sum.in.group.melt[bcodes,]
}
counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot, sort = F, all.x = T)
counts_by_sample_annot$Cutoff <- NULL
pdf(file = paste0(plotdir, projectid,"_BoxPlot_counts.pdf"), width = 12, height = 9)
print(ggplot(data = counts_by_sample_annot, mapping = aes(x = Group, y = count)) + geom_boxplot())
dev.off()
counts_by_sample <- fulldata[, .(count = .N), by = SampleID]
counts_by_sample_annot <- merge.data.frame(counts_by_sample, annot, sort = F, all.x = T)
counts_by_sample_annot$Cutoff <- NULL
pdf(file = paste0(plotdir, projectid,"_BoxPlot_counts.pdf"), width = 12, height = 9)
print(ggplot(data = counts_by_sample_annot, mapping = aes(x = Group, y = count, fill = Group)) + geom_boxplot() + geom_point())
dev.off()
# UpSet R (only if groups >= 2)
pdf(file = paste0(plotdir, projectid,"_UpSetR_by_groups.pdf"), width = 18, height = 9)
print(upset(upsetr.input, order.by = "freq", sets = rev(colnames(upsetr.input)), keep.order = TRUE, nintersects = NA))
dev.off()
#saveRDS(intersect_vectors, "intersect_vectors.rds")
intersect_vectors_ori <- intersect_vectors
for(i in 1:length(names(intersect_vectors))){
if(nchar(names(intersect_vectors)[i]) > 30){
newname <- str_sub(string = stri_reverse(names(intersect_vectors)[i]), start = 1, end = 30)
names(intersect_vectors)[i] <- newname
}
if(isTRUE(duplicated(names(intersect_vectors)))){
stop()
}
}
write.xlsx(x = intersect_vectors, file = paste0(tabledir, projectid,"_SharingByGroup.xlsx"), overwrite = T,
asTable = T, rowNames = T, colWidths = "auto", firstRow = T, headerStyle = hs)
write.xlsx(x = sum.in.group.melt, file = paste0(tabledir, projectid,"_AggregationByGroup.xlsx"), overwrite = T,
asTable = T, rowNames = T, colWidths = "auto", firstRow = T, headerStyle = hs)
}
write.xlsx(x = list(Jaccard = as.data.frame(jacc_mt),
Intersection = as.data.frame(intersect_mt),
Sharing = as.data.frame(sharing_mt),
Unique_and_shared = as.data.frame(unique_shared)
),
file = paste0(tabledir, projectid,"_Sharing.xlsx"), overwrite = T,
asTable = T, rowNames = T, colWidths = "auto", firstRow = T, headerStyle = hs)
# Shared barcodes in the group (only if N° groups == 1)
if(length(unique(fulldata$Group)) == 1){
shared_barcodes_list <- list()
file.remove(paste0(tabledir, "/","Sharing_", projectid,".txt"))
for(i in unique(fulldata$SampleID)){
shared_barcodes_list[[i]] <- intersect(fulldata[fulldata$SampleID == i,]$Barcode,
fulldata[fulldata$SampleID != i,]$Barcode)
cur.pct <- length(shared_barcodes_list[[i]]) / length(fulldata[fulldata$SampleID == i,]$Barcode)
write(x = paste0("Sharing ", i, " vs all: ", round(cur.pct * 100,2) , "%"),
file = paste0(tabledir, "/","Sharing_", projectid,".txt"), append = T)
}
shared_barcodes <- unique(unlist(shared_barcodes_list))
cur.pct <- length(shared_barcodes) / length(unique(fulldata$Barcode))
write(x = paste0("Sharing ", "within ", " group: ", round(cur.pct * 100,2) , "%"),
file = paste0(tabledir, "/","Sharing_", projectid,".txt"), append = T)
}
circosdata <- NULL
if(length(sampleids) > 2){
circosdata <- as.data.frame(sharing_mt)
diag(circosdata) <- 0
circosdata$SampleID <- rownames(circosdata)
mydata_long <- melt(circosdata, id.vars = "SampleID")
colnames(mydata_long) <- c("rowname","key","value")
pdf(file= paste0(plotdir, projectid,"_Circos_by_sampleIDs.pdf"),
width = 16, height = 16)
chordDiagram(mydata_long,
grid.col = colorRampPalette(brewer.pal(12, "Paired"))(length(unique(mydata_long$rowname))))
dev.off()
}
saveRDS(object = list(
fulldata = fulldata,
counts_abund_matrices = filtered_objects,
intersect_mt = intersect_mt,
sharing_mt = sharing_mt,
jacc_mt = jacc_mt,
aggregationbygroup = sum.in.group,
aggregationbygroup_melt = sum.in.group.melt,
sharingbygroup = intersect_vectors_ori,
#sorted_barcodes_all = sorted_barcodes_all,
#sorted_barcodes_shared = sorted_barcodes_shared,
sampleinfo = annot,
raw_objects = raw_objects,
circosdata = circosdata
), file = paste0(outfolder, projectid, "_objects.rds"))
}
# Run Analysis (MM samples + Expanded)
library(vegan)
library(dplyr)
source(file = "../ChartingBCs_Light.R")
library(RColorBrewer)
library(ggrepel)
library(openxlsx)
paperdata <- "output/"
dir.create(path = paperdata)
rootdir <- "./"
bseqdir <- paste0(rootdir,"Input/") # output from barseq
samplesheets <- list(MinPct = paste0(rootdir,"Input/DP-like_EX-U_samplesheet.txt"))
# SampleIDs
# MMs : MM1 MM3 MM4 (Minimally manipulated - DP-like group)
# Ehigh: E09 E10 E11 E12 E13 E14
# Elow: E16 E17 E18 E19 E20 E21
MM = c("MM1","MM3","MM4")
Ehigh = c("E09", "E10", "E11", "E12", "E13", "E14")
Elow = c( "E16", "E17", "E18", "E19", "E20", "E21")
SampleIDs <- list(MM = MM,
Ehigh = Ehigh,
Elow = Elow,
MM_E = c(MM,Ehigh,Elow),
MM_Elow = c(MM,Elow),
MM_Ehigh = c(MM,Ehigh),
)
#### Basic analysis ####
for(setting in c("MinPct"))
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting,
bcodes2reassign = NULL,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
# Remove barcodes that are shared between MM vs Es (Elow + Ehigh)
# loading dataframe with all counts passing threshold (0.05) including all replicates and after removal of
# barcodes that are present in more than N replicates
MM_E_replicates <- readRDS(file = paste0(rootdir,"MinPct/MM_E_MaxN_removal_seqtk/MinPct_MM_E_MaxN_removal_objects.rds"))
# Findout shared barcodes between MM and Es and assign to the higher replicate
# data input: MM_E_replicates$sharingbygroup$MM_vs_E_highdose
shared_MM_and_Es_with_replicates <- unique(c(rownames(MM_E_replicates$sharingbygroup$MM_vs_E_highdose),
rownames(MM_E_replicates$sharingbygroup$MM_vs_E_lowdose),
rownames(MM_E_replicates$sharingbygroup$MM_vs_E_highdose_vs_E_lowdose)))
saveRDS(object = shared_MM_and_Es_with_replicates, paste0(paperdata,"/shared_MM_and_Es_with_replicates_step01.rds"))
shared_MM_and_Es_with_replicates_mt <- MM_E_replicates$counts_abund_matrices$logCPM[shared_MM_and_Es_with_replicates,c(MM,Ehigh,Elow)]
shared_MM_and_Es_with_replicates_mt <- as.matrix(arrange_all(shared_MM_and_Es_with_replicates_mt,desc))
shared_MM_and_Es_with_replicates_mt_pcts <- MM_E_replicates$counts_abund_matrices$pct_abund[shared_MM_and_Es_with_replicates,c(MM,Ehigh,Elow)]
shared_MM_and_Es_with_replicates_mt_pcts <- as.matrix(arrange_all(shared_MM_and_Es_with_replicates_mt_pcts,desc))
# retain barcodes that are shared within dataset ####
MM_obj <- readRDS("MinPct/MM_MaxN_removal_seqtk/MinPct_MM_MaxN_removal_objects.rds")
Ehigh_obj <- readRDS("MinPct/Ehigh_MaxN_removal_seqtk/MinPct_Ehigh_MaxN_removal_objects.rds")
Elow_obj <- readRDS("MinPct/Elow_MaxN_removal_seqtk/MinPct_Elow_MaxN_removal_objects.rds")
shared_within_MM <- rownames(subset(MM_obj$aggregationbygroup_melt, subset = MM > 1))
shared_within_Ehigh <- rownames(subset(Ehigh_obj$aggregationbygroup_melt, subset = E_highdose > 1))
shared_within_Elow <- rownames(subset(Elow_obj$aggregationbygroup_melt, subset = E_lowdose > 1))
writeLines(text = shared_within_MM, con = paste0(paperdata,"/Shared_barcodes_within_MM.txt"))
writeLines(text = shared_within_Ehigh, con = paste0(paperdata,"/Shared_barcodes_within_Ehigh.txt")
writeLines(text = shared_within_Elow, con = paste0(paperdata,"/Shared_barcodes_within_Elow.txt")
dim(shared_MM_and_Es_with_replicates_mt_pcts)
index_to_retain <- which(!rownames(shared_MM_and_Es_with_replicates_mt_pcts) %in% unique(c(shared_within_MM, shared_within_Ehigh, shared_within_Elow)))
shared_MM_and_Es_with_replicates_mt_pcts <- shared_MM_and_Es_with_replicates_mt_pcts[index_to_retain,]
shared_MM_and_Es_with_replicates_mt <- shared_MM_and_Es_with_replicates_mt[index_to_retain,]
sds <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
maxs <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = max)
second <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
means <- apply(shared_MM_and_Es_with_replicates_mt_pcts, MARGIN = 1, FUN = mean)
ratio_max_to_2nd <- apply(shared_MM_and_Es_with_replicates_mt_pcts,
MARGIN = 1, FUN = max) / apply(shared_MM_and_Es_with_replicates_mt_pcts,
MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
mydf_weightedratio <- as.data.frame(ratio_max_to_2nd)
colnames(mydf_weightedratio) <- "ratioMax2Second"
mydf_weightedratio$weighted <- mydf_weightedratio$ratioMax2Second * means
mydf_weightedratio$logged <- log10(mydf_weightedratio$weighted)
reassignment_rule <- ifelse(mydf_weightedratio$logged > -1, 1, 0)
names(reassignment_rule) <- rownames(mydf_weightedratio)
saveRDS(object = ratio_max_to_2nd, file = paste0(paperdata,"/ratio_max_to_2nd_pcts_nofilt_within.rds"))
png(paste0(paperdata,"Baplot_ratio_max_2nd.png"), width = 9, height = 6, units = "in", res = 200)
barplot(log10(ratio_max_to_2nd[order(ratio_max_to_2nd, decreasing = T)]), xaxt='n',
xlab = "Shared barcodes - MM vs EX450K EX150K (replicates)",
main = "log10 ratio max value vs 2nd")
abline(h = 1, col = "red")
dev.off()
barcodes_to_reassign_A <- names(ratio_max_to_2nd[ratio_max_to_2nd > 10]) # filtering
barcodes_to_reassign_A_names <- colnames(MM_E_replicates$counts_abund_matrices$pct_abund[barcodes_to_reassign_A,])[apply(X = MM_E_replicates$counts_abund_matrices$pct_abund[barcodes_to_reassign_A,],
MARGIN = 1, FUN = function(x){which.max(x)})]
names(barcodes_to_reassign_A) <- barcodes_to_reassign_A_names
saveRDS(object = barcodes_to_reassign_A, file = paste0(paperdata,"/barcodes_to_reassign_A.rds"))
pheatmap::pheatmap(mat = shared_MM_and_Es_with_replicates_mt, cluster_cols = F, cluster_rows = F,
scale = "none", filename = paste0(paperdata, "Shared_barcodes_MM_Es_logCPM_before_reassignment.png"),
main = paste0("Shared barcodes (",length(shared_MM_and_Es_with_replicates),") between MM and Expanded", "\n",
"logCPM counts"),
show_rownames = F,
width = 9, height = 6, color = rev(brewer.pal(n = 9, "Spectral")))
shared_MM_and_Es_with_replicates_mt_melt <- as.data.frame(melt(shared_MM_and_Es_with_replicates_mt))
png(filename = paste0(paperdata, "Shared_barcodes_MM_Es_logCPM_ggplot_before_reassignment.png"),
width = 9, height = 6, units = "in", res = 300)
ggplot(data = shared_MM_and_Es_with_replicates_mt_melt,
mapping = aes(x = Var2, y = Var1, fill = value)) +
geom_tile() + xlab("Samples") + ylab("Barcodes") +
ggtitle(paste0("Shared barcodes (",length(shared_MM_and_Es_with_replicates),") between MM and Expanded"), subtitle = "logCPM counts") +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
scale_fill_distiller(palette = "Spectral", direction = -1)
dev.off()
# re-running with re-assigning of shared barcodes between MM and Es ##
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets_replicates[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting, bcodes2reassign = barcodes_to_reassign_A,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal_reassignA"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassignA"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
### Create final plots ####
MM_E_replicates_reassigned <- readRDS(file = paste0(rootdir,"MinPct/MM_E_MaxN_removal_seqtk_reassignA/MinPct_MM_E_MaxN_removal_reassignA_objects.rds"))
## Adding new column to
mydata <- as.data.frame(table(MM_E_replicates_reassigned$fulldata$SampleID))
colnames(mydata) <- c("SampleID","nBarcodes")
groupannot <- MM_E_replicates_reassigned$sampleinfo[,c("SampleID","Group")]
mydata <- merge.data.frame(mydata, groupannot, by = "SampleID", all.x = T,sort = F)
mydata$Group <- gsub(x = mydata$Group, pattern = "E_lowdose", "EX-U150K")
mydata$Group <- gsub(x = mydata$Group, pattern = "E_highdose", "EX-U450K")
mydata$Group <- factor(mydata$Group, levels = c("MM","EX-U450K","EX-U150K"))
png(filename = paste0(paperdata,"/Number_of_unique_barcodes_MM_Es_post_reassignment.png"),
width = 4, height = 6, units = "in", res = 300)
ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
label = paste0("(",nBarcodes,")"))) +
geom_boxplot(outlier.shape = NA, fatten = 1, lwd=1) +
geom_point(position=position_jitterdodge(seed = 500)) +
geom_text_repel(show_legend = FALSE, size = 3, seed = 500) +
scale_y_continuous(trans = "log10") +
ggtitle(label = "Unique barcodes MM and EXs", subtitle = "filter: % > 0.005") +
#theme_bw() +
theme(legend.position = "none", axis.title = element_blank(),
panel.background = element_rect(fill = "white"),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.line = element_line(linetype = 1))
dev.off()
# AIM: plotting BC sharing within group (MM, Elow and Ehigh) - boxplot
library(openxlsx)
library(ggplot2)
library(ggrepel)
# load input
Ehigh <- readRDS("MinPct/Ehigh_MaxN_removal_seqtk_reassignA/MinPct_Ehigh_MaxN_removal_reassignA_objects.rds")
Elow <- readRDS("MinPct/Elow_MaxN_removal_seqtk_reassignA/MinPct_Elow_MaxN_removal_reassignA_objects.rds")
MM <- readRDS("MinPct/MM_MaxN_removal_seqtk_reassignA/MinPct_MM_MaxN_removal_reassignA_objects.rds")
# Sharing intra group
sharing_intra <- list()
sharing_intra[["MM"]] <- read.table(file = "MinPct/MM_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_MM_MaxN_removal_reassignA.txt",
nrows = 3)
sharing_intra[["Ehigh"]] <- read.table(file = "MinPct/Ehigh_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_Ehigh_MaxN_removal_reassignA.txt",
nrows = 6)
sharing_intra[["Elow"]] <- read.table(file = "MinPct/Elow_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_Elow_MaxN_removal_reassignA.txt",
nrows = 6)
for(j in names(sharing_intra)){
sharing_intra[[j]] <- subset(sharing_intra[[j]], select = c("V2","V5"))
sharing_intra[[j]]$Group <- j
colnames(sharing_intra[[j]]) <- c("SampleID","pct.share_1vsAll","Group")
sharing_intra[[j]]$pct.share_1vsAll <- gsub(x = sharing_intra[[j]]$pct.share_1vsAll, pattern = "%", replacement = "")
}
df <- rbind(sharing_intra[["MM"]], sharing_intra[["Ehigh"]])
df <- rbind(df, sharing_intra[["Elow"]])
df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
df$Group <- gsub(x = df$Group, pattern = "Elow", "EX-U150K")
df$Group <- gsub(x = df$Group, pattern = "Ehigh", "EX-U450K")
df$Group <- factor(df$Group, levels = c("MM","EX-U450K","EX-U150K"))
png("PaperData/Pct_sharing_1_vs_All_intragroup_MM_Es.png",
width = 4, height = 4, units = "in", res = 300)
ggplot(df, mapping = aes(x = Group, y = pct.share_1vsAll, color = Group, label = SampleID)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(position = position_jitter(seed = 6), show.legend = F) +
geom_text_repel(position = position_jitter(seed = 6), show.legend = F) +
theme_bw() + ylab("% sharing") +
ggtitle(label = "Barcoding sharing intra-group", subtitle = "One vs All") +
theme(axis.title.x = element_blank(),
#legend.position = c(0.25, 0.8),
legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
dev.off()
write.xlsx(df, file = "PaperData/Pct_sharing_1_vs_All_intragroup_MM_Es.xlsx", asTable = T)
### Entropy measures #
MM_E_reassigned <- readRDS(file = paste0(rootdir, "MinPct/MM_E_MaxN_removal_seqtk_reassignA/MinPct_MM_E_MaxN_removal_reassignA_objects.rds"))
shannon_MM_E <- diversity(MM_E_reassigned$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
shannon_MM_E_df <- as.data.frame(shannon_MM_E)
colnames(shannon_MM_E_df) <- "Shannon"
shannon <- shannon_MM_E_df
saveRDS(shannon, file = "PaperData/Shannon_diversity.rds")
SampleID Group Cutoff MaxN VCN Engraftment PctNGFR
MM1 MM 0.005 2 2.84 25.5 66.39
MM3 MM 0.005 2 1.61 52.77 95.53
MM4 MM 0.005 2 2.11 26.56 88.79
E09 E_highdose 0.005 3 1.86 36.76 81.17
E10 E_highdose 0.005 3 1.77 21.19 73.39
E11 E_highdose 0.005 3 2.07 36.88 92.76
E12 E_highdose 0.005 3 1.85 20.83 91.43
E13 E_highdose 0.005 3 2.01 43.92 97.08
E14 E_highdose 0.005 3 1.35 24.26 89.71
E16 E_lowdose 0.005 3 1.76 12.11 51.44
E17 E_lowdose 0.005 3 1.87 4.37 28.89
E18 E_lowdose 0.005 3 1.73 6.05 48.25
E19 E_lowdose 0.005 3 1.33 23.58 73.31
E20 E_lowdose 0.005 3 1.53 1.21 43.94
E21 E_lowdose 0.005 3 1.74 6.65 67.06
E09E14 E_highdose 0.005 4 1.86 30.51 90.57
E16E21 E_lowdose 0.005 4 1.73 6.35 49.845
MM MM 0.005 4 2.19 34.94 88.79
\ No newline at end of file
SampleID Group Cutoff MaxN VCN Engraftment PctNGFR
CG_UM_01 CG_UM 0.005 4 1.244 49.27 56.56
CG_UM_02 CG_UM 0.005 4 0.862 39.51 48.46
CG_UM_03 CG_UM 0.005 4 1.13 24.89 48.29
CG_UM_04 CG_UM 0.005 4 0.779 49.62 38.2
CG_UM_05 CG_UM 0.005 4 0.96 22 48.44
CG_UM_06 CG_UM 0.005 4 0.844 57.05 50.48
CG_UM_07 CG_UM 0.005 4 1.215 71.27 63.59
CG_UM_08 CG_UM 0.005 4 0.711 49.7 41.95
CG_UM_2nd_01 CG_UM_2nd 0 4 1 4.51 50.29
CG_UM_2nd_02 CG_UM_2nd 0 4 1 7.56 15.25
CG_UM_2nd_03 CG_UM_2nd 0 4 1 1.41 42.31
CG_UM_2nd_04 CG_UM_2nd 0 4 1 1.56 63.85
CG_UMSR_12 CG_UMSR 0.005 4 0.769 47.88 46.08
CG_UMSR_13 CG_UMSR 0.005 4 0.709 51.83 44.16
CG_UMSR_14 CG_UMSR 0.005 4 0.803 24.77 53.88
CG_UMSR_15 CG_UMSR 0.005 4 0.53 40.5 32.81
CG_UMSR_16 CG_UMSR 0.005 4 0.463 63.83 32.13
CG_UMSR_17 CG_UMSR 0.005 4 0.613 33.51 34.26
CG_UMSR_18 CG_UMSR 0.005 4 0.506 67.05 35.42
CG_UMSR_19 CG_UMSR 0.005 4 0.563 61.64 34.02
CG_UMSR_20 CG_UMSR 0.005 4 0.595 42.11 33.91
CG_UMSR_21 CG_UMSR 0.005 4 0.428 65.74 29.73
CG_UMSR_2nd_05 CG_UMSR_2nd 0 4 1 5.62 44.82
CG_UMSR_2nd_06 CG_UMSR_2nd 0 4 1 2.58 23.98
CG_UMSR_2nd_07 CG_UMSR_2nd 0 4 1 16.01 25.11
CG_UMSR_2nd_08 CG_UMSR_2nd 0 4 1 11.19 89.78
SampleID Group Cutoff MaxN VCN Engraftment PctNGFR
CG_UM_01 CG_UM 0 4 1.244 49.27 56.56
CG_UM_02 CG_UM 0 4 0.862 39.51 48.46
CG_UM_03 CG_UM 0 4 1.13 24.89 48.29
CG_UM_04 CG_UM 0 4 0.779 49.62 38.2
CG_UM_05 CG_UM 0 4 0.96 22 48.44
CG_UM_06 CG_UM 0 4 0.844 57.05 50.48
CG_UM_07 CG_UM 0 4 1.215 71.27 63.59
CG_UM_08 CG_UM 0 4 0.711 49.7 41.95
CG_UM_2nd_01 CG_UM_2nd 0 4 1 4.51 50.29
CG_UM_2nd_02 CG_UM_2nd 0 4 1 7.56 15.25
CG_UM_2nd_03 CG_UM_2nd 0 4 1 1.41 42.31
CG_UM_2nd_04 CG_UM_2nd 0 4 1 1.56 63.85
CG_UMSR_12 CG_UMSR 0 4 0.769 47.88 46.08
CG_UMSR_13 CG_UMSR 0 4 0.709 51.83 44.16
CG_UMSR_14 CG_UMSR 0 4 0.803 24.77 53.88
CG_UMSR_15 CG_UMSR 0 4 0.53 40.5 32.81
CG_UMSR_16 CG_UMSR 0 4 0.463 63.83 32.13
CG_UMSR_17 CG_UMSR 0 4 0.613 33.51 34.26
CG_UMSR_18 CG_UMSR 0 4 0.506 67.05 35.42
CG_UMSR_19 CG_UMSR 0 4 0.563 61.64 34.02
CG_UMSR_20 CG_UMSR 0 4 0.595 42.11 33.91
CG_UMSR_21 CG_UMSR 0 4 0.428 65.74 29.73
CG_UMSR_2nd_05 CG_UMSR_2nd 0 4 1 5.62 44.82
CG_UMSR_2nd_06 CG_UMSR_2nd 0 4 1 2.58 23.98
CG_UMSR_2nd_07 CG_UMSR_2nd 0 4 1 16.01 25.11
CG_UMSR_2nd_08 CG_UMSR_2nd 0 4 1 11.19 89.78
# Run Analysis CG (Cell growth) samples
library(vegan)
library(dplyr)
source(file = "../ChartingBCs_Light.R")
library(RColorBrewer)
library(ggrepel)
paperdata <- "PaperData/"
dir.create(path = paperdata)
rootdir <- "./"
bseqdir <- paste0(rootdir,"Input/")
# using only the minpct version - here renamed simply to samplesheet
samplesheets <- list(MinPct = paste0(bseqdir,"SCGM_samplesheet.txt"))
# SampleIDs / groups present in the paper
# CG_UMSR : CG_UMSR_12-21
# CG_UM : CG_UM_01-08
# CG_UM_2nd : CG_UM_2nd_01-04
# CG_UMSR_2nd : CG_UMSR_2nd_05-08
CG_UMSR = c("CG_UMSR_12","CG_UMSR_13","CG_UMSR_14","CG_UMSR_15","CG_UMSR_16","CG_UMSR_17",
"CG_UMSR_18","CG_UMSR_19","CG_UMSR_20","CG_UMSR_21")
CG_UM = c("CG_UM_01","CG_UM_02","CG_UM_03","CG_UM_04",
"CG_UM_05","CG_UM_06","CG_UM_07","CG_UM_08")
CG_UM_2nd = c("CG_UM_2nd_01","CG_UM_2nd_02","CG_UM_2nd_03","CG_UM_2nd_04")
CG_UMSR_2nd = c("CG_UMSR_2nd_05","CG_UMSR_2nd_06","CG_UMSR_2nd_07","CG_UMSR_2nd_08")
SampleIDs <- list(CG_UMSR = CG_UMSR,
CG_UM = CG_UM,
CG_UMSR_2nd = CG_UMSR_2nd,
CG_UM_2nd = CG_UM_2nd,
UM = c(CG_UM,CG_UM_2nd),
UMSR = c(CG_UMSR,CG_UMSR_2nd),
UM_UMSR_1st = c(CG_UMSR,CG_UM),
UM_UMSR_2nd = c(CG_UMSR_2nd,CG_UM_2nd)
)
#### Basic analysis ####
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
# Defining Barcodes to re-assign using the 2 group dataset with CG (UM and UMSR) ####
MinPct_UM_UMSR_1st_MaxN_removal_objects <- readRDS("MinPct/UM_UMSR_1st_MaxN_removal_seqtk/MinPct_UM_UMSR_1st_MaxN_removal_objects.rds")
tmp.obj <- MinPct_UM_UMSR_1st_MaxN_removal_objects
# potentially to re-assign
shared_CG_UM_UMSR_first <- rownames(tmp.obj$sharingbygroup$CG_UM_vs_CG_UMSR)
saveRDS(object = shared_CG_UM_UMSR_first, paste0(paperdata,"PaperData/shared_CG_UM_UMSR_first_step01.rds"))
shared_CG_UM_UMSR_first_mt <- tmp.obj$counts_abund_matrices$logCPM[shared_CG_UM_UMSR_first,c(CG_UM,CG_UMSR)]
shared_CG_UM_UMSR_first_mt <- as.matrix(arrange_all(shared_CG_UM_UMSR_first_mt,desc))
shared_CG_UM_UMSR_first_mt_pcts <- tmp.obj$counts_abund_matrices$pct_abund[shared_CG_UM_UMSR_first,c(CG_UM,CG_UMSR)]
shared_CG_UM_UMSR_first_mt_pcts <- as.matrix(arrange_all(shared_CG_UM_UMSR_first_mt_pcts,desc))
# exclude BCs that are shared within group from the list of BCs to be re-assigned ####
shared_within_CG_UM <- rownames(subset(tmp.obj$aggregationbygroup_melt, subset = CG_UM > 1))
shared_within_CG_UMSR <- rownames(subset(tmp.obj$aggregationbygroup_melt, subset = CG_UMSR > 1))
writeLines(text = shared_within_CG_UM, con = paste0(paperdata,"/Shared_barcodes_within_CG_UM.txt"))
writeLines(text = shared_within_CG_UMSR, con = paste0(paperdata,"/Shared_barcodes_within_CG_UMSR.txt"))
# from the potentially BCs to be re-assigned exclude the one that are also shared within group
index_to_retain <- which(!rownames(shared_CG_UM_UMSR_first_mt_pcts) %in% union(shared_within_CG_UMSR, shared_within_CG_UM)) # if not intragroup sharing is affected
shared_CG_UM_UMSR_first_mt_pcts <- shared_CG_UM_UMSR_first_mt_pcts[index_to_retain,]
shared_CG_UM_UMSR_first_mt <- shared_CG_UM_UMSR_first_mt[index_to_retain,]
# adding new method of re-assignment based on std dev
sds <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
maxs <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = max)
second <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
means <- apply(shared_CG_UM_UMSR_first_mt_pcts, MARGIN = 1, FUN = mean)
ratio_max_to_2nd <- apply(shared_CG_UM_UMSR_first_mt_pcts,
MARGIN = 1, FUN = max) / apply(shared_CG_UM_UMSR_first_mt_pcts,
MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
mydf_weightedratio <- as.data.frame(ratio_max_to_2nd)
colnames(mydf_weightedratio) <- "ratioMax2Second"
mydf_weightedratio$weighted <- mydf_weightedratio$ratioMax2Second * means
mydf_weightedratio$logged <- log10(mydf_weightedratio$weighted)
reassignment_rule <- ifelse(mydf_weightedratio$logged > -1, 1, 0)
names(reassignment_rule) <- rownames(mydf_weightedratio)
saveRDS(object = ratio_max_to_2nd, file = paste0(paperdata,"/ratio_max_to_2nd_CG_UM_UMSR_first.rds"))
saveRDS(object = mydf_weightedratio, file = paste0(paperdata,"/mydf_weightedratio_CG_UM_UMSR_first.rds"))
saveRDS(object = means, file = paste0(paperdata,"/means_CG_UM_UMSR_first.rds"))
saveRDS(object = reassignment_rule, file = paste0(paperdata,"/reassignment_rule_CG_UM_UMSR_first.rds"))
####
barcodes_to_reassign <- names(ratio_max_to_2nd[ratio_max_to_2nd > 10]) # filtering
barcodes_to_reassign_names <- colnames(tmp.obj$counts_abund_matrices$pct_abund[barcodes_to_reassign,])[apply(X = tmp.obj$counts_abund_matrices$pct_abund[barcodes_to_reassign,], MARGIN = 1, FUN = function(x){which.max(x)})]
names(barcodes_to_reassign) <- barcodes_to_reassign_names
saveRDS(object = barcodes_to_reassign, file = "PaperData/barcodes_to_reassign_CG_UM_UMSR_first.rds")
# re-running with re-assigning of shared barcodes between CG_UM and CG_UMSR ##
# Note: barcodes shared also within condition are removed from re-assignment ####
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs[c(1,2,5,6,7)])) { # pickup only groups that contain 1st transplant samples, exclude only 2nd
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], custom.cut = custom.cut.param, do.filter = T,
method = setting, bcodes2reassign = barcodes_to_reassign,
removeMaxN = FALSE, MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[c(1,2,5,6,7)][[SampleGroup]],
ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup),
outfolder = paste(setting, paste0(SampleGroup,"_seqtk_reassign"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
for (SampleGroup in names(SampleIDs[c(1,2,5,6,7)])) { # pickup only groups that contain 1st transplant samples
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting, bcodes2reassign = barcodes_to_reassign,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[c(1,2,5,6,7)][[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassign"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
### Create final plots ####
UM_UMSR_1st_reassigned <- readRDS(file = paste0(rootdir,"MinPct/UM_UMSR_1st_MaxN_removal_seqtk_reassign/MinPct_UM_UMSR_1st_MaxN_removal_objects.rds"))
# look at numbers #
table(UM_UMSR_1st_reassigned$fulldata$Group)
# CG_UM CG_UMSR
# 4466 5127
## Adding new column to
mydata <- as.data.frame(table(UM_UMSR_1st_reassigned$fulldata$SampleID))
colnames(mydata) <- c("SampleID","nBarcodes")
mydata$Group <- c(rep(x = "CG_UM", 8), rep("CG_UMSR",10))
png(filename = "PaperData/Number_of_unique_barcodes_CG_UM_UMSR_post_reassignment.png",
width = 9, height = 6, units = "in", res = 300)
ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
label = paste0(SampleID,"_",nBarcodes))) +
geom_point() + geom_label_repel(show_guide = FALSE) +
ggtitle(label = "Unique barcodes CG_UM and CG_UMSR", subtitle = "filter: % > 0.005") +
theme(legend.position = "none", axis.title = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
dev.off()
# shared barcodes after reassignment ####
UM_UMSR_1st_reassigned_shared <- rownames(UM_UMSR_1st_reassigned$sharingbygroup$CG_UM_vs_CG_UMSR)
UM_UMSR_1st_reassigned_shared_mt <- UM_UMSR_1st_reassigned$counts_abund_matrices$logCPM[UM_UMSR_1st_reassigned_shared,c(CG_UM,CG_UMSR)]
UM_UMSR_1st_reassigned_shared_mt <- as.matrix(arrange_all(UM_UMSR_1st_reassigned_shared_mt))
UM_UMSR_1st_reassigned_shared_mt_melt <- as.data.frame(melt(UM_UMSR_1st_reassigned_shared_mt))
colnames(UM_UMSR_1st_reassigned_shared_mt_melt) <- c("Barcode","SampleID","logCPM")
fulldata_subset <- subset(UM_UMSR_1st_reassigned$fulldata, select = c("Barcode", "SampleID", "Rank"),
subset = Barcode %in% UM_UMSR_1st_reassigned_shared_mt_melt$Barcode)
UM_UMSR_1st_reassigned_shared_mt_melt_rank <- merge.data.frame(UM_UMSR_1st_reassigned_shared_mt_melt, fulldata_subset, sort = F)
png(filename = paste0(paperdata, "Shared_barcodes_UM_UMSR_1st_logCPM_ggplot_post_reassignment.png"),
width = 6, height = 6, units = "in", res = 300)
ggplot(data = UM_UMSR_1st_reassigned_shared_mt_melt_rank,
mapping = aes(x = SampleID, y = Barcode, fill = logCPM, label = Rank)) +
geom_tile() + xlab("Samples") + ylab("Barcodes") + geom_text(size = 3) +
#scale_x_discrete(guide = guide_axis(n.dodge=3)) +
ggtitle(paste0("Shared barcodes (",length(UM_UMSR_1st_reassigned_shared),") between CG_UM and CG_UMSR"), subtitle = "logCPM counts (rank)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
scale_fill_distiller(palette = "Spectral", direction = -1)
dev.off()
# shared barcodes after reassignment ####
tmp2_data <- rownames(do.call(what = rbind, UM_UMSR_1st_reassigned$sharingbygroup))
CG_UM_UMSR_1st_reassigned_shared <- unique(gsub(x = tmp2_data, pattern = ".*\\.", replacement = ""))
CG_UM_UMSR_1st_reassigned_shared_mt <- UM_UMSR_1st_reassigned$counts_abund_matrices$logCPM[CG_UM_UMSR_1st_reassigned_shared,c(CG_UM,CG_UMSR)]
CG_UM_UMSR_1st_reassigned_shared_mt <- as.matrix(arrange_all(CG_UM_UMSR_1st_reassigned_shared_mt))
CG_UM_UMSR_1st_reassigned_shared_mt_melt <- as.data.frame(melt(CG_UM_UMSR_1st_reassigned_shared_mt))
colnames(CG_UM_UMSR_1st_reassigned_shared_mt_melt) <- c("Barcode","SampleID","logCPM")
fulldata_subset <- subset(UM_UMSR_1st_reassigned$fulldata, select = c("Barcode", "SampleID", "Rank"),
subset = Barcode %in% CG_UM_UMSR_1st_reassigned_shared_mt_melt$Barcode)
CG_UM_UMSR_1st_reassigned_shared_mt_melt_rank <- merge.data.frame(CG_UM_UMSR_1st_reassigned_shared_mt_melt, fulldata_subset, sort = F)
png(filename = paste0(paperdata, "Shared_barcodes_CG_UM_UMSR_1st_logCPM_ggplot_post_reassignment.png"),
width = 6, height = 6, units = "in", res = 300)
ggplot(data = CG_UM_UMSR_1st_reassigned_shared_mt_melt_rank,
mapping = aes(x = SampleID, y = Barcode, fill = logCPM, label = Rank)) +
geom_tile() + xlab("Samples") + ylab("Barcodes") + geom_text(size = 3) +
ggtitle(paste0("Shared barcodes (",length(CG_UM_UMSR_1st_reassigned_shared),") between CG UM and UMSR"), subtitle = "logCPM counts (rank)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank()) +
scale_fill_distiller(palette = "Spectral", direction = -1)
dev.off()
library(openxlsx)
library(ggplot2)
library(ggrepel)
# sharing within groups
sharing_intra <- list()
sharing_intra[["CG_UM"]] <- read.table(file = "MinPct/CG_UM_MaxN_removal_seqtk_reassign/Tables/Sharing_MinPct_CG_UM_MaxN_removal.txt",
nrows = 8)
sharing_intra[["CG_UMSR"]] <- read.table(file = "MinPct/CG_UMSR_MaxN_removal_seqtk_reassign/Tables/Sharing_MinPct_CG_UMSR_MaxN_removal.txt",
nrows = 10)
for(j in names(sharing_intra)){
sharing_intra[[j]] <- subset(sharing_intra[[j]], select = c("V2","V5"))
sharing_intra[[j]]$Group <- j
colnames(sharing_intra[[j]]) <- c("SampleID","pct.share_1vsAll","Group")
sharing_intra[[j]]$pct.share_1vsAll <- gsub(x = sharing_intra[[j]]$pct.share_1vsAll, pattern = "%", replacement = "")
}
df <- rbind(sharing_intra[["CG_UM"]], sharing_intra[["CG_UMSR"]])
df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
df$Group <- gsub(x = df$Group, pattern = "UM",replacement = "U")
df$Group <- gsub(x = df$Group, pattern = "SR",replacement = "S")
df$Group <- factor(df$Group, levels = c("CG_U", "CG_US"))
write.xlsx(df, file = paste0(paperdata,"/Pct_sharing_1_vs_All_intragroup_CG_UM_and_UMSR.xlsx"),
asTable = T)
### Entropy measures #
shannon_ALL <- diversity(UM_UMSR_1st_reassigned$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
shannon_ALL_df <- as.data.frame(shannon_ALL)
colnames(shannon_ALL_df) <- "Shannon"
saveRDS(shannon_ALL_df, paste0(paperdata, "Shannons.rds"))
library(ggplot2)
library(vegan)
library(pheatmap)
library(vegan)
library(dplyr)
source(file = "../ChartingBCs_Light.R")
library(RColorBrewer)
library(ggrepel)
library(ggalluvial)
paperdata <- "PaperData/"
dir.create(path = paperdata)
rootdir <- "./"
bseqdir <- paste0(rootdir,"Input/")
# using only the minpct version - here renamed simply to samplesheet
samplesheets <- list(MinPct = paste0(bseqdir,"SCGM_samplesheet_1st_2nd.txt"))
# SampleIDs / groups present in the paper
# CG_UMSR : CG_UMSR_12-21
# CG_UM : CG_UM_01-08
# CG_UM_2nd : CG_UM_2nd_01-04
# CG_UMSR_2nd : CG_UMSR_2nd_05-08
CG_UMSR = c("CG_UMSR_12","CG_UMSR_13","CG_UMSR_14","CG_UMSR_15","CG_UMSR_16","CG_UMSR_17",
"CG_UMSR_18","CG_UMSR_19","CG_UMSR_20","CG_UMSR_21")
CG_UM = c("CG_UM_01","CG_UM_02","CG_UM_03","CG_UM_04",
"CG_UM_05","CG_UM_06","CG_UM_07","CG_UM_08")
CG_UM_2nd = c("CG_UM_2nd_01","CG_UM_2nd_02","CG_UM_2nd_03","CG_UM_2nd_04")
CG_UMSR_2nd = c("CG_UMSR_2nd_05","CG_UMSR_2nd_06","CG_UMSR_2nd_07","CG_UMSR_2nd_08")
SampleIDs <- list(UM = c(CG_UM,CG_UM_2nd),
UMSR = c(CG_UMSR,CG_UMSR_2nd)
)
#### Basic analysis ####
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]], custom.cut = custom.cut.param, do.filter = T,
method = setting,
removeMaxN = FALSE, MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]],
ed = 2, mc = 3,
projectid = paste0(setting,"_noFILT_",SampleGroup),
outfolder = paste(paste0(setting,"_noFILT/"), paste0(SampleGroup,"_seqtk"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_noFILT_",SampleGroup,"_MaxN_removal"),
outfolder = paste(paste0(setting,"_noFILT/"), paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
### Plotting Sharing 1st and 2nd ####
CG_UM_unfiltered <- readRDS(file = paste0(rootdir,"MinPct_noFILT/UM_MaxN_removal_seqtk/MinPct_noFILT_UM_MaxN_removal_objects.rds"))
shared_elements <- CG_UM_unfiltered$counts_abund_matrices$pct_abund[rownames(CG_UM_unfiltered$sharingbygroup$CG_UM_vs_CG_UM_2nd),]
shared_elements$Barcode <- rownames(shared_elements)
shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
CG_UM_unfiltered$sampleinfo,
by.x = "variable", by.y = "SampleID",
all.x = T, sort = F)
shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = "CG_UM_", replacement = "")
shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA
shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "UM",replacement = "U")
shared_elements_melt_annot$Group <- factor(shared_elements_melt_annot$Group, levels = c("CG_U", "CG_U_2nd"))
png(filename = paste0(paperdata,"CG_UM_1st_2nd_shared_heatmap_full.png"),width = 8, height = 6, units = "in", res = 300)
ggplot(data = shared_elements_melt_annot, mapping = aes(y = Barcode, x = variable, fill = value, label = round(value,4))) +
geom_tile() +
geom_text(size = 2, color = "black") +
facet_grid( . ~ Group, scale = "free", space = "free") +
scale_x_discrete(guide = guide_axis(n.dodge=2)) +
scale_fill_distiller(palette = "Spectral", direction = -1, na.value = "white") +
theme_bw() +
theme(axis.title = element_blank(), axis.text = element_text(size = 10),
legend.position = "none",
strip.text = element_text(size = 10, face = "bold"),
legend.title = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
) +
ggtitle(label = "Shared barcodes between 1st and 2nd transplant", subtitle = "CellGro Cell Culture Medium") +
geom_rect(data = data.frame(Group = 'CG_U'),
aes(xmin=c(0.5), xmax=c(1.5), ymin=c(24), ymax=c(26)),
colour="red", fill="transparent", size=1, inherit.aes = FALSE)
dev.off()
CG_UMSR_unfiltered <- readRDS(file = paste0(rootdir,"MinPct_noFILT/UMSR_MaxN_removal_seqtk/MinPct_noFILT_UMSR_MaxN_removal_objects.rds"))
shared_elements <- CG_UMSR_unfiltered$counts_abund_matrices$pct_abund[rownames(CG_UMSR_unfiltered$sharingbygroup$CG_UMSR_vs_CG_UMSR_2nd),]
shared_elements$Barcode <- rownames(shared_elements)
shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
CG_UMSR_unfiltered$sampleinfo,
by.x = "variable", by.y = "SampleID",
all.x = T, sort = F)
shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = "CG_UMSR_", replacement = "")
shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA
shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "UM",replacement = "U")
shared_elements_melt_annot$Group <- gsub(x = shared_elements_melt_annot$Group, pattern = "SR",replacement = "S")
shared_elements_melt_annot$Group <- factor(shared_elements_melt_annot$Group, levels = c("CG_US", "CG_US_2nd"))
png(filename = paste0(paperdata,"CG_UMSR_1st_2nd_shared_heatmap_full.png"),width = 8, height = 6, units = "in", res = 300)
ggplot(data = shared_elements_melt_annot, mapping = aes(y = Barcode, x = variable, fill = value, label = round(value,4))) +
geom_tile() +
geom_text(size = 2, color = "black") +
facet_grid( . ~ Group, scale = "free", space = "free") +
scale_x_discrete(guide = guide_axis(n.dodge=2)) +
scale_fill_distiller(palette = "Spectral", direction = -1, na.value = "white") +
#scale_fill_viridis_c(alpha = 0.9,option = "A",
# breaks = c(0,0.01, 0.05, 0.1, 0.5, 1,2,5,10,20,30,50,80,100), direction = -1)
theme_bw() +
theme(axis.title = element_blank(), axis.text = element_text(size = 8),
legend.position = "none",
strip.text = element_text(size = 10, face = "bold"),
legend.title = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
) +
ggtitle(label = "Shared barcodes between 1st and 2nd transplant", subtitle = "CellGro Cell Culture MediUMSR") +
geom_rect(data = data.frame(Group = 'CG_US'),
aes(xmin=c(6.5), xmax=c(7.5), ymin=c(0.5), ymax=c(1.5)),
colour="red", fill="transparent", size=1, inherit.aes = FALSE)
dev.off()
# Evaluate N° of clones in 1st and 2nd transplant ####
mydataU <- as.data.frame(table(CG_UM_unfiltered$fulldata$SampleID))
colnames(mydataU) <- c("SampleID","nBarcodes")
groupannot <- CG_UM_unfiltered$sampleinfo[,c("SampleID","Group")]
mydataU <- merge.data.frame(mydataU, groupannot, by = "SampleID", all.x = T,sort = F)
mydataUS <- as.data.frame(table(CG_UMSR_unfiltered$fulldata$SampleID))
colnames(mydataUS) <- c("SampleID","nBarcodes")
groupannot <- CG_UMSR_unfiltered$sampleinfo[,c("SampleID","Group")]
mydataUS <- merge.data.frame(mydataUS, groupannot, by = "SampleID", all.x = T,sort = F)
mydata <- rbind(mydataU, mydataUS)
mydata$Group <- gsub(x = mydata$Group, pattern = "UM",replacement = "U")
mydata$Group <- gsub(x = mydata$Group, pattern = "SR",replacement = "S")
mydata$Group <- factor(x = mydata$Group, levels = c("CG_U","CG_US","CG_U_2nd","CG_US_2nd"))
png(filename = "PaperData/Number_of_unique_barcodes_CG_1st_2nd.png",
width = 3, height = 4, units = "in", res = 300)
ggplot(data = mydata, mapping = aes(Group, y = nBarcodes, color = Group,
label = paste0("(",nBarcodes,")"))) +
geom_boxplot(outlier.shape = NA, fatten = 0.4, lwd=0.8) +
scale_x_discrete(guide = guide_axis(n.dodge=2)) +
geom_point(position=position_jitterdodge(seed = 500, jitter.width = 0.8)) +
geom_text_repel(show_legend = FALSE, size = 3, seed = 500) +
ggtitle(label = "n° barcodes (1st and 2nd)", subtitle = "no filtering") +
theme(legend.position = "none", axis.title = element_blank(),
panel.background = element_rect(fill = "white"),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.line = element_line(linetype = 1))
dev.off()
png(filename = "PaperData/Number_of_unique_barcodes_CG_2nd.png",
width = 2, height = 4, units = "in", res = 300)
ggplot(data = subset(mydata, subset = Group %in% c("CG_U_2nd","CG_US_2nd")),
mapping = aes(Group, y = nBarcodes, color = Group,
label = paste0("(",nBarcodes,")"))) +
scale_fill_manual(values = c("#357BCF","#8DD3FB")) +
scale_color_manual(values = c("#357BCF","#8DD3FB")) +
geom_boxplot(outlier.shape = NA, fatten = 0.8, lwd=0.8) +
scale_x_discrete(guide = guide_axis(n.dodge=2)) +
geom_point(position=position_jitterdodge(seed = 500, jitter.width = 0.8)) +
ggtitle(label = "n° barcodes 2nd", subtitle = "No filtering") +
theme(panel.background = element_rect(fill = "white"),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.line = element_line(linetype = 1),
legend.position = "none",
legend.box.background = element_blank(),
legend.key = element_rect(fill = "white"),
legend.text = element_text(size = 8),
legend.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_text(size = 12, color = "black"),
axis.text.x = element_text(angle = 0, size = 12, color = "black"),
strip.text = element_text(size = 8))
dev.off()
##### Creating plots ofr showing the clonal structure in 1st vs 2nd ###
# Idea: subdivide in bins of abundance and plots classes in 1st and second on shared elements only
library(ggalluvial)
first_second_sharing_fold <- paste0(paperdata, "1st_vs_2nd_sharing/")
dir.create(first_second_sharing_fold)
clonal_structure_1st_2nd <- list(objects = list(),
shared_elements = list(),
shared_elements_grouped = list())
for(group in c("UM","UMSR")){
clonal_structure_1st_2nd[["objects"]][[group]] <- readRDS(file = paste0(rootdir,"MinPct_noFILT/",group,"_MaxN_removal_seqtk/MinPct_noFILT_",group,"_MaxN_removal_objects.rds"))
shared_elements <- clonal_structure_1st_2nd[["objects"]][[group]]$counts_abund_matrices$pct_abund[rownames(clonal_structure_1st_2nd[["objects"]][[group]]$sharingbygroup[[paste0("CG_",group,"_vs_CG_",group,"_2nd")]]),]
shared_elements$Barcode <- rownames(shared_elements)
shared_elements_melt <- reshape2::melt(data = shared_elements, id.vars = c("Barcode"))
shared_elements_melt_annot <- merge.data.frame(shared_elements_melt,
clonal_structure_1st_2nd[["objects"]][[group]]$sampleinfo,
by.x = "variable", by.y = "SampleID",
all.x = T, sort = F)
shared_elements_melt_annot$variable <- gsub(x = shared_elements_melt_annot$variable, pattern = paste0("CG_",group,"_"), replacement = "")
shared_elements_melt_annot$value[which(shared_elements_melt_annot$value == 0)] <- NA
clonal_structure_1st_2nd[["shared_elements"]][[group]] <- shared_elements_melt_annot
clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]] <- clonal_structure_1st_2nd[["shared_elements"]][[group]] %>%
group_by(Group, Barcode) %>%
summarise(mean_val = mean(value, na.rm = T),
mean_VCN = mean(VCN, na.rm = T),
mean_Engraftment = mean(Engraftment, na.rm = T),
mean_PctNGFR = mean(PctNGFR, na.rm = T))
clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$Category <- cut(x = clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$mean_val,
breaks = c(0,0.1,1,100),
labels = c("rare (0 - 0.1%)",
"medium (0.1 - 1%)",
"expanded ( > 1%)"))
png(filename = paste0(first_second_sharing_fold,"clonal_structure_1st_vs_2nd_",group,"_grouped_no0s.png"),
width = 6, height = 6, units = "in", res = 300)
print(ggplot(clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]],
mapping = aes(Group)) +
geom_bar(mapping = aes(fill = Category), position = "fill") +
ggtitle("Shared barcoded 1st vs 2nd",
subtitle = paste0("Abundance levels (CG-",group,") - grouped mean no 0s")) +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)))
dev.off()
# Create an alluvial plot of this aggregated barcodes abundances between 1st and 2nd
allbcs <- clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]]$Barcode
colpalette <- colorRampPalette(RColorBrewer::brewer.pal(12, name = "Paired"))(length(unique(allbcs)))
png(filename = paste0(first_second_sharing_fold, "clonal_structure_1st_vs_2nd_",group,"_grouped_alluvium_track_no0s.png"),
width = 6, height = 6, units = "in", res = 300)
print(ggplot(data = clonal_structure_1st_2nd[["shared_elements_grouped"]][[group]],
aes(x = Group, y = mean_val, alluvium = Barcode, label = Barcode)) +
ylab("Mean abundance %") +
geom_alluvium(aes(fill = Barcode),
alpha = .75, decreasing = FALSE, width = 1/4) +
geom_stratum(aes(stratum = Barcode, fill = Barcode), alpha = .50, decreasing = FALSE, width = 1/4) +
scale_fill_manual(values = alpha(colour = colpalette, alpha = 0.50)) +
theme_bw() +
ggtitle(label = "Shared barcodes tracking", subtitle = paste0("CG-", group, " aggregation by group - mean no 0s")) +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)))
dev.off()
}
saveRDS(object = clonal_structure_1st_2nd, file = paste0(paperdata,"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_mean_no0s.rds")
library(openxlsx)
write.xlsx(x = clonal_structure_1st_2nd$shared_elements_grouped,
file = paste0(paperdata"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_grouped_sharing_mean_no0s.xlsx"), asTable = T)
# Combine image - larger size ####
DataAnalysis <- readRDS(paste0(paperdata,"/1st_vs_2nd_sharing/clonal_structure_1st_2nd_mean_no0s.rds"))
for(i in names(DataAnalysis$shared_elements_grouped)){
DataAnalysis$shared_elements_grouped[[i]]$Condition <- i
}
mydata <- rbind.data.frame(DataAnalysis$shared_elements_grouped$UM, DataAnalysis$shared_elements_grouped$UMSR)
mydata$Transplant <- "Primary"
mydata$Transplant[grep(pattern = "2nd", x = mydata$Group)] <- "Secondary"
mydata$Transplant <- gsub(x = mydata$Transplant, pattern = "Primary", replacement = "1st")
mydata$Transplant <- gsub(x = mydata$Transplant, pattern = "Secondary", replacement = "2nd")
allbcs <- mydata$Barcode
colpalette <- colorRampPalette(RColorBrewer::brewer.pal(12, name = "Paired"))(length(unique(allbcs)))
png(filename = paste0(paperdata,"Combo_clonal_structure_1st_vs_2nd_alluvium_track.png"),
width = 6, height = 6, units = "in", res = 300)
print(ggplot(data = mydata,
aes(x = Transplant, y = mean_val, alluvium = Barcode, label = Barcode)) +
ylab("Mean abundance % - cumulative") +
geom_alluvium(aes(fill = Barcode),
alpha = .75, decreasing = FALSE, width = 1/4) +
geom_stratum(aes(stratum = Barcode, fill = Barcode), alpha = .50, decreasing = FALSE, width = 1/4) +
scale_fill_manual(values = alpha(colour = colpalette, alpha = 0.50)) +
facet_grid(. ~ Condition,scales = "free") +
ggtitle(label = "Shared BC tracking", subtitle = paste0("aggregation by group - mean (na.rm)")) +
theme(legend.position = "none",
axis.text = element_text(size = 20, color = "black"),
axis.title = element_blank(),
axis.text.y.left = element_text(size = 20, angle = -270, hjust = 0.5, color = "black"),
panel.background = element_rect(fill = "white"),
strip.text.x = element_text(size = 20),
axis.line = element_line(linetype = 1),
strip.background.x = element_rect(fill = "white"),
plot.subtitle = element_blank(),
plot.title = element_blank(),
))
dev.off()
SampleID Group Cutoff MaxN VCN Engraftment PctNGFR
ms01 DPlike 0.005 3 0.3164 52 23.8
ms02 DPlike 0.005 3 0.3096 47.82 18.87
ms03 DPlike 0.005 3 0.3579 44.54 29.45
ms04 DPlike 0.005 3 0.5771 27.97 29.82
ms05 DPlike 0.005 3 0.3527 34.71 18.85
ms06 DPlike 0.005 3 0.4915 35.2 28.92
ms09 EXU1 0.005 4 0.533 31.16 33.22
ms10 EXU1 0.005 4 0.268 24.41 18.49
ms11 EXU1 0.005 4 0.211 30.44 16.98
ms12 EXU1 0.005 4 0.26 32.4 23.76
ms13 EXU1 0.005 4 0.16 38.08 17.94
ms14 EXU1 0.005 4 0.263 62.23 28.24
ms15 EXU1 0.005 4 0.457 57.21 25.87
ms16 EXU1 0.005 4 0.174 79.3 17.35
ms17 EXU3 0.005 4 3.005 9.22 94.96
ms18 EXU3 0.005 4 1.256 7.16 40.59
ms19 EXU3 0.005 4 2.849 1.64 83.84
ms20 EXU3 0.005 4 2.086 3.19 46.81
ms21 EXU3 0.005 4 1.638 15.03 76.04
ms22 EXU3 0.005 4 0.339 13.35 24.1
ms23 EXU3 0.005 4 1.802 4.04 74.94
ms24 EXU3 0.005 4 0.252 47.76 14.87
ms25 EXU2 0.005 3 1.028 37.38 54.43
ms26 EXU2 0.005 3 0.807 68.93 49.12
ms27 EXU2 0.005 3 0.649 59.51 40.94
ms29 EXU2 0.005 3 0.664 35.69 41.24
ms30 EXU2 0.005 3 0.588 53.67 37.27
ms31 EXU2 0.005 3 0.623 39.7 36.04
ms32 EXU2 0.005 3 0.718 62.29 41.74
# Run Analysis RI (DP-like) and TI (EXU1-3) samples
library(vegan)
library(dplyr)
source(file = "../ChartingBCs_Light.R")
library(RColorBrewer)
library(ggplot2)
paperdata <- "PaperData/"
dir.create(path = paperdata)
rootdir <- "./"
bseqdir <- paste0(rootdir,"Input/")
samplesheets <- list(MinPct = paste0(bseqdir,"TD_Timing_samplesheet.txt"))
# SampleIDs / groups present in the paper
DPlike_Exp2022_05 <- c("ms01","ms02","ms03","ms04","ms05","ms06")
EXU1_Exp2022_05 = c("ms09","ms10","ms11","ms12","ms13","ms14","ms15","ms16")
EXU3_Exp2022_05 = c("ms17","ms18","ms19","ms20","ms22","ms23","ms24")
EXU2_Exp2022_05 = c("ms25","ms26","ms27","ms29","ms30","ms31","ms32")
SampleIDs <- list(
DPlike_Exp2022_05 = DPlike_Exp2022_05,
EXU1_Exp2022_05 = EXU1_Exp2022_05,
EXU2_Exp2022_05 = EXU2_Exp2022_05,
EXU3_Exp2022_05 = EXU3_Exp2022_05,
RITIs_Exp2022_05 = c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)
)
#### Basic analysis ####
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in names(SampleIDs)) {
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting,
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
MinPct_RITIs_Exp2022_05_MaxN_removal_objects <- readRDS("MinPct_RITIs_Exp2022_05_MaxN_removal_seqtk/MinPct_RITIs_Exp2022_05_MaxN_removal_objects.rds")
mylist <- list(Exp2022_05 = MinPct_RITIs_Exp2022_05_MaxN_removal_objects)
shared_RITIs_list <- list()
for(j in names(mylist)){
tmp.obj <- mylist[[j]]
for(comp in names(tmp.obj$sharingbygroup)){
else{
shared_RITIs_list[[paste0(j,"_",comp)]] <- rownames(tmp.obj$sharingbygroup[[comp]])
}
}
}
shared_RITIs <- list(Exp2022_05 = unique(unlist(shared_RITIs_list[grep(pattern = "Exp2022_05", value = T, x = names(shared_RITIs_list))])))
saveRDS(object = shared_RITIs, "PaperData/shared_RITIs_step01.rds")
library(dplyr)
shared_RITIs_mt <- list()
shared_RITIs_mt_pcts <- list()
for(j in names(mylist)){
tmp.obj <- mylist[[j]]
shared_RITIs_mt[[j]] <- tmp.obj$counts_abund_matrices$logCPM[shared_RITIs[[j]],c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)]
shared_RITIs_mt[[j]] <- as.matrix(arrange_all(shared_RITIs_mt[[j]],desc))
shared_RITIs_mt_pcts[[j]] <- tmp.obj$counts_abund_matrices$pct_abund[shared_RITIs[[j]],c(DPlike_Exp2022_05,EXU1_Exp2022_05,EXU2_Exp2022_05,EXU3_Exp2022_05)]
shared_RITIs_mt_pcts[[j]] <- as.matrix(arrange_all(shared_RITIs_mt_pcts[[j]],desc))
}
# exclude BCs that are shared within group from the list of BCs to be re-assigned ####
within_objs <- list()
for(j in names(mylist)){
ids <- NULL
if(j == "Exp2022_05"){
ids <- c("DPlike_Exp2022_05","EXU1_Exp2022_05","EXU2_Exp2022_05","EXU3_Exp2022_05")
}
for(id in ids){
within_objs[[j]][[id]] <- readRDS(paste0("MinPct/",id,"_MaxN_removal_seqtk/MinPct_",id,"_MaxN_removal_objects.rds"))
tmpdf <- as.data.frame(within_objs[[j]][[id]]$aggregationbygroup_melt)
print(head(rownames(tmpdf)))
within_objs[[j]][[id]] <- rownames(tmpdf)[which(tmpdf[[id]] > 1)]
writeLines(text = within_objs[[j]][[id]] , con = paste0(paperdata,"/Shared_barcodes_within_",id,".txt"))
}
}
# the barcodes above (within_objs lists) should be discarded from collision rule - fetching their index ####
index_to_retain <- list()
ratio_max_to_2nd <- list()
reassignment_rule <- list()
mydf_weightedratio <- list()
means <- list()
for(j in names(mylist)){
index_to_retain[[j]] <- which(!rownames(shared_RITIs_mt_pcts[[j]]) %in% unique(unlist(within_objs[[j]])))
barcodes2exclude <- intersect(rownames(shared_RITIs_mt_pcts[[j]]), unique(unlist(within_objs[[j]])))
writeLines(text = barcodes2exclude,
con = paste0("PaperData/barcodes_shared_both_within_excluded_from_collision_",j,".txt"))
shared_RITIs_mt_pcts[[j]] <- shared_RITIs_mt_pcts[[j]][index_to_retain[[j]],]
shared_RITIs_mt[[j]] <- shared_RITIs_mt[[j]][index_to_retain[[j]],]
sds <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sd(x[x > 0])})
maxs <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = max)
means[[j]] <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = mean)
second <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
#reassignment_rule[[j]] <- ifelse((maxs - sds) > second, 1, 0)
#print(reassignment_rule[[j]])
ratio_max_to_2nd[[j]] <- apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = max) / apply(shared_RITIs_mt_pcts[[j]], MARGIN = 1, FUN = function(x) {sort(x, decreasing = T)[2]})
mydf_weightedratio[[j]] <- as.data.frame(ratio_max_to_2nd[[j]])
colnames(mydf_weightedratio[[j]]) <- "ratioMax2Second"
mydf_weightedratio[[j]]$weighted <- mydf_weightedratio[[j]]$ratioMax2Second * means[[j]]
mydf_weightedratio[[j]]$logged <- log10(mydf_weightedratio[[j]]$weighted)
reassignment_rule[[j]] <- ifelse(mydf_weightedratio[[j]]$logged > -1, 1, 0)
names(reassignment_rule[[j]]) <- rownames(mydf_weightedratio[[j]])
saveRDS(object = ratio_max_to_2nd[[j]], file = paste0("PaperData/ratio_max_to_2nd_pcts_",j,"_nofilt_within.rds"))
png(paste0("PaperData/",j,"_Baplot_ratio_max_2nd.png"), width = 9, height = 6, units = "in", res = 200)
barplot(log10(ratio_max_to_2nd[[j]][order(ratio_max_to_2nd[[j]], decreasing = T)]), xaxt='n',
xlab = "Shared barcodes",
main = "log10 ratio max value vs 2nd")
abline(h = 1, col = "red")
dev.off()
}
saveRDS(object = mydf_weightedratio, file = paste0(paperdata,"/mydf_weightedratio.rds"))
saveRDS(object = ratio_max_to_2nd, file = paste0(paperdata,"/ratio_max_to_2nd.rds"))
saveRDS(object = reassignment_rule, file = paste0(paperdata,"/reassignment_rule.rds"))
saveRDS(object = means, file = paste0(paperdata,"/means.rds"))
library(reshape2)
shared_RITIs_mt_melt <- list()
barcodes_to_reassign_A <- list()
barcodes_to_reassign_A_names <- list()
for(j in names(mylist)){
barcodes_to_reassign_A[[j]] <- names(ratio_max_to_2nd[[j]][ratio_max_to_2nd[[j]] > 10]) # filtering
barcodes_to_reassign_A_names[[j]] <- colnames(mylist[[j]]$counts_abund_matrices$pct_abund[barcodes_to_reassign_A[[j]],])[apply(X = mylist[[j]]$counts_abund_matrices$pct_abund[barcodes_to_reassign_A[[j]],], MARGIN = 1, FUN = function(x){which.max(x)})]
names(barcodes_to_reassign_A[[j]]) <- barcodes_to_reassign_A_names[[j]]
saveRDS(object = barcodes_to_reassign_A[[j]], file = paste0("PaperData/barcodes_to_reassign_A_",j,".rds"))
pheatmap::pheatmap(mat = shared_RITIs_mt[[j]], cluster_cols = F, cluster_rows = F,
scale = "none", filename = paste0(paperdata, "Shared_barcodes_RITIs_logCPM_before_reassignment_",j,".png"),
main = paste0("Shared barcodes (",nrow(shared_RITIs_mt[[j]]),") between TIs vs TIs vs RI (replicates)", "\n",
"logCPM counts"),
show_rownames = F,
width = 9, height = 6, color = rev(brewer.pal(n = 9, "Spectral")))
shared_RITIs_mt_melt[[j]] <- as.data.frame(melt(shared_RITIs_mt[[j]]))
png(filename = paste0(paperdata, "Shared_barcodes_RITIs_logCPM_ggplot_before_reassignment_",j,".png"),
width = 9, height = 6, units = "in", res = 300)
print(ggplot(data = shared_RITIs_mt_melt[[j]],
mapping = aes(x = Var2, y = Var1, fill = value)) +
geom_tile() + xlab("Samples") + ylab("Barcodes") +
# scale_x_discrete(guide = guide_axis(n.dodge=2)) +
ggtitle(paste0("Shared barcodes (",nrow(shared_RITIs_mt[[j]]),") between TIs vs TIs vs RI"), subtitle = "logCPM counts") +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
axis.text.y.left = element_blank(), axis.ticks.y.left = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5)) +
scale_fill_distiller(palette = "Spectral", direction = -1))
dev.off()
}
# Note: barcodes shared also within condition are removed from re-assignment ####
for(setting in c("MinPct")){
dir.create(path = setting, showWarnings = F)
if(setting == "Auto"){
custom.cut.param <- FALSE
}else{
custom.cut.param <- TRUE
}
if(setting == "MinCPM"){
MinlogCPMFilter.param <- TRUE
}else{
MinlogCPMFilter.param <- FALSE
}
for (SampleGroup in c("DPlike_Exp2022_05","EXU1_Exp2022_05","EXU2_Exp2022_05","EXU3_Exp2022_05","RITIs_Exp2022_05")) { # pickup only groups that contain 1st transplant samples
chartingBCs_light(bseq.fold = bseqdir, samplesheet = samplesheets[[setting]],
custom.cut = custom.cut.param, do.filter = T,
method = setting, bcodes2reassign = barcodes_to_reassign_A[["Exp2022_05"]],
removeMaxN = TRUE,
MinlogCPMFilter = MinlogCPMFilter.param,
sampleids = SampleIDs[[SampleGroup]], ed = 2, mc = 3,
projectid = paste0(setting,"_",SampleGroup,"_MaxN_removal_reassignedA"),
outfolder = paste(setting,paste0(SampleGroup,"_MaxN_removal","_seqtk_reassignA"),sep = "/"),
counts_vector = c(5,20,50,100,200,500,5000,20000),
pcts_vector = c(0.001,0.002,0.005,0.01,0.02,0.1,0.2,0.5,0.75,1,2))
}
}
#### sharing intra condition #####
library(openxlsx)
library(ggplot2)
library(ggrepel)
# load input for each batch ####
processed_data_A <- list(Exp2022_05 = list())
for(batch in names(processed_data_A)){
processed_data_A[[batch]][["DPlike"]] <- readRDS(paste0("MinPct/DPlike_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_DPlike_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
processed_data_A[[batch]][["EXU1"]] <- readRDS(paste0("MinPct/EXU1_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU1_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
processed_data_A[[batch]][["EXU2"]] <- readRDS(paste0("MinPct/EXU2_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU2_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
processed_data_A[[batch]][["EXU3"]] <- readRDS(paste0("MinPct/EXU3_Exp2022_05_MaxN_removal_seqtk_reassignA/MinPct_EXU3_Exp2022_05_MaxN_removal_reassignedA_objects.rds"))
}
# Sharing intra group
sharing_intra_A <- list(Exp2022_05 = list()) # strategy A - final choice
for(batch in names(sharing_intra_A)){
for(k in c("DPlike","EXU1","EXU2","EXU3")){
sharing_intra_A[[batch]][[k]] <- read.table(file = paste0("MinPct/",k,"_",batch,"_MaxN_removal_seqtk_reassignA/Tables/Sharing_MinPct_",k,"_",batch,"_MaxN_removal_reassignedA.txt"),
nrow = length(SampleIDs[[paste0(k,"_",batch)]]))
}
}
### Plotting intragroup sharing boxplots #####
for(j in names(sharing_intra_A)){
for(d in names(sharing_intra_A[[j]])){
# A type reassignment
sharing_intra_A[[j]][[d]] <- subset(sharing_intra_A[[j]][[d]], select = c("V2","V5"))
sharing_intra_A[[j]][[d]]$Group <- d
colnames(sharing_intra_A[[j]][[d]]) <- c("SampleID","pct.share_1vsAll","Group")
sharing_intra_A[[j]][[d]]$pct.share_1vsAll <- gsub(x = sharing_intra_A[[j]][[d]]$pct.share_1vsAll, pattern = "%", replacement = "")
}
}
colorvector <- c("#D6D6D6","#EA4125","#DCFA8A","#194375")
names(colorvector) <- c("DPlike","EXU1","EXU2","EXU3")
## Plotting
for(j in names(sharing_intra_A)){
df <- rbind(sharing_intra_A[[j]][["DPlike"]], sharing_intra_A[[j]][["EXU1"]])
df <- rbind(df, sharing_intra_A[[j]][["EXU2"]])
df <- rbind(df, sharing_intra_A[[j]][["EXU3"]])
df$pct.share_1vsAll <- as.numeric(df$pct.share_1vsAll)
df$Group <- factor(df$Group, levels = c("DPlike", "EXU1", "EXU2","EXU3"))
write.xlsx(df, file = paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_",j,"_RITIs_reassignmentA.xlsx"), asTable = T)
png(paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_",j,"_RITIs_reassignmentA.png"), width = 6, height = 6, units = "in", res = 300)
print(ggplot(df, mapping = aes(x = Group, y = pct.share_1vsAll, color = Group, label = SampleID)) +
geom_boxplot(outlier.shape = NA) + geom_jitter(position = position_jitter(seed = 6), show.legend = F) +
#geom_text_repel(position = position_jitter(seed = 6), show.legend = F) +
theme_bw() + ylab("% sharing") +
ggtitle(label = "Barcoding sharing intra-group", subtitle = paste0( j, "- One vs All - reassign A")) +
theme(axis.title.x = element_blank(),
legend.position = c(0.15, 0.8),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)))
dev.off()
}
## fix colors and labels for pubblication ####
intra_batch_exp2022_05 <- read.xlsx(paste0(paperdata,"Pct_sharing_1_vs_All_intragroup_Exp2022_05_RITIs_reassignmentA.xlsx"))
# Shannons
shannons_list_A <- list()
for(batch in names(processed_data_A)){
for(k in names(processed_data_A[[batch]])){
shannon <- diversity(processed_data_A[[batch]][[k]]$counts_abund_matrices$Raw, index = "shannon", MARGIN = 2)
shannon_df <- as.data.frame(shannon)
colnames(shannon_df) <- "Shannon"
shannons_list_A[[batch]][[k]] <- shannon_df
}
}
shannon_Exp2022_05_A <- rbind(shannons_list_A$Exp2022_05$DPlike, shannons_list_A$Exp2022_05$EXU1)
shannon_Exp2022_05_A <- rbind(shannon_Exp2022_05_A, shannons_list_A$Exp2022_05$EXU2)
shannon_Exp2022_05_A <- rbind(shannon_Exp2022_05_A, shannons_list_A$Exp2022_05$EXU3)
saveRDS(shannon_Exp2022_05_A, paste0(paperdata,"Shannons.rds"))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment