From 8891fc619a4404d312b8aa10058e61cc067dbb61 Mon Sep 17 00:00:00 2001 From: Matteo Barcella Date: Tue, 25 Jul 2023 17:05:20 +0200 Subject: [PATCH] Adding scripts for differential methylation analysis and heatmap in manuscript --- Methylation/CpG_Heatmaps_Paper.R | 199 +++++++++++++++++ Methylation/MethylAnalysis.R | 305 +++++++++++++++++++++++++++ Methylation/MethylAnalysis_bulk.R | 305 +++++++++++++++++++++++++++ Methylation/MethylAnalysis_bulk_v2.R | 304 ++++++++++++++++++++++++++ 4 files changed, 1113 insertions(+) create mode 100644 Methylation/CpG_Heatmaps_Paper.R create mode 100644 Methylation/MethylAnalysis.R create mode 100644 Methylation/MethylAnalysis_bulk.R create mode 100644 Methylation/MethylAnalysis_bulk_v2.R diff --git a/Methylation/CpG_Heatmaps_Paper.R b/Methylation/CpG_Heatmaps_Paper.R new file mode 100644 index 0000000..ebd2c5d --- /dev/null +++ b/Methylation/CpG_Heatmaps_Paper.R @@ -0,0 +1,199 @@ +#CpGs from position 136669244 to 13666449 (the entire fragment of the intron 4 putative transcription starting site), +#CpGs from position 136670325 to 136670386 (5’ of the intron 7) and +#CpGs from position 136670827 to 136670945 (3’ of the intron 7). + +# load data: + +source("MethylAnalysis.R") + +MethylAnalysis(methcall.folder = "input/", + outfolder = "Adults_vs_Prog", + samplesheet = "SampleSheets_BisulphiteExp_Sets.xlsx", + sheetnum = 2, + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + idcol = "SampleID", + design.var = "TestVar", + case.var = "Case", + control.var = "Controls", + mincoverage = 10, + idstoremove = c("S150310"), + assem = "hg38", + pipeline.meth = "bismarkCoverage", + proj.id = "Adults_vs_Prog_Case_vs_Control", + plot.covariates = c("Condition","MRD","Cytogenetics","TestVar"), + lowperc = 5, + lowcount = 10, + delta.meth = 10, + plot.height = 12, plot.width = 18, + plot.categorical.vars = c("Condition","Cytogenetics"), + plot.continuous.vars = c("miR-126"),cellw = 14, cellh = 8, fontnumsize = 7, fontsize = 7 +) + +library(openxlsx) +library(pheatmap) +library(RColorBrewer) + +# load data pct methylation + +pctmeth_matrix <- readRDS(file = "Adults_vs_Prog/Adults_vs_Prog_Case_vs_Control_CpG_percent_methylation_matrix_pheatmap.rds") + +# fixing colors +annotations <- readRDS("Adults_vs_Prog/Adults_vs_Prog_Case_vs_Control_annotations_matrix_pheatmap.rds") +annotations2 <- annotations + +annotations2$anncolors$Condition <- c("#006600","#929292","#c9c9c9","#b30101") # #006600: verdescuro, #b30101: rosso scuro, grigio-scuro: #929292, grigiochiaro: #c9c9c9 +names(annotations2$anncolors$Condition) <- names(annotations$anncolors$Condition) + +annotations2$anncolors$Cytogenetics <- c("#0080cc","#f6a145","white") # https://www.color-hex.com/color-palette/96440 +names(annotations2$anncolors$Cytogenetics) <- names(annotations$anncolors$Cytogenetics) + +png(filename = "Adults_vs_Prog/Adults_vs_Prog_Case_vs_Control_CpG_percent_methylation_matrix_pheatmap_CUSTOM.png", + width = 15, height = 7, units = "in", res = 300) +pheatmap(mat = pctmeth_matrix, + width = 15, height = 7, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annotations2$annrow, + cellwidth = 14, + cellheight = 13, + display_numbers = T, + fontsize = 10, + fontsize_number = 7, + number_format = "%.0f", + border_color = FALSE, + annotation_col = annotations2$anncol, + legend = F, + annotation_legend = TRUE, + annotation_names_col = T, + #labels_row = lrow, + #labels_col = lcol, + # gaps_row = c(23), + gaps_col = c(9,43), + annotation_colors = annotations2$anncolors, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) +) +dev.off() + +################# +# Disease 12-27 # +################# + +source("MethylAnalysis_bulk.R") + +MethylAnalysis_bulk(methcall.folder = "input/", + outfolder = "S120167_Case_vs_Controls", + samplesheet = "SampleSheets_BisulphiteExp_Sets.xlsx", + sheetnum = 4, + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + idcol = "SampleID", + design.var = "TestVar", + case.var = "POP", + control.var = "DGN", + mincoverage = 10, + idstoremove = NULL, + assem = "hg38", + pipeline.meth = "bismarkCoverage", + proj.id = "S120167_POP_vs_DGN", + plot.covariates = c("Condition","TestVar"), + lowperc = 5, + lowcount = 10, + delta.meth = 10, + plot.height = 12, plot.width = 18, + plot.categorical.vars = c("Condition"), + plot.continuous.vars = c("miR-126"),cellw = 14, cellh = 8, fontnumsize = 7, fontsize = 7 +) + +# RUNNING GFPLOW VS GFPHIGH to get diff meth results and stats + +source("MethylAnalysis_bulk_v2.R") + +MethylAnalysis_bulk_v2(methcall.folder = "input/", + outfolder = "S120167_GFP_L_vs_GFP_H", + samplesheet = "SampleSheets_BisulphiteExp_Sets.xlsx", + sheetnum = 4, + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + idcol = "SampleID", + design.var = "Condition", + case.var = "GFP_L", + control.var = "GFP_H", + mincoverage = 10, + idstoremove = NULL, + assem = "hg38", + pipeline.meth = "bismarkCoverage", + proj.id = "S120167_GFP_L_vs_GFP_H", + plot.covariates = c("Condition"), + lowperc = 5, + lowcount = 10, + delta.meth = 10, + plot.height = 12, plot.width = 18, + plot.categorical.vars = c("Condition"), + plot.continuous.vars = c("miR-126"), + cellw = 14, + cellh = 8, + fontnumsize = 7, + fontsize = 7 +) + +# FIXING THE IMAGE + +pctmeth_matrix <- readRDS(file = "S120167_Case_vs_Controls/S120167_POP_vs_DGN_CpG_percent_methylation_matrix_pheatmap.rds") + +annotations <- readRDS("S120167_Case_vs_Controls/S120167_POP_vs_DGN_annotations_matrix_pheatmap.rds") +annotations_GFP_L_vs_GFP_H <- readRDS("S120167_GFP_L_vs_GFP_H/S120167_GFP_L_vs_GFP_H_annotations_matrix_pheatmap.rds") + +# replacing diff meth and GFP_L vs GFP_H qvalue_r in POP vs DGN chart + +annotations$anncol$qvalue_r <- annotations_GFP_L_vs_GFP_H$anncol$qvalue_r +annotations$anncol$meth.diff <- annotations_GFP_L_vs_GFP_H$anncol$meth.diff + +annotations2 <- annotations + +annotations2$anncolors$Condition <- c("#656161","#0b64a1","#df4041") # #006600: verdescuro, #b30101: rosso scuro, grigio-scuro: #929292, grigiochiaro: #c9c9c9 +names(annotations2$anncolors$Condition) <- names(annotations$anncolors$Condition) + +annotations2$anncolors$qvalue_r <- annotations_GFP_L_vs_GFP_H$anncolors$qvalue_r +annotations2$anncolors$meth.diff <- annotations_GFP_L_vs_GFP_H$anncolors$meth.diff + +annotations2$annrow$`miR-126` <- NULL + +# plot heatmap custom + +png(filename = "S120167_Case_vs_Controls/S120167_GFP_Lvs_GFP_H_CpG_percent_methylation_matrix_pheatmap_CUSTOM_CpG_main.png", + width = 15, height = 5, units = "in", res = 300) +pheatmap(mat = pctmeth_matrix[,pctmeth_matrix_ADULTs_CpG], + #filename = "S10272_Case_vs_Controls/S10272_GFP_Lvs_GFP_H_CpG_percent_methylation_matrix_pheatmap_CUSTOM.png", + width = 15, height = 7, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annotations2$annrow, + cellwidth = 14, + cellheight = 13, + display_numbers = T, + fontsize = 10, + fontsize_number = 7, + number_format = "%.0f", + border_color = FALSE, + annotation_col = annotations2$anncol, + legend = F, + annotation_legend = TRUE, + annotation_names_col = T, + #labels_row = lrow, + #labels_col = lcol, + # gaps_row = c(23), + gaps_col = c(9,43), + annotation_colors = annotations2$anncolors, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) +) +dev.off() + diff --git a/Methylation/MethylAnalysis.R b/Methylation/MethylAnalysis.R new file mode 100644 index 0000000..0f36d4d --- /dev/null +++ b/Methylation/MethylAnalysis.R @@ -0,0 +1,305 @@ + +MethylAnalysis <- function(methcall.folder = NULL, # input folder with .cov files (bismark) + outfolder = NULL, # output folder (if not exist it will be created) + proj.id = "myproject", # project id to prefix in putput files + samplesheet = NULL, + sheetnum = 1, + design.var = "Disease", # variable to subset samplesheet + case.var = "Case", # 1 in treatment vector + control.var = "Control", # 0 in treatment vector + idcol = "SampleID", + mincoverage = 10, + lowperc = 5, + lowcount = 10, + assem="hg38", + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + pipeline.meth = "bismarkCoverage", + plot.covariates = c("Condition","Batch","MRD", + "Torelapse","Chemorefractory", + "Sex","CytoA","Tissue"), + idstoremove = NULL, + delta.meth = 10, + plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"), + plot.continuous.vars = c("miR-126","egfl7"), + plot.height = 9, + plot.width = 12, + cellw = 15, + cellh = 15, + fontnumsize = 5, + fontsize = 8 + ){ + + # load libraries + + require(methylKit) + require(ggplot2) + require(reshape2) + require(plyr) + require(ggrepel) + library(GenomicRanges) + library(openxlsx) + library(pheatmap) + + # check vars + + if(is.null(methcall.folder)){ + stop("Please set methcall.folder") + } + + if(is.null(outfolder)){ + stop("Please set outfolder") + } + + if(is.null(samplesheet)){ + stop("Please set samplesheet") + } + + # Setup variables + + infolder = paste0(methcall.folder, "/") + dir.create(infolder, showWarnings = F) + + outfolder = paste0(outfolder, "/") + dir.create(outfolder, showWarnings = F) + + qcfolder <- paste0(outfolder, "QC/") + dir.create(qcfolder, showWarnings = F) + + + # reading sample sheet with metadata + ssheet <- read.xlsx(samplesheet, sheet = sheetnum) + + if(!is.null(idstoremove)){ + ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove) + } + + # defining design vector according to variable + + ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var)) + + d.vec <- ssheet[[design.var]] + d.vector <- ifelse(d.vec == case.var, 1, 0) + + # initialzing coverage .cov files list + covs <- list() + sampleids <- as.list(ssheet[[idcol]]) + names(sampleids) <- ssheet[[idcol]] + + for (i in ssheet[[idcol]]) { + covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov") + } + + saveRDS(object = covs, file = "covs_object.rds") + + # creating object + + myobj <- methRead(location = covs, + sample.id = sampleids, + assembly = assem, + pipeline = pipeline.meth, + treatment = d.vector, + context="CpG", + mincov = mincoverage) + + saveRDS(myobj, file = "Myobject.rds") + + names(myobj) <- ssheet[[idcol]] + + saveRDS(myobj, file = "Myobject_with_names.rds") + + # subsetting if declared + + if(isTRUE(do.subset)){ + my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset)) + myobj <- selectByOverlap(myobj,my.win) + } + + + saveRDS(myobj, file = "Myobject_with_names_aftersubset.rds") + + # filtering on minimum coverage + myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc) + + # Normalization + myobj <- normalizeCoverage(obj = myobj) + + # saveRDS(object = myobj, file = "Initial_object.rds") + + # Calculate basic stats and PCs + + metricsfolder <- paste0(qcfolder, "Metrics/") + dir.create(path = metricsfolder, showWarnings = F) + + for (id in ssheet[[idcol]]) { + + png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + } + + # create meth obj + + #meth <- unite(object = myobj, destrand=FALSE) + meth <- unite(object = myobj, destrand=FALSE) # for debugging + saveRDS(meth, "savemeth.tmp.rds") + # Perform correlation + + sink(paste0(qcfolder, proj.id, "_Correlations.txt")) + getCorrelation(meth,plot=FALSE) + sink() + + if(length(ssheet[[idcol]]) < 15){ + png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"), + width = 9, height = 6, units = "in", res = 96) + print(getCorrelation(meth,plot=TRUE)) + dev.off() + } + + png(filename = paste0(qcfolder, proj.id, "_Clustering.png"), + width = 9, height = 6, units = "in", res = 96) + clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2") + dev.off() + + # Re-plotting PCs (custom chart) + + # compute PCs and store in object + + pca_compt <- PCASamples(meth, obj.return = T, screeplot = F) + + # extract PCs components + + pcafolder <- paste0(qcfolder, "PCA/") + dir.create(path = pcafolder, showWarnings = F) + + pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2]) + + for(myvars in plot.covariates){ + pca_pc1_2$condition <- as.factor(ssheet[[myvars]]) + png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"), + width = 9, height = 6, units = "in",res=96) + print(ggplot(data = pca_pc1_2, + mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) + + geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) + + theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) + + theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) + + theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) + + theme(legend.text = element_text(size=8, hjust=0.5)) + + theme(legend.title = element_blank()) + + theme(axis.text = element_text(size=12, hjust=0.5, color="black"))) + dev.off() + + } + + # retrieve and store % of methylation + + perc.meth <- percMethylation(meth) + + saveRDS(perc.meth,"pctmethly.rds") + + base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start) + + # Perform diff methylation + + myDiff=calculateDiffMeth(meth) + + write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T) + + difftest$comparison <- proj.id + difftest$qvalue_r <- as.character(cut(x = difftest$qvalue, + breaks = c(-1, 1e-100, 1e-10, 1e-02, 1), + labels = c("***","**","*","ns"))) + + myindex <- abs(difftest$meth.diff) < delta.meth + difftest$meth.diff <- abs(difftest$meth.diff) + difftest$qvalue_r[myindex] <- "ns" + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + + + # Adding color list and annotations + + library(RColorBrewer) + + color_list <- list() + annrows <- NULL + anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff")) + base::rownames(anncols) <- difftest$start + + rows.annot.vars.cat = plot.categorical.vars + rows.annot.vars.con = plot.continuous.vars + + if(!is.null(rows.annot.vars.cat) | !is.null(rows.annot.vars.con)){ + rownames(ssheet) <- ssheet$SampleID + annrows <- subset.data.frame(x = ssheet, select = c(rows.annot.vars.cat, rows.annot.vars.con)) + concolors <- RColorBrewer::brewer.pal(n = 9, name = "Set1") + catcolors <- NULL + for (varcon in 1:length(rows.annot.vars.con)) { + color_list[[rows.annot.vars.con[varcon]]] <- colorRampPalette(c("lightgrey", concolors[varcon]))(10) + } + for (varcat in 1:length(rows.annot.vars.cat)) { + ncolor <- 1 + for (val in unique(ssheet[[rows.annot.vars.cat[varcat]]])[order(unique(ssheet[[rows.annot.vars.cat[varcat]]]))]){ + if(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])) > 9){ + catcolors <- colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = "Set3"))(length(unique(ssheet[[rows.annot.vars.cat[varcat]]]))) + } + else{ + catcolors <- RColorBrewer::brewer.pal(n = 9, name = "Set3") + } + color_list[[rows.annot.vars.cat[varcat]]][[val]] <- catcolors[ncolor] + ncolor <- ncolor + 1 + } + } + } + + color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf") + color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 11, "Reds"))(100) + color_list[["miR-126"]] <- brewer.pal(n = 9, "PuRd") + + # Heatmap + + pctmeth_matrix <- t(perc.meth) + base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "") + + pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "), + filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"), width = plot.width, height = plot.height, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annrows, + cellwidth = cellw, + cellheight = cellh, + display_numbers = T, + fontsize = fontsize, + fontsize_number = fontnumsize, + number_format = "%.0f", + #border_color = "#CBBEB5", + annotation_col = anncols, + #labels_row = lrow, + #labels_col = lcol, + #gaps_row = gaps.row, + gaps_col = c(8), + annotation_colors = color_list, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) + ) + + saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds")) + saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds")) + saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds")) + saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds")) + + write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt")) + write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt")) + + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds")) + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds")) + +} \ No newline at end of file diff --git a/Methylation/MethylAnalysis_bulk.R b/Methylation/MethylAnalysis_bulk.R new file mode 100644 index 0000000..96c9964 --- /dev/null +++ b/Methylation/MethylAnalysis_bulk.R @@ -0,0 +1,305 @@ + +MethylAnalysis_bulk <- function(methcall.folder = NULL, # input folder with .cov files (bismark) + outfolder = NULL, # output folder (if not exist it will be created) + proj.id = "myproject", # project id to prefix in putput files + samplesheet = NULL, + sheetnum = 1, + design.var = "Disease", # variable to subset samplesheet + case.var = "Case", # 1 in treatment vector + control.var = "Control", # 0 in treatment vector + idcol = "SampleID", + mincoverage = 10, + lowperc = 5, + lowcount = 10, + assem="hg38", + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + pipeline.meth = "bismarkCoverage", + plot.covariates = c("Condition","Batch","MRD", + "Torelapse","Chemorefractory", + "Sex","CytoA","Tissue"), + idstoremove = NULL, + delta.meth = 10, + plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"), + plot.continuous.vars = c("miR-126","egfl7"), + plot.height = 9, + plot.width = 12, + cellw = 15, + cellh = 15, + fontnumsize = 5, + fontsize = 8 + ){ + + # load libraries + + require(methylKit) + require(ggplot2) + require(reshape2) + require(plyr) + require(ggrepel) + library(GenomicRanges) + library(openxlsx) + library(pheatmap) + + # check vars + + if(is.null(methcall.folder)){ + stop("Please set methcall.folder") + } + + if(is.null(outfolder)){ + stop("Please set outfolder") + } + + if(is.null(samplesheet)){ + stop("Please set samplesheet") + } + + # Setup variables + + infolder = paste0(methcall.folder, "/") + dir.create(infolder, showWarnings = F) + + outfolder = paste0(outfolder, "/") + dir.create(outfolder, showWarnings = F) + + qcfolder <- paste0(outfolder, "QC/") + dir.create(qcfolder, showWarnings = F) + + + # reading sample sheet with metadata + ssheet <- read.xlsx(samplesheet, sheet = sheetnum) + + if(!is.null(idstoremove)){ + ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove) + } + + # defining design vector according to variable + + ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var)) + + d.vec <- ssheet[[design.var]] + d.vector <- ifelse(d.vec == case.var, 1, 0) + + # initialzing coverage .cov files list + covs <- list() + sampleids <- as.list(ssheet[[idcol]]) + names(sampleids) <- ssheet[[idcol]] + + for (i in ssheet[[idcol]]) { + covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov") + } + + saveRDS(object = covs, file = "covs_object.rds") + + # creating object + + myobj <- methRead(location = covs, + sample.id = sampleids, + assembly = assem, + pipeline = pipeline.meth, + treatment = d.vector, + context="CpG", + mincov = mincoverage) + + saveRDS(myobj, file = "Myobject.rds") + + names(myobj) <- ssheet[[idcol]] + + saveRDS(myobj, file = "Myobject_with_names.rds") + + # subsetting if declared + + if(isTRUE(do.subset)){ + my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset)) + myobj <- selectByOverlap(myobj,my.win) + } + + + saveRDS(myobj, file = "Myobject_with_names_aftersubset.rds") + + # filtering on minimum coverage + myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc) + + # Normalization + myobj <- normalizeCoverage(obj = myobj) + + # saveRDS(object = myobj, file = "Initial_object.rds") + + # Calculate basic stats and PCs + + metricsfolder <- paste0(qcfolder, "Metrics/") + dir.create(path = metricsfolder, showWarnings = F) + + for (id in ssheet[[idcol]]) { + + png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + } + + # create meth obj + + #meth <- unite(object = myobj, destrand=FALSE) + meth <- unite(object = myobj, destrand=FALSE) # for debugging + saveRDS(meth, "savemeth.tmp.rds") + # Perform correlation + + sink(paste0(qcfolder, proj.id, "_Correlations.txt")) + getCorrelation(meth,plot=FALSE) + sink() + + if(length(ssheet[[idcol]]) < 15){ + png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"), + width = 9, height = 6, units = "in", res = 96) + print(getCorrelation(meth,plot=TRUE)) + dev.off() + } + + png(filename = paste0(qcfolder, proj.id, "_Clustering.png"), + width = 9, height = 6, units = "in", res = 96) + clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2") + dev.off() + + # Re-plotting PCs (custom chart) + + # compute PCs and store in object + + pca_compt <- PCASamples(meth, obj.return = T, screeplot = F) + + # extract PCs components + + pcafolder <- paste0(qcfolder, "PCA/") + dir.create(path = pcafolder, showWarnings = F) + + pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2]) + + for(myvars in plot.covariates){ + pca_pc1_2$condition <- as.factor(ssheet[[myvars]]) + png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"), + width = 9, height = 6, units = "in",res=96) + print(ggplot(data = pca_pc1_2, + mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) + + geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) + + theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) + + theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) + + theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) + + theme(legend.text = element_text(size=8, hjust=0.5)) + + theme(legend.title = element_blank()) + + theme(axis.text = element_text(size=12, hjust=0.5, color="black"))) + dev.off() + + } + + # retrieve and store % of methylation + + perc.meth <- percMethylation(meth) + + saveRDS(perc.meth,"pctmethly.rds") + + base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start) + + # Perform diff methylation + + myDiff=calculateDiffMeth(meth) + + write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T) + + difftest$comparison <- proj.id + difftest$qvalue_r <- as.character(cut(x = difftest$qvalue, + breaks = c(-1, 1e-100, 1e-10, 1e-02, 1), + labels = c("***","**","*","ns"))) + + myindex <- abs(difftest$meth.diff) < delta.meth + difftest$meth.diff <- abs(difftest$meth.diff) + difftest$qvalue_r[myindex] <- "ns" + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + + + # Adding color list and annotations + + library(RColorBrewer) + + color_list <- list() + annrows <- NULL + anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff")) + base::rownames(anncols) <- difftest$start + + rows.annot.vars.cat = plot.categorical.vars + rows.annot.vars.con = plot.continuous.vars + + if(!is.null(rows.annot.vars.cat) | !is.null(rows.annot.vars.con)){ + rownames(ssheet) <- ssheet$SampleID + annrows <- subset.data.frame(x = ssheet, select = c(rows.annot.vars.cat, rows.annot.vars.con)) + concolors <- RColorBrewer::brewer.pal(n = 9, name = "Set1") + catcolors <- NULL + for (varcon in 1:length(rows.annot.vars.con)) { + color_list[[rows.annot.vars.con[varcon]]] <- colorRampPalette(c("lightgrey", concolors[varcon]))(10) + } + for (varcat in 1:length(rows.annot.vars.cat)) { + ncolor <- 1 + for (val in unique(ssheet[[rows.annot.vars.cat[varcat]]])[order(unique(ssheet[[rows.annot.vars.cat[varcat]]]))]){ + if(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])) > 9){ + catcolors <- colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = "Set3"))(length(unique(ssheet[[rows.annot.vars.cat[varcat]]]))) + } + else{ + catcolors <- RColorBrewer::brewer.pal(n = 9, name = "Set3") + } + color_list[[rows.annot.vars.cat[varcat]]][[val]] <- catcolors[ncolor] + ncolor <- ncolor + 1 + } + } + } + + color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf") + color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 11, "Reds"))(100) + color_list[["miR-126"]] <- brewer.pal(n = 9, "PuRd") + + # Heatmap + + pctmeth_matrix <- t(perc.meth) + base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "") + + pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "), + filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"), width = plot.width, height = plot.height, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annrows, + cellwidth = cellw, + cellheight = cellh, + display_numbers = T, + fontsize = fontsize, + fontsize_number = fontnumsize, + number_format = "%.0f", + #border_color = "#CBBEB5", + annotation_col = anncols, + #labels_row = lrow, + #labels_col = lcol, + #gaps_row = gaps.row, + gaps_col = c(8), + annotation_colors = color_list, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) + ) + + saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds")) + saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds")) + saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds")) + saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds")) + + write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt")) + write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt")) + + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds")) + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds")) + +} \ No newline at end of file diff --git a/Methylation/MethylAnalysis_bulk_v2.R b/Methylation/MethylAnalysis_bulk_v2.R new file mode 100644 index 0000000..09956ef --- /dev/null +++ b/Methylation/MethylAnalysis_bulk_v2.R @@ -0,0 +1,304 @@ + +MethylAnalysis_bulk_v2 <- function(methcall.folder = NULL, # input folder with .cov files (bismark) + outfolder = NULL, # output folder (if not exist it will be created) + proj.id = "myproject", # project id to prefix in putput files + samplesheet = NULL, + sheetnum = 1, + design.var = "Disease", # variable to subset samplesheet + case.var = "Case", # 1 in treatment vector + control.var = "Control", # 0 in treatment vector + idcol = "SampleID", + mincoverage = 10, + lowperc = 5, + lowcount = 10, + assem="hg38", + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + pipeline.meth = "bismarkCoverage", + plot.covariates = c("Condition","Batch","MRD", + "Torelapse","Chemorefractory", + "Sex","CytoA","Tissue"), + idstoremove = NULL, + delta.meth = 10, + plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"), + plot.continuous.vars = c("miR-126","egfl7"), + plot.height = 9, + plot.width = 12, + cellw = 15, + cellh = 15, + fontnumsize = 5, + fontsize = 8 + ){ + + # load libraries + + require(methylKit) + require(ggplot2) + require(reshape2) + require(plyr) + require(ggrepel) + library(GenomicRanges) + library(openxlsx) + library(pheatmap) + + # check vars + + if(is.null(methcall.folder)){ + stop("Please set methcall.folder") + } + + if(is.null(outfolder)){ + stop("Please set outfolder") + } + + if(is.null(samplesheet)){ + stop("Please set samplesheet") + } + + # Setup variables + + infolder = paste0(methcall.folder, "/") + dir.create(infolder, showWarnings = F) + + outfolder = paste0(outfolder, "/") + dir.create(outfolder, showWarnings = F) + + qcfolder <- paste0(outfolder, "QC/") + dir.create(qcfolder, showWarnings = F) + + + # reading sample sheet with metadata + ssheet <- read.xlsx(samplesheet, sheet = sheetnum) + + if(!is.null(idstoremove)){ + ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove) + } + + # defining design vector according to variable + + ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var)) + + d.vec <- ssheet[[design.var]] + d.vector <- ifelse(d.vec == case.var, 1, 0) + + # initialzing coverage .cov files list + covs <- list() + sampleids <- as.list(ssheet[[idcol]]) + names(sampleids) <- ssheet[[idcol]] + + for (i in ssheet[[idcol]]) { + covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov") + } + + saveRDS(object = covs, file = "covs_object.rds") + + # creating object + + myobj <- methRead(location = covs, + sample.id = sampleids, + assembly = assem, + pipeline = pipeline.meth, + treatment = d.vector, + context="CpG", + mincov = mincoverage) + + saveRDS(myobj, file = "Myobject.rds") + + names(myobj) <- ssheet[[idcol]] + + saveRDS(myobj, file = "Myobject_with_names.rds") + + # subsetting if declared + + if(isTRUE(do.subset)){ + my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset)) + myobj <- selectByOverlap(myobj,my.win) + } + + + saveRDS(myobj, file = "Myobject_with_names_aftersubset.rds") + + # filtering on minimum coverage + myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc) + + # Normalization + myobj <- normalizeCoverage(obj = myobj) + + # saveRDS(object = myobj, file = "Initial_object.rds") + + # Calculate basic stats and PCs + + metricsfolder <- paste0(qcfolder, "Metrics/") + dir.create(path = metricsfolder, showWarnings = F) + + for (id in ssheet[[idcol]]) { + + png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + } + + # create meth obj + + #meth <- unite(object = myobj, destrand=FALSE) + meth <- unite(object = myobj, destrand=FALSE) # for debugging + saveRDS(meth, "savemeth.tmp.rds") + # Perform correlation + + sink(paste0(qcfolder, proj.id, "_Correlations.txt")) + getCorrelation(meth,plot=FALSE) + sink() + + if(length(ssheet[[idcol]]) < 15){ + png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"), + width = 9, height = 6, units = "in", res = 96) + print(getCorrelation(meth,plot=TRUE)) + dev.off() + } + + png(filename = paste0(qcfolder, proj.id, "_Clustering.png"), + width = 9, height = 6, units = "in", res = 96) + clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2") + dev.off() + + # Re-plotting PCs (custom chart) + + # compute PCs and store in object + + pca_compt <- PCASamples(meth, obj.return = T, screeplot = F) + + # extract PCs components + + pcafolder <- paste0(qcfolder, "PCA/") + dir.create(path = pcafolder, showWarnings = F) + + pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2]) + + for(myvars in plot.covariates){ + pca_pc1_2$condition <- as.factor(ssheet[[myvars]]) + png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"), + width = 9, height = 6, units = "in",res=96) + print(ggplot(data = pca_pc1_2, + mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) + + geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) + + theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) + + theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) + + theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) + + theme(legend.text = element_text(size=8, hjust=0.5)) + + theme(legend.title = element_blank()) + + theme(axis.text = element_text(size=12, hjust=0.5, color="black"))) + dev.off() + + } + + # retrieve and store % of methylation + + perc.meth <- percMethylation(meth) + + saveRDS(perc.meth,"pctmethly.rds") + + base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start) + + # Perform diff methylation + + myDiff=calculateDiffMeth(meth) + + write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T) + + difftest$comparison <- proj.id + difftest$qvalue_r <- as.character(cut(x = difftest$qvalue, + breaks = c(-1, 1e-100, 1e-10, 1e-02, 1), + labels = c("***","**","*","ns"))) + + myindex <- abs(difftest$meth.diff) < delta.meth + difftest$meth.diff <- abs(difftest$meth.diff) + difftest$qvalue_r[myindex] <- "ns" + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + + + # Adding color list and annotations + + library(RColorBrewer) + + color_list <- list() + annrows <- NULL + anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff")) + base::rownames(anncols) <- difftest$start + + rows.annot.vars.cat = plot.categorical.vars + rows.annot.vars.con = plot.continuous.vars + + # if(!is.null(rows.annot.vars.cat) | !is.null(rows.annot.vars.con)){ + # rownames(ssheet) <- ssheet$SampleID + # annrows <- subset.data.frame(x = ssheet, select = c(rows.annot.vars.cat, rows.annot.vars.con)) + # concolors <- RColorBrewer::brewer.pal(n = 9, name = "Set1") + # catcolors <- NULL + # for (varcon in 1:length(rows.annot.vars.con)) { + # color_list[[rows.annot.vars.con[varcon]]] <- colorRampPalette(c("lightgrey", concolors[varcon]))(10) + # } + # for (varcat in 1:length(rows.annot.vars.cat)) { + # ncolor <- 1 + # for (val in unique(ssheet[[rows.annot.vars.cat[varcat]]])[order(unique(ssheet[[rows.annot.vars.cat[varcat]]]))]){ + # if(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])) > 9){ + # catcolors <- colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = "Set3"))(length(unique(ssheet[[rows.annot.vars.cat[varcat]]]))) + # } + # else{ + # catcolors <- RColorBrewer::brewer.pal(n = 9, name = "Set3") + # } + # color_list[[rows.annot.vars.cat[varcat]]][[val]] <- catcolors[ncolor] + # ncolor <- ncolor + 1 + # } + # } + # } + + color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf") + color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 11, "Reds"))(100) + color_list[["miR-126"]] <- brewer.pal(n = 9, "PuRd") + + # Heatmap + + pctmeth_matrix <- t(perc.meth) + base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "") + + pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "), + filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"), width = plot.width, height = plot.height, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annrows, + cellwidth = cellw, + cellheight = cellh, + display_numbers = T, + fontsize = fontsize, + fontsize_number = fontnumsize, + number_format = "%.0f", + #border_color = "#CBBEB5", + annotation_col = anncols, + #labels_row = lrow, + #labels_col = lcol, + #gaps_row = gaps.row, + gaps_col = c(8), + annotation_colors = color_list, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) + ) + + saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds")) + saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds")) + saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds")) + saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds")) + + write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt")) + write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt")) + + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds")) + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds")) + +} \ No newline at end of file -- GitLab