Bulk RNAseq related scripts

01d2c18d · Matteo Barcella · 5612725c · 01d2c18d · 01d2c18d · 01d2c18d
Commit 01d2c18d authored Feb 06, 2023 by Matteo Barcella
--- a/BulkRNAseq/Figure_4A.R
+++ b/BulkRNAseq/Figure_4A.R
+## PCA
+library(ggplot2)
+library(ggforce)
+library(openxlsx)
+
+P <- read.xlsx("Figure_4A_data.xlsx", sheet = 1, rowNames = T)
+
+svg(filename = "PCAplot_1000genes.svg", width = 6, height = 4.5)
+ggplot(data = P, aes(x = PC1, y = PC2)) +
+  geom_point(aes(color = Gate, shape = Treatment), size = 3) +
+  ggtitle(label = "Principal Component Analysis (PCA)",
+          subtitle = "Top 1000 genes") +
+  scale_color_manual(values = c(GFP_LOW = "#4aab23", GFP_HIGH = "#515151")) +
+  scale_shape_manual(values = c(Control = 2, Treated = 19)) +
+  # stat_ellipse(aes(group = Patient), level = 0.975, linetype = 2) +
+  geom_mark_ellipse(aes(label = Patient, group = Patient), con.border = "none", con.type = "elbow", linetype = 2) +
+  xlab("PC1: 31% variance") + ylab("PC2: 21% variance") +
+  theme_bw() + theme(axis.text = element_text(size = 12), 
+                     axis.title = element_text(size = 12), 
+                     legend.text = element_text(size = 12),
+                     legend.title = element_text(size = 12), 
+                     plot.title = element_text(size = 12))
+dev.off()
+
+Q <- read.xlsx("Figure_4A_data.xlsx", sheet = 2, rowNames = T)
+
+svg(filename = "PCAplot_1000genes_batch_corr.svg", width = 6, height = 4.5)
+ggplot(data = Q, aes(x = PC1, y = PC2)) +
+  geom_point(aes(color = Gate, shape = Treatment), size = 3) +
+  ggtitle(label = "Principal Component Analysis (PCA)",
+          subtitle = "Top 1000 genes, batch corrected") +
+  scale_color_manual(values = c(GFP_LOW = "#4aab23", GFP_HIGH = "#515151")) +
+  scale_shape_manual(values = c(Control = 2, Treated = 19)) +
+  xlab("PC1: 24% variance") + ylab("PC2: 14% variance") +
+  theme_bw() + theme(axis.text = element_text(size = 12), 
+                     axis.title = element_text(size = 12), 
+                     legend.text = element_text(size = 12),
+                     legend.title = element_text(size = 12), 
+                     plot.title = element_text(size = 12))
+dev.off()
+
+
--- a/BulkRNAseq/Figure_4A_data.xlsx
+++ b/BulkRNAseq/Figure_4A_data.xlsx
--- a/BulkRNAseq/Figure_4B.R
+++ b/BulkRNAseq/Figure_4B.R
+# Custom randomWalks figure 4B  
+
+suppressPackageStartupMessages(library('dplyr'))
+suppressPackageStartupMessages(library('clusterProfiler'))
+suppressPackageStartupMessages(library('png'))
+suppressPackageStartupMessages(library('enrichplot'))
+suppressPackageStartupMessages(library('DOSE'))
+suppressPackageStartupMessages(library('httr'))
+suppressPackageStartupMessages(library('magrittr'))
+suppressPackageStartupMessages(library('ggrepel'))
+suppressPackageStartupMessages(library('cowplot'))
+suppressPackageStartupMessages(library('RColorBrewer'))
+suppressPackageStartupMessages(library('enrichplot'))
+suppressPackageStartupMessages(library('openxlsx'))
+
+Figure_4B_data <- readRDS(file = "Figure_4B_data.rds")
+
+gseaScores <- getFromNamespace("gseaScores", "DOSE")
+
+gsInfo <- function(object, geneSetID) {
+  geneList <- object@geneList
+  if (is.numeric(geneSetID))
+    geneSetID <- object@result[geneSetID, "ID"]
+  geneSet <- object@geneSets[[geneSetID]]
+  exponent <- object@params[["exponent"]]
+  df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
+  df$ymin=0
+  df$ymax=0
+  pos <- df$position == 1
+  h <- diff(range(df$runningScore))/20
+  df$ymin[pos] <- -h
+  df$ymax[pos] <- h
+  df$geneList <- geneList
+  df$Description <- object@result[geneSetID, "Description"]
+  return(df)
+}
+
+gseaplot <- function (x, geneSetID, title = "", color = "green", base_size = 11, 
+                       rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = FALSE, 
+                       ES_geom = "line", lp1 = 0.2, lp2 = 0.2, linesize = 1.5, labsize = 7) 
+{
+  ES_geom <- match.arg(ES_geom, c("line", "dot"))
+  geneList <- position <- NULL
+  if (length(geneSetID) == 1) {
+    gsdata <- gsInfo(x, geneSetID)
+  }
+  else {
+    gsdata <- do.call(rbind, lapply(geneSetID, gsInfo, object = x))
+  }
+  p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + theme_classic(base_size) + 
+    theme(panel.grid.major = element_line(colour = "grey92"), 
+          panel.grid.minor = element_line(colour = "grey92"), 
+          panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + 
+    scale_x_continuous(expand = c(0, 0))
+  if (ES_geom == "line") {
+    es_layer <- geom_line(aes_(y = ~runningScore, color = ~Description), 
+                          size = linesize)
+  }
+  else {
+    es_layer <- geom_point(aes_(y = ~runningScore, color = ~Description), 
+                           size = 1, data = subset(gsdata, position == 1))
+  }
+  p.res <- p + es_layer + theme(legend.position = c(lp1,lp2), 
+                                legend.text = element_text(size = labsize),
+                                legend.key.height = unit(0.4, 'cm'),
+                                legend.key.width = unit(0.4, 'cm'), 
+                                #legend.key = element_rect(fill = NULL),
+                                #legend.key = element_blank(),
+                                legend.title = element_blank(), 
+                                legend.background = element_rect(fill = "transparent"))
+  p.res <- p.res + ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), 
+                                                            axis.title.y = element_text(size = 10),
+                                                            axis.text.y.left = element_text(size = 10),
+                                                            axis.ticks.x = element_blank(), axis.line.x = element_blank(), 
+                                                            plot.margin = margin(t = 0.2, r = 0.2, b = 0, l = 0.2, 
+                                                                                 unit = "cm"))
+  i <- 0
+  for (term in unique(gsdata$Description)) {
+    idx <- which(gsdata$ymin != 0 & gsdata$Description == 
+                   term)
+    gsdata[idx, "ymin"] <- i
+    gsdata[idx, "ymax"] <- i + 1
+    i <- i + 1
+  }
+  p2 <- ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, 
+                                                           ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) + 
+    theme_classic(base_size) + theme(legend.position = "none", 
+                                     plot.margin = margin(t = -0.1, b = 0, unit = "cm"), axis.ticks = element_blank(), 
+                                     axis.text = element_blank(), axis.line.x = element_blank()) + 
+    scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 
+                                                                         0))
+  if (length(geneSetID) == 1) {
+    v <- seq(1, sum(gsdata$position), length.out = 9)
+    inv <- findInterval(rev(cumsum(gsdata$position)), v)
+    if (min(inv) == 0) 
+      inv <- inv + 1
+    col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
+    ymin <- min(p2$data$ymin)
+    yy <- max(p2$data$ymax - p2$data$ymin) * 0.3
+    xmin <- which(!duplicated(inv))
+    xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
+    d <- data.frame(ymin = ymin, ymax = yy, xmin = xmin, 
+                    xmax = xmax, col = col[unique(inv)])
+    p2 <- p2 + geom_rect(aes_(xmin = ~xmin, xmax = ~xmax, 
+                              ymin = ~ymin, ymax = ~ymax, fill = ~I(col)), data = d, 
+                         alpha = 0.9, inherit.aes = FALSE)
+  }
+  df2 <- p$data
+  df2$y <- p$data$geneList[df2$x]
+  p.pos <- p + geom_segment(data = df2, aes_(x = ~x, xend = ~x, 
+                                             y = ~y, yend = 0), color = "grey")
+  p.pos <- p.pos + ylab("Ranked List Metric") + xlab("Rank in Ordered Dataset") + 
+    theme(plot.margin = margin(t = -0.1, r = 0.2, b = 0.2, 
+                               l = 0.2, unit = "cm"), 
+          axis.title = element_text(size = 10), axis.text = element_text(size = 10))
+  if (!is.null(title) && !is.na(title) && title != "") 
+    p.res <- p.res + ggtitle(title)
+  if (length(color) == length(geneSetID)) {
+    p.res <- p.res + scale_color_manual(values = color)
+    if (length(color) == 1) {
+      p.res <- p.res + theme(legend.position = "none")
+      p2 <- p2 + scale_color_manual(values = "black")
+    }
+    else {
+      p2 <- p2 + scale_color_manual(values = color)
+    }
+  }
+  if (pvalue_table) {
+    pd <- x[geneSetID, c("Description", "pvalue", "p.adjust")]
+    pd <- pd[order(pd[, 1], decreasing = FALSE), ]
+    rownames(pd) <- pd$Description
+    pd <- pd[, -1]
+    pd <- round(pd, 4)
+    tp <- tableGrob2(pd, p.res)
+    p.res <- p.res + theme(legend.position = "none")# + 
+    annotation_custom(tp, 
+                      xmin = quantile(p.res$data$x, 0.5), 
+                      xmax = quantile(p.res$data$x, 0.95), 
+                      ymin = quantile(p.res$data$runningScore,0.75), 
+                      ymax = quantile(p.res$data$runningScore,0.9))
+  }
+  plotlist <- list(p.res, p2, p.pos)[subplots]
+  n <- length(plotlist)
+  plotlist[[n]] <- plotlist[[n]] + theme(axis.line.x = element_line(), 
+                                         axis.ticks.x = element_line(), axis.text.x = element_text())
+  if (length(subplots) == 1) 
+    return(plotlist[[1]] + theme(plot.margin = margin(t = 0.2, 
+                                                      r = 0.2, b = 0.2, l = 0.2, unit = "cm")))
+  if (length(rel_heights) > length(subplots)) 
+    rel_heights <- rel_heights[subplots]
+  plot_grid(plotlist = plotlist, ncol = 1, align = "v", rel_heights = rel_heights)
+}
+
+Figure4B_left <- gseaplot(x = Figure_4B_data, 
+                              geneSetID = c("GENTLES_LEUKEMIC_STEM_CELL_UP",
+                                            "EPPERT_CE_HSC_LSC",
+                                            "JAATINEN_HEMATOPOIETIC_STEM_CELL_UP",
+                                            "GRAHAM_CML_DIVIDING_VS_NORMAL_QUIESCENT_DN"),
+                              pvalue_table = FALSE,rel_heights = c(2.25,0.25,1),
+                              color = c("#929292", "#1e1e1e", "#c9c9c9", "#4b4b4b"), ES_geom = "line", 
+                              #lp1 = 0.75,
+                              lp1 = 0.35,
+                              lp2 = 0.20, linesize = 1.4, labsize = 10)
+
+png(filename = "Figure4B_left.png", width = 7, height = 4, units = "in", res = 250)
+Figure4B_left
+dev.off()
+
+svg(filename = "Figure4B_left.svg", width = 7, height = 4)
+Figure4B_left
+dev.off()
+
+Figure4B_right <- gseaplot(x = Figure_4B_data, 
+                    geneSetID = c("GAL_LEUKEMIC_STEM_CELL_DN",
+                                  "ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER",
+                                  "JAATINEN_HEMATOPOIETIC_STEM_CELL_DN",
+                                  "BROWN_MYELOID_CELL_DEVELOPMENT_UP"),
+                    pvalue_table = FALSE,rel_heights = c(2.25,0.25,1),
+                    color = c("#929292", "#1e1e1e", "#c9c9c9", "#4b4b4b"), ES_geom = "line", 
+                    lp1 = 0.63, 
+                    lp2 = 0.9, 
+                    linesize = 1.2, labsize = 10)
+
+png(filename = "Figure4B_right.png", width = 7, height = 4, units = "in", res = 250)
+Figure4B_right
+dev.off()
+
+svg(filename = "Figure4B_right.svg", width = 7, height = 4)
+Figure4B_right
+dev.off()
+
+
+
--- a/BulkRNAseq/Figure_4B_data.rds
+++ b/BulkRNAseq/Figure_4B_data.rds
--- a/BulkRNAseq/Figure_4B_fun.R
+++ b/BulkRNAseq/Figure_4B_fun.R
+Figure_4B_fun <- function (x, geneSetID, title = "", color = "green", base_size = 11, 
+          rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = FALSE, 
+          ES_geom = "line", lp1 = 0.2, lp2 = 0.2, linesize = 1.5, labsize = 7) 
+{
+  ES_geom <- match.arg(ES_geom, c("line", "dot"))
+  geneList <- position <- NULL
+  if (length(geneSetID) == 1) {
+    gsdata <- gsInfo(x, geneSetID)
+  }
+  else {
+    gsdata <- do.call(rbind, lapply(geneSetID, gsInfo, object = x))
+  }
+  p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + theme_classic(base_size) + 
+    theme(panel.grid.major = element_line(colour = "grey92"), 
+          panel.grid.minor = element_line(colour = "grey92"), 
+          panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + 
+    scale_x_continuous(expand = c(0, 0))
+  if (ES_geom == "line") {
+    es_layer <- geom_line(aes_(y = ~runningScore, color = ~Description), 
+                          size = linesize)
+  }
+  else {
+    es_layer <- geom_point(aes_(y = ~runningScore, color = ~Description), 
+                           size = 1, data = subset(gsdata, position == 1))
+  }
+  p.res <- p + es_layer + theme(legend.position = c(lp1, lp2), legend.text = element_text(size = labsize),
+                                legend.title = element_blank(), legend.background = element_rect(fill = "transparent"))
+  p.res <- p.res + ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), 
+                                                            axis.title.y = element_text(size = 8),
+                                                            axis.text.y.left = element_text(size = 8),
+                                                            axis.ticks.x = element_blank(), axis.line.x = element_blank(), 
+                                                            plot.margin = margin(t = 0.2, r = 0.2, b = 0, l = 0.2, 
+                                                                                 unit = "cm"))
+  i <- 0
+  for (term in unique(gsdata$Description)) {
+    idx <- which(gsdata$ymin != 0 & gsdata$Description == 
+                   term)
+    gsdata[idx, "ymin"] <- i
+    gsdata[idx, "ymax"] <- i + 1
+    i <- i + 1
+  }
+  p2 <- ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, 
+                                                           ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) + 
+    theme_classic(base_size) + theme(legend.position = "none", 
+                                     plot.margin = margin(t = -0.1, b = 0, unit = "cm"), axis.ticks = element_blank(), 
+                                     axis.text = element_blank(), axis.line.x = element_blank()) + 
+    scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 
+                                                                         0))
+  if (length(geneSetID) == 1) {
+    v <- seq(1, sum(gsdata$position), length.out = 9)
+    inv <- findInterval(rev(cumsum(gsdata$position)), v)
+    if (min(inv) == 0) 
+      inv <- inv + 1
+    col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
+    ymin <- min(p2$data$ymin)
+    yy <- max(p2$data$ymax - p2$data$ymin) * 0.3
+    xmin <- which(!duplicated(inv))
+    xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
+    d <- data.frame(ymin = ymin, ymax = yy, xmin = xmin, 
+                    xmax = xmax, col = col[unique(inv)])
+    p2 <- p2 + geom_rect(aes_(xmin = ~xmin, xmax = ~xmax, 
+                              ymin = ~ymin, ymax = ~ymax, fill = ~I(col)), data = d, 
+                         alpha = 0.9, inherit.aes = FALSE)
+  }
+  df2 <- p$data
+  df2$y <- p$data$geneList[df2$x]
+  p.pos <- p + geom_segment(data = df2, aes_(x = ~x, xend = ~x, 
+                                             y = ~y, yend = 0), color = "grey")
+  p.pos <- p.pos + ylab("Ranked List Metric") + xlab("Rank in Ordered Dataset") + 
+    theme(plot.margin = margin(t = -0.1, r = 0.2, b = 0.2, 
+                               l = 0.2, unit = "cm"), 
+          axis.title = element_text(size = 8))
+  if (!is.null(title) && !is.na(title) && title != "") 
+    p.res <- p.res + ggtitle(title)
+  if (length(color) == length(geneSetID)) {
+    p.res <- p.res + scale_color_manual(values = color)
+    if (length(color) == 1) {
+      p.res <- p.res + theme(legend.position = "none")
+      p2 <- p2 + scale_color_manual(values = "black")
+    }
+    else {
+      p2 <- p2 + scale_color_manual(values = color)
+    }
+  }
+  if (pvalue_table) {
+    pd <- x[geneSetID, c("Description", "pvalue", "p.adjust")]
+    pd <- pd[order(pd[, 1], decreasing = FALSE), ]
+    rownames(pd) <- pd$Description
+    pd <- pd[, -1]
+    pd <- round(pd, 4)
+    tp <- tableGrob2(pd, p.res)
+    p.res <- p.res + theme(legend.position = "none") + annotation_custom(tp, 
+                                                                         xmin = quantile(p.res$data$x, 0.5), xmax = quantile(p.res$data$x, 
+                                                                                                                             0.95), ymin = quantile(p.res$data$runningScore, 
+                                                                                                                                                    0.75), ymax = quantile(p.res$data$runningScore, 
+                                                                                                                                                                           0.9))
+  }
+  plotlist <- list(p.res, p2, p.pos)[subplots]
+  n <- length(plotlist)
+  plotlist[[n]] <- plotlist[[n]] + theme(axis.line.x = element_line(), 
+                                         axis.ticks.x = element_line(), axis.text.x = element_text())
+  if (length(subplots) == 1) 
+    return(plotlist[[1]] + theme(plot.margin = margin(t = 0.2, 
+                                                      r = 0.2, b = 0.2, l = 0.2, unit = "cm")))
+  if (length(rel_heights) > length(subplots)) 
+    rel_heights <- rel_heights[subplots]
+  plot_grid(plotlist = plotlist, ncol = 1, align = "v", rel_heights = rel_heights)
+}
--- a/BulkRNAseq/Figure_4C.R
+++ b/BulkRNAseq/Figure_4C.R
+library(readxl)
+library(RColorBrewer)
+library(pathfindR)
+library(pathfindR.data)
+library(dplyr)
+library(DESeq2)
+
+
+DGE_GFPLvsH <- readRDS("DGE_GFPLvsH.rds") 
+
+# Load expression matrix:
+
+# raw counts:
+rawdata <- read.table(file = "Full-gene-counts.txt") # deposited to GEO
+rawdata_m <- as.matrix(rawdata)
+samples <- colnames(rawdata)
+
+# normalized counts
+normcounts_GFPLvsH = counts(DGE_GFPLvsH[["deseq"]][["ds"]], normalized=T)
+
+# Sample info
+GFPhighsamples <- grep(".+H$", samples, value = T)
+GFPlowsamples <- grep(".+L$", samples, value = T)
+Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
+Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
+
+Post_analysis_Enrichment_GOBP_Full <- read_excel("Figure_4C_data.xlsx",
+                                                 sheet = "ALL_noduplicates", 
+                                                 col_types = c("skip","text", "text", "numeric", "numeric", "numeric", "text"))
+
+Post_analysis_Enrichment_GOBP_Full$Down_regulated <- NA
+Post_analysis_Enrichment_GOBP_Full$Down_regulated <- as.character(Post_analysis_Enrichment_GOBP_Full$Down_regulated)
+Post_analysis_Enrichment_GOBP_Full = Post_analysis_Enrichment_GOBP_Full %>% filter(Term_Description != "detoxification of inorganic compound")
+score_matrix_FULL_norm <- score_terms(enrichment_table = Post_analysis_Enrichment_GOBP_Full,
+                                      exp_mat = normcounts_GFPLvsH,
+                                      cases = GFPlowsamples, 
+                                      use_description = TRUE, # default FALSE
+                                      label_samples = FALSE, # default = TRUE
+                                      case_title = "GFP_low",  # default = "Case"
+                                      control_title = "GFP_high", # default = "Control"
+                                      low = "#f7797d", # default = "green"
+                                      mid = "#fffde4", # default = "black"
+                                      high = "#1f4037") # default = "red"
+
+score_matrix_FULL <- score_terms(enrichment_table = Post_analysis_Enrichment_GOBP_Full,
+                                      exp_mat = rawdata_m,
+                                      cases = GFPlowsamples, 
+                                      use_description = TRUE, # default FALSE
+                                      label_samples = FALSE, # default = TRUE
+                                      case_title = "GFP_low",  # default = "Case"
+                                      control_title = "GFP_high", # default = "Control"
+                                      low = "#f7797d", # default = "green"
+                                      mid = "#fffde4", # default = "black"
+                                      high = "#1f4037") # default = "red"
+
+colorsZ2 <- c(min(score_matrix_FULL_norm),seq(-2,2,by=0.01),max(score_matrix_FULL_norm))
+my_palette <- c("Darkblue", colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2)-3), "Darkred")
+Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
+Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
+Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
+SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
+SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
+SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
+
+colorsZ2bis <- c(seq(-2,2,by=0.1))
+my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
+
+callback = function(hc, score_matrix_FULL_norm){
+  sv = svd(t(score_matrix_FULL_norm))$v[,1]
+  dend = reorder(as.dendrogram(hc), wts = sv)
+  as.hclust(dend)
+}
+
+png(filename = "Figure_4C.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_FULL_norm, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 2, cutree_rows = 3)
+dev.off()
+
+png(filename = "Figure_4C_callback.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_FULL_norm, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 2, cutree_rows = 3, clustering_callback = callback)
+dev.off()
+
+
--- a/BulkRNAseq/Figure_4CDE_S4DE_annots.rds
+++ b/BulkRNAseq/Figure_4CDE_S4DE_annots.rds
--- a/BulkRNAseq/Figure_4CDE_S4DE_colors.rds
+++ b/BulkRNAseq/Figure_4CDE_S4DE_colors.rds
--- a/BulkRNAseq/Figure_4C_data.xlsx
+++ b/BulkRNAseq/Figure_4C_data.xlsx
--- a/BulkRNAseq/Figure_4D.R
+++ b/BulkRNAseq/Figure_4D.R
+# BulkRNAseq data analysis with PathfindR:
+library(pathfindR)
+library(pathfindR.data)
+library(dplyr)
+library(DESeq2)
+
+
+# Load expression matrix:
+
+rawdata <- read.table(file = "Full-gene-counts.txt") # GEO data
+rawdata_m <- as.matrix(rawdata)
+samples <- colnames(rawdata)
+
+# Sample info
+GFPhighsamples <- grep(".+H$", samples, value = T)
+GFPlowsamples <- grep(".+L$", samples, value = T)
+Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
+Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
+
+LowOnly_TvsC <- read_excel("Figure_4D_data.xlsx")
+LowOnly_TvsC$Down_regulated <- NA
+LowOnly_TvsC$Down_regulated <- as.character(LowOnly_TvsC$Down_regulated)
+
+# load DGE analysis
+
+DGE = readRDS("20210804_202201_DGE.rds")
+normcounts = counts(DGE[["deseq"]][["ds"]], normalized=T)
+score_matrix_LowOnly_TvsC <- score_terms(enrichment_table = LowOnly_TvsC,
+                                         exp_mat = normcounts[,GFPlowsamples],
+                                         cases = intersect(GFPlowsamples, Treatedsamples), 
+                                         use_description = TRUE, # default FALSE
+                                         label_samples = FALSE, # default = TRUE
+                                         case_title = "Treated",  # default = "Case"
+                                         control_title = "Control", # default = "Control"
+                                         low = "#f7797d", # default = "green"
+                                         mid = "#fffde4", # default = "black"
+                                         high = "#1f4037") # default = "red"
+
+Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
+Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
+Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
+SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
+SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
+SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
+
+colorsZ2bis <- c(seq(-2,2,by=0.1))
+my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
+
+png(filename = "Figure_4D.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_LowOnly_TvsC, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 1, cutree_rows = 1)
+dev.off()
+
--- a/BulkRNAseq/Figure_4D_data.xlsx
+++ b/BulkRNAseq/Figure_4D_data.xlsx
--- a/BulkRNAseq/Figure_4E.R
+++ b/BulkRNAseq/Figure_4E.R
+library(pathfindR)
+library(pathfindR.data)
+library(dplyr)
+library(DESeq2)
+
+rawdata <- read.table(file = "Full-gene-counts.txt") # GEO data
+rawdata_m <- as.matrix(rawdata)
+samples <- colnames(rawdata)
+
+# Sample info
+GFPhighsamples <- grep(".+H$", samples, value = T)
+GFPlowsamples <- grep(".+L$", samples, value = T)
+Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
+Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
+
+dge <- readRDS("20210804_233137_DGE.rds") # TvsC analysis
+normcountsTvsC = counts(dge[["deseq"]][["ds"]], normalized=T)
+
+Post_analysis_Enrichment_TvsC_senescence_Full <- read_excel("Figure_4E_data.xlsx")
+
+Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated <- NA
+Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated <- as.character(Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated)
+
+score_matrix_FULL_norm_TvsC <- score_terms(enrichment_table = Post_analysis_Enrichment_TvsC_senescence_Full,
+                                           exp_mat = normcountsTvsC,
+                                           cases = Treatedsamples, 
+                                           use_description = TRUE, # default FALSE
+                                           label_samples = FALSE, # default = TRUE
+                                           case_title = "Treated",  # default = "Case"
+                                           control_title = "Control", # default = "Control"
+                                           low = "#f7797d", # default = "green"
+                                           mid = "#fffde4", # default = "black"
+                                           high = "#1f4037") # default = "red"
+
+png(filename = "Figure_4E.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_FULL_norm_TvsC, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 1, cutree_rows = 1)
+dev.off()
+
+callback = function(hc, score_matrix_FULL_norm_TvsC){
+  sv = svd(t(score_matrix_FULL_norm_TvsC))$v[,1]
+  dend = reorder(as.dendrogram(hc), wts = sv)
+  as.hclust(dend)
+}
+
+png(filename = "Figure_4E_callback.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_FULL_norm_TvsC, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 1, cutree_rows = 1, clustering_callback = callback)
+dev.off()
+
+score_matrix_FULL_norm_TvsC_filt <- score_terms(enrichment_table = Post_analysis_Enrichment_TvsC_senescence_Full %>% filter(!Term_Description %in% c("RHEIN_ALL_GLUCOCORTICOID_THERAPY_UP",
+                                                                                                                                                     "KAN_RESPONSE_TO_ARSENIC_TRIOXIDE",
+                                                                                                                                                     "CHIARETTI_T_ALL_REFRACTORY_TO_THERAPY")),
+                                                exp_mat = normcountsTvsC,
+                                                cases = Treatedsamples, 
+                                                use_description = TRUE, # default FALSE
+                                                label_samples = FALSE, # default = TRUE
+                                                case_title = "Treated",  # default = "Case"
+                                                control_title = "Control", # default = "Control"
+                                                low = "#f7797d", # default = "green"
+                                                mid = "#fffde4", # default = "black"
+                                                high = "#1f4037") # default = "red"
+
+
+png(filename = "Figure_4E_filtered.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_FULL_norm_TvsC_filt, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 1, cutree_rows = 1, clustering_callback = callback)
+dev.off()
+
--- a/BulkRNAseq/Figure_4E_data.xlsx
+++ b/BulkRNAseq/Figure_4E_data.xlsx
--- a/BulkRNAseq/Figure_8A_8B.R
+++ b/BulkRNAseq/Figure_8A_8B.R
+suppressPackageStartupMessages(library('GOSemSim')) # embedd the simplify function for performing recuction
+suppressPackageStartupMessages(library('ade4')) # for further development
+suppressPackageStartupMessages(library('pheatmap'))
+suppressPackageStartupMessages(library('RColorBrewer'))
+suppressPackageStartupMessages(library('clusterProfiler'))
+suppressPackageStartupMessages(library('ComplexHeatmap')) # allows to simply create existence matrix
+
+postAnalysisClustering <- function(postanalysisrds, simplify.cutoff = 0.7, analysis, outfolder){
+    brewerpalettecode = "YlGnBu"
+   # brewerpalettecode = "RdBu"
+    # Get data from rdsfile if set
+    myres <- list()
+    if(!is.null(postanalysisrds)) {
+        gset_df <- NULL
+        postanalysis <- readRDS(postanalysisrds) # load rds from post analysis folder (pipeline)
+        # collect all enrichments optionally fetched by pattern estabilished with enrichset variable
+        # default: perform for all enrichments
+	if(analysis == "enrichment"){
+	mypattern = "enr"
+	}
+	else{
+	mypattern = "gsea"
+	}
+        gset_val <- grep(x = names(postanalysis$results[[analysis]]), pattern = mypattern, value = T) 
+        # print all sets
+        print(gset_val)
+        # cycling over all sets
+        for(eset in gset_val) {
+            simplify.cutoff.bis <- NULL
+            simplyfication <- FALSE
+            refining <- FALSE
+            print(paste0("Analyzing: ", eset))
+            gset_obj <- postanalysis$results[[analysis]][[eset]]
+            print(paste0("Size: ", nrow(as.data.frame(gset_obj))))
+            if(nrow(as.data.frame(gset_obj)) > 80) {
+                if(length(grep(pattern = "GO", x = eset, value = T)) > 0) {
+                    simplyfication <- TRUE
+                    gset_df <- as.data.frame(simplify(x = gset_obj, cutoff = simplify.cutoff, by = "p.adjust", measure = "Wang"))
+                    simplify.cutoff.bis <- simplify.cutoff
+                    while(nrow(gset_df) > 70 & simplify.cutoff.bis > 0.2) {
+                        refining <- TRUE
+                        simplify.cutoff.bis <- simplify.cutoff.bis - 0.05
+                        print(paste0("Reducing kappa cut-off to: ", simplify.cutoff.bis))
+                        gset_df <- as.data.frame(simplify(x = gset_obj, cutoff = simplify.cutoff.bis, by = "p.adjust", measure = "Wang"))
+                    }
+                    print(paste0("test ", simplify.cutoff))
+                    if(nrow(as.data.frame(gset_df)) > 90) {
+                        print(paste0("Too many terms in ", eset))
+                        next
+                    }
+                } else {
+                    if(nrow(as.data.frame(gset_obj)) > 90) {
+                        print(paste0("Too many terms in ", eset))
+                        next
+                    }
+                }
+            } else {
+                if(nrow(as.data.frame(gset_obj)) > 2) {
+                    gset_df <- as.data.frame(gset_obj)
+                } else {
+                    print(paste0("No terms in ", eset))
+                    next
+                }
+            }
+            gset_df$extra <- paste(gset_df$ID, gset_df$Description, sep = "_")
+            if(analysis == "enrichment") {
+                gset <- subset(gset_df, select = c("extra","geneID", "qvalue"))
+                gset$geneID <- gsub(pattern = "\\/", replacement = " ", perl = T, x = gset$geneID)
+            } else {
+                gset <- subset(gset_df, select = c("extra","core_enrichment", "qvalues"))
+                gset$core_enrichment <- gsub(pattern = "\\/", replacement = " ", perl = T, x = gset$core_enrichment)
+            }
+            colnames(gset) <- c("GeneSet", "Genelist", "q-values")
+            genelist <- NULL
+            for(i in 1:nrow(gset)) {
+                genelist[[rownames(gset)[i]]] <- strsplit(gset$Genelist[[i]], " ")[[1]]
+            }
+            names(genelist) <- gset$GeneSet
+            # creating existence matrix
+            existence_matrix <- list_to_matrix(lt = genelist)
+            colnames(existence_matrix) <- unlist(gset$GeneSet)
+            # transpose it
+            existence_matrix_t <- t(existence_matrix)
+            # initialize lists for indicence matrices and kappa score matrix
+            xtab <- list()
+            kscore <- list()
+            # create incidence matrices (term vs term)
+            for(i in 1:nrow(existence_matrix_t)) {
+                for(j in 1:nrow(existence_matrix_t)) {
+                    if(sum(abs(existence_matrix_t[i,] - existence_matrix_t[j,])) == 0) {
+                        kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- 1
+                    } else {
+                        xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- as.table(table(factor(existence_matrix_t[i,], c(0,1)),
+                                                                                                                     factor(existence_matrix_t[j,], c(0,1))))
+                        if(i == j) {
+                            kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- 1
+                        } else {
+                            diagonal.counts <- diag(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]])
+                            N <- sum(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]])
+                            row.marginal.props <- rowSums(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]]) / N
+                            col.marginal.props <- colSums(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]]) / N
+                            # Compute kappa (k)
+                            Po <- sum(diagonal.counts) / N
+                            Pe <- sum(row.marginal.props*col.marginal.props)
+                            k <- (Po - Pe)/(1 - Pe)
+                            kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- k
+                        }
+                    }
+                }
+            }
+            kscore_df <- do.call(rbind.data.frame, kscore)
+            if(isFALSE(simplyfication) & isFALSE(refining)){
+                s.cutoff <- "no_simplify"
+            }
+            if(isTRUE(simplyfication) & isFALSE(refining)){
+                s.cutoff <- simplify.cutoff
+            }
+            if(isTRUE(simplyfication) & isTRUE(refining)){
+                s.cutoff <- simplify.cutoff.bis
+            }
+	    if(nrow(kscore_df) > 6){
+            png(paste(outfolder, paste(eset, s.cutoff, "kscore", "hm.png", sep = "_"), sep = "/"), width = 12, height = 12, units = "in", res = 300)
+            print(pheatmap::pheatmap(mat = kscore_df, color = brewer.pal(n = 9, name = brewerpalettecode),
+                     show_rownames = TRUE, show_colnames = FALSE,
+                     fontsize = 5 ,cellwidth = 5, cellheight = 5,
+                     width = 12, height = 12, main = paste0("kscore heatmap ", eset, "\nCut-off: ", s.cutoff)))
+             dev.off()
+	    }
+            myres[[eset]] <- list(genelist = genelist,
+                                  existence_mt = existence_matrix_t,
+                                  kappascore_mt = kscore_df,
+                                  incidence_mt = xtab,
+                                  geneset = gset_df,
+                                  simplify_cutoff = s.cutoff)
+        }
+    }
+    return(myres)
+}
+
+#####################################
+### post-analysis-clustering.R
+### Arg1: *-post-analysis.rds -> post-analysis rds file
+### Arg2: out.dir -> output directory
+### Arg3: contrast -> contrast in the format cond1-cond2
+####################################
+
+args <- commandArgs(trailingOnly = TRUE)
+
+## Post-analysis rds file
+post_obj_file <- args[1]
+## Output root directory
+out.dir <- args[2]
+## Contrast
+contrast <- args[3]
+
+cat("#################################", "\n")
+cat("### Cluster-analysis: ", contrast, "\n")
+cat("#################################", "\n")
+
+cat("# Output directory:", out.dir, "\n")
+
+myres_enrich <- postAnalysisClustering(postanalysisrds = post_obj_file,
+                                       simplify.cutoff = 0.8,
+                                       analysis = "enrichment",
+                                       outfolder = out.dir)
+myres_gsea <- postAnalysisClustering(postanalysisrds = post_obj_file,
+                                     simplify.cutoff = 0.8,
+                                     analysis = "gsea",
+                                     outfolder = out.dir)
+myres <- list("enrichment" = myres_enrich, "gsea" = myres_gsea)
+saveRDS(object = myres, file = paste(out.dir, paste(contrast, "cluster", "analysis.rds", sep = "-"), sep = "/"))
--- a/BulkRNAseq/Figure_8A_8B_data.rds
+++ b/BulkRNAseq/Figure_8A_8B_data.rds