Commit 01d2c18d authored by Matteo Barcella's avatar Matteo Barcella
Browse files

Bulk RNAseq related scripts

parent 5612725c
## PCA
library(ggplot2)
library(ggforce)
library(openxlsx)
P <- read.xlsx("Figure_4A_data.xlsx", sheet = 1, rowNames = T)
svg(filename = "PCAplot_1000genes.svg", width = 6, height = 4.5)
ggplot(data = P, aes(x = PC1, y = PC2)) +
geom_point(aes(color = Gate, shape = Treatment), size = 3) +
ggtitle(label = "Principal Component Analysis (PCA)",
subtitle = "Top 1000 genes") +
scale_color_manual(values = c(GFP_LOW = "#4aab23", GFP_HIGH = "#515151")) +
scale_shape_manual(values = c(Control = 2, Treated = 19)) +
# stat_ellipse(aes(group = Patient), level = 0.975, linetype = 2) +
geom_mark_ellipse(aes(label = Patient, group = Patient), con.border = "none", con.type = "elbow", linetype = 2) +
xlab("PC1: 31% variance") + ylab("PC2: 21% variance") +
theme_bw() + theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12),
plot.title = element_text(size = 12))
dev.off()
Q <- read.xlsx("Figure_4A_data.xlsx", sheet = 2, rowNames = T)
svg(filename = "PCAplot_1000genes_batch_corr.svg", width = 6, height = 4.5)
ggplot(data = Q, aes(x = PC1, y = PC2)) +
geom_point(aes(color = Gate, shape = Treatment), size = 3) +
ggtitle(label = "Principal Component Analysis (PCA)",
subtitle = "Top 1000 genes, batch corrected") +
scale_color_manual(values = c(GFP_LOW = "#4aab23", GFP_HIGH = "#515151")) +
scale_shape_manual(values = c(Control = 2, Treated = 19)) +
xlab("PC1: 24% variance") + ylab("PC2: 14% variance") +
theme_bw() + theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12),
plot.title = element_text(size = 12))
dev.off()
# Custom randomWalks figure 4B
suppressPackageStartupMessages(library('dplyr'))
suppressPackageStartupMessages(library('clusterProfiler'))
suppressPackageStartupMessages(library('png'))
suppressPackageStartupMessages(library('enrichplot'))
suppressPackageStartupMessages(library('DOSE'))
suppressPackageStartupMessages(library('httr'))
suppressPackageStartupMessages(library('magrittr'))
suppressPackageStartupMessages(library('ggrepel'))
suppressPackageStartupMessages(library('cowplot'))
suppressPackageStartupMessages(library('RColorBrewer'))
suppressPackageStartupMessages(library('enrichplot'))
suppressPackageStartupMessages(library('openxlsx'))
Figure_4B_data <- readRDS(file = "Figure_4B_data.rds")
gseaScores <- getFromNamespace("gseaScores", "DOSE")
gsInfo <- function(object, geneSetID) {
geneList <- object@geneList
if (is.numeric(geneSetID))
geneSetID <- object@result[geneSetID, "ID"]
geneSet <- object@geneSets[[geneSetID]]
exponent <- object@params[["exponent"]]
df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
df$ymin=0
df$ymax=0
pos <- df$position == 1
h <- diff(range(df$runningScore))/20
df$ymin[pos] <- -h
df$ymax[pos] <- h
df$geneList <- geneList
df$Description <- object@result[geneSetID, "Description"]
return(df)
}
gseaplot <- function (x, geneSetID, title = "", color = "green", base_size = 11,
rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = FALSE,
ES_geom = "line", lp1 = 0.2, lp2 = 0.2, linesize = 1.5, labsize = 7)
{
ES_geom <- match.arg(ES_geom, c("line", "dot"))
geneList <- position <- NULL
if (length(geneSetID) == 1) {
gsdata <- gsInfo(x, geneSetID)
}
else {
gsdata <- do.call(rbind, lapply(geneSetID, gsInfo, object = x))
}
p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + theme_classic(base_size) +
theme(panel.grid.major = element_line(colour = "grey92"),
panel.grid.minor = element_line(colour = "grey92"),
panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) +
scale_x_continuous(expand = c(0, 0))
if (ES_geom == "line") {
es_layer <- geom_line(aes_(y = ~runningScore, color = ~Description),
size = linesize)
}
else {
es_layer <- geom_point(aes_(y = ~runningScore, color = ~Description),
size = 1, data = subset(gsdata, position == 1))
}
p.res <- p + es_layer + theme(legend.position = c(lp1,lp2),
legend.text = element_text(size = labsize),
legend.key.height = unit(0.4, 'cm'),
legend.key.width = unit(0.4, 'cm'),
#legend.key = element_rect(fill = NULL),
#legend.key = element_blank(),
legend.title = element_blank(),
legend.background = element_rect(fill = "transparent"))
p.res <- p.res + ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(),
axis.title.y = element_text(size = 10),
axis.text.y.left = element_text(size = 10),
axis.ticks.x = element_blank(), axis.line.x = element_blank(),
plot.margin = margin(t = 0.2, r = 0.2, b = 0, l = 0.2,
unit = "cm"))
i <- 0
for (term in unique(gsdata$Description)) {
idx <- which(gsdata$ymin != 0 & gsdata$Description ==
term)
gsdata[idx, "ymin"] <- i
gsdata[idx, "ymax"] <- i + 1
i <- i + 1
}
p2 <- ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin,
ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) +
theme_classic(base_size) + theme(legend.position = "none",
plot.margin = margin(t = -0.1, b = 0, unit = "cm"), axis.ticks = element_blank(),
axis.text = element_blank(), axis.line.x = element_blank()) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0,
0))
if (length(geneSetID) == 1) {
v <- seq(1, sum(gsdata$position), length.out = 9)
inv <- findInterval(rev(cumsum(gsdata$position)), v)
if (min(inv) == 0)
inv <- inv + 1
col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
ymin <- min(p2$data$ymin)
yy <- max(p2$data$ymax - p2$data$ymin) * 0.3
xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
d <- data.frame(ymin = ymin, ymax = yy, xmin = xmin,
xmax = xmax, col = col[unique(inv)])
p2 <- p2 + geom_rect(aes_(xmin = ~xmin, xmax = ~xmax,
ymin = ~ymin, ymax = ~ymax, fill = ~I(col)), data = d,
alpha = 0.9, inherit.aes = FALSE)
}
df2 <- p$data
df2$y <- p$data$geneList[df2$x]
p.pos <- p + geom_segment(data = df2, aes_(x = ~x, xend = ~x,
y = ~y, yend = 0), color = "grey")
p.pos <- p.pos + ylab("Ranked List Metric") + xlab("Rank in Ordered Dataset") +
theme(plot.margin = margin(t = -0.1, r = 0.2, b = 0.2,
l = 0.2, unit = "cm"),
axis.title = element_text(size = 10), axis.text = element_text(size = 10))
if (!is.null(title) && !is.na(title) && title != "")
p.res <- p.res + ggtitle(title)
if (length(color) == length(geneSetID)) {
p.res <- p.res + scale_color_manual(values = color)
if (length(color) == 1) {
p.res <- p.res + theme(legend.position = "none")
p2 <- p2 + scale_color_manual(values = "black")
}
else {
p2 <- p2 + scale_color_manual(values = color)
}
}
if (pvalue_table) {
pd <- x[geneSetID, c("Description", "pvalue", "p.adjust")]
pd <- pd[order(pd[, 1], decreasing = FALSE), ]
rownames(pd) <- pd$Description
pd <- pd[, -1]
pd <- round(pd, 4)
tp <- tableGrob2(pd, p.res)
p.res <- p.res + theme(legend.position = "none")# +
annotation_custom(tp,
xmin = quantile(p.res$data$x, 0.5),
xmax = quantile(p.res$data$x, 0.95),
ymin = quantile(p.res$data$runningScore,0.75),
ymax = quantile(p.res$data$runningScore,0.9))
}
plotlist <- list(p.res, p2, p.pos)[subplots]
n <- length(plotlist)
plotlist[[n]] <- plotlist[[n]] + theme(axis.line.x = element_line(),
axis.ticks.x = element_line(), axis.text.x = element_text())
if (length(subplots) == 1)
return(plotlist[[1]] + theme(plot.margin = margin(t = 0.2,
r = 0.2, b = 0.2, l = 0.2, unit = "cm")))
if (length(rel_heights) > length(subplots))
rel_heights <- rel_heights[subplots]
plot_grid(plotlist = plotlist, ncol = 1, align = "v", rel_heights = rel_heights)
}
Figure4B_left <- gseaplot(x = Figure_4B_data,
geneSetID = c("GENTLES_LEUKEMIC_STEM_CELL_UP",
"EPPERT_CE_HSC_LSC",
"JAATINEN_HEMATOPOIETIC_STEM_CELL_UP",
"GRAHAM_CML_DIVIDING_VS_NORMAL_QUIESCENT_DN"),
pvalue_table = FALSE,rel_heights = c(2.25,0.25,1),
color = c("#929292", "#1e1e1e", "#c9c9c9", "#4b4b4b"), ES_geom = "line",
#lp1 = 0.75,
lp1 = 0.35,
lp2 = 0.20, linesize = 1.4, labsize = 10)
png(filename = "Figure4B_left.png", width = 7, height = 4, units = "in", res = 250)
Figure4B_left
dev.off()
svg(filename = "Figure4B_left.svg", width = 7, height = 4)
Figure4B_left
dev.off()
Figure4B_right <- gseaplot(x = Figure_4B_data,
geneSetID = c("GAL_LEUKEMIC_STEM_CELL_DN",
"ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER",
"JAATINEN_HEMATOPOIETIC_STEM_CELL_DN",
"BROWN_MYELOID_CELL_DEVELOPMENT_UP"),
pvalue_table = FALSE,rel_heights = c(2.25,0.25,1),
color = c("#929292", "#1e1e1e", "#c9c9c9", "#4b4b4b"), ES_geom = "line",
lp1 = 0.63,
lp2 = 0.9,
linesize = 1.2, labsize = 10)
png(filename = "Figure4B_right.png", width = 7, height = 4, units = "in", res = 250)
Figure4B_right
dev.off()
svg(filename = "Figure4B_right.svg", width = 7, height = 4)
Figure4B_right
dev.off()
Figure_4B_fun <- function (x, geneSetID, title = "", color = "green", base_size = 11,
rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = FALSE,
ES_geom = "line", lp1 = 0.2, lp2 = 0.2, linesize = 1.5, labsize = 7)
{
ES_geom <- match.arg(ES_geom, c("line", "dot"))
geneList <- position <- NULL
if (length(geneSetID) == 1) {
gsdata <- gsInfo(x, geneSetID)
}
else {
gsdata <- do.call(rbind, lapply(geneSetID, gsInfo, object = x))
}
p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + theme_classic(base_size) +
theme(panel.grid.major = element_line(colour = "grey92"),
panel.grid.minor = element_line(colour = "grey92"),
panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) +
scale_x_continuous(expand = c(0, 0))
if (ES_geom == "line") {
es_layer <- geom_line(aes_(y = ~runningScore, color = ~Description),
size = linesize)
}
else {
es_layer <- geom_point(aes_(y = ~runningScore, color = ~Description),
size = 1, data = subset(gsdata, position == 1))
}
p.res <- p + es_layer + theme(legend.position = c(lp1, lp2), legend.text = element_text(size = labsize),
legend.title = element_blank(), legend.background = element_rect(fill = "transparent"))
p.res <- p.res + ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(),
axis.title.y = element_text(size = 8),
axis.text.y.left = element_text(size = 8),
axis.ticks.x = element_blank(), axis.line.x = element_blank(),
plot.margin = margin(t = 0.2, r = 0.2, b = 0, l = 0.2,
unit = "cm"))
i <- 0
for (term in unique(gsdata$Description)) {
idx <- which(gsdata$ymin != 0 & gsdata$Description ==
term)
gsdata[idx, "ymin"] <- i
gsdata[idx, "ymax"] <- i + 1
i <- i + 1
}
p2 <- ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin,
ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) +
theme_classic(base_size) + theme(legend.position = "none",
plot.margin = margin(t = -0.1, b = 0, unit = "cm"), axis.ticks = element_blank(),
axis.text = element_blank(), axis.line.x = element_blank()) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0,
0))
if (length(geneSetID) == 1) {
v <- seq(1, sum(gsdata$position), length.out = 9)
inv <- findInterval(rev(cumsum(gsdata$position)), v)
if (min(inv) == 0)
inv <- inv + 1
col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
ymin <- min(p2$data$ymin)
yy <- max(p2$data$ymax - p2$data$ymin) * 0.3
xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
d <- data.frame(ymin = ymin, ymax = yy, xmin = xmin,
xmax = xmax, col = col[unique(inv)])
p2 <- p2 + geom_rect(aes_(xmin = ~xmin, xmax = ~xmax,
ymin = ~ymin, ymax = ~ymax, fill = ~I(col)), data = d,
alpha = 0.9, inherit.aes = FALSE)
}
df2 <- p$data
df2$y <- p$data$geneList[df2$x]
p.pos <- p + geom_segment(data = df2, aes_(x = ~x, xend = ~x,
y = ~y, yend = 0), color = "grey")
p.pos <- p.pos + ylab("Ranked List Metric") + xlab("Rank in Ordered Dataset") +
theme(plot.margin = margin(t = -0.1, r = 0.2, b = 0.2,
l = 0.2, unit = "cm"),
axis.title = element_text(size = 8))
if (!is.null(title) && !is.na(title) && title != "")
p.res <- p.res + ggtitle(title)
if (length(color) == length(geneSetID)) {
p.res <- p.res + scale_color_manual(values = color)
if (length(color) == 1) {
p.res <- p.res + theme(legend.position = "none")
p2 <- p2 + scale_color_manual(values = "black")
}
else {
p2 <- p2 + scale_color_manual(values = color)
}
}
if (pvalue_table) {
pd <- x[geneSetID, c("Description", "pvalue", "p.adjust")]
pd <- pd[order(pd[, 1], decreasing = FALSE), ]
rownames(pd) <- pd$Description
pd <- pd[, -1]
pd <- round(pd, 4)
tp <- tableGrob2(pd, p.res)
p.res <- p.res + theme(legend.position = "none") + annotation_custom(tp,
xmin = quantile(p.res$data$x, 0.5), xmax = quantile(p.res$data$x,
0.95), ymin = quantile(p.res$data$runningScore,
0.75), ymax = quantile(p.res$data$runningScore,
0.9))
}
plotlist <- list(p.res, p2, p.pos)[subplots]
n <- length(plotlist)
plotlist[[n]] <- plotlist[[n]] + theme(axis.line.x = element_line(),
axis.ticks.x = element_line(), axis.text.x = element_text())
if (length(subplots) == 1)
return(plotlist[[1]] + theme(plot.margin = margin(t = 0.2,
r = 0.2, b = 0.2, l = 0.2, unit = "cm")))
if (length(rel_heights) > length(subplots))
rel_heights <- rel_heights[subplots]
plot_grid(plotlist = plotlist, ncol = 1, align = "v", rel_heights = rel_heights)
}
library(readxl)
library(RColorBrewer)
library(pathfindR)
library(pathfindR.data)
library(dplyr)
library(DESeq2)
DGE_GFPLvsH <- readRDS("DGE_GFPLvsH.rds")
# Load expression matrix:
# raw counts:
rawdata <- read.table(file = "Full-gene-counts.txt") # deposited to GEO
rawdata_m <- as.matrix(rawdata)
samples <- colnames(rawdata)
# normalized counts
normcounts_GFPLvsH = counts(DGE_GFPLvsH[["deseq"]][["ds"]], normalized=T)
# Sample info
GFPhighsamples <- grep(".+H$", samples, value = T)
GFPlowsamples <- grep(".+L$", samples, value = T)
Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
Post_analysis_Enrichment_GOBP_Full <- read_excel("Figure_4C_data.xlsx",
sheet = "ALL_noduplicates",
col_types = c("skip","text", "text", "numeric", "numeric", "numeric", "text"))
Post_analysis_Enrichment_GOBP_Full$Down_regulated <- NA
Post_analysis_Enrichment_GOBP_Full$Down_regulated <- as.character(Post_analysis_Enrichment_GOBP_Full$Down_regulated)
Post_analysis_Enrichment_GOBP_Full = Post_analysis_Enrichment_GOBP_Full %>% filter(Term_Description != "detoxification of inorganic compound")
score_matrix_FULL_norm <- score_terms(enrichment_table = Post_analysis_Enrichment_GOBP_Full,
exp_mat = normcounts_GFPLvsH,
cases = GFPlowsamples,
use_description = TRUE, # default FALSE
label_samples = FALSE, # default = TRUE
case_title = "GFP_low", # default = "Case"
control_title = "GFP_high", # default = "Control"
low = "#f7797d", # default = "green"
mid = "#fffde4", # default = "black"
high = "#1f4037") # default = "red"
score_matrix_FULL <- score_terms(enrichment_table = Post_analysis_Enrichment_GOBP_Full,
exp_mat = rawdata_m,
cases = GFPlowsamples,
use_description = TRUE, # default FALSE
label_samples = FALSE, # default = TRUE
case_title = "GFP_low", # default = "Case"
control_title = "GFP_high", # default = "Control"
low = "#f7797d", # default = "green"
mid = "#fffde4", # default = "black"
high = "#1f4037") # default = "red"
colorsZ2 <- c(min(score_matrix_FULL_norm),seq(-2,2,by=0.01),max(score_matrix_FULL_norm))
my_palette <- c("Darkblue", colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2)-3), "Darkred")
Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
colorsZ2bis <- c(seq(-2,2,by=0.1))
my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
callback = function(hc, score_matrix_FULL_norm){
sv = svd(t(score_matrix_FULL_norm))$v[,1]
dend = reorder(as.dendrogram(hc), wts = sv)
as.hclust(dend)
}
png(filename = "Figure_4C.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_FULL_norm, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 2, cutree_rows = 3)
dev.off()
png(filename = "Figure_4C_callback.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_FULL_norm, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 2, cutree_rows = 3, clustering_callback = callback)
dev.off()
# BulkRNAseq data analysis with PathfindR:
library(pathfindR)
library(pathfindR.data)
library(dplyr)
library(DESeq2)
# Load expression matrix:
rawdata <- read.table(file = "Full-gene-counts.txt") # GEO data
rawdata_m <- as.matrix(rawdata)
samples <- colnames(rawdata)
# Sample info
GFPhighsamples <- grep(".+H$", samples, value = T)
GFPlowsamples <- grep(".+L$", samples, value = T)
Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
LowOnly_TvsC <- read_excel("Figure_4D_data.xlsx")
LowOnly_TvsC$Down_regulated <- NA
LowOnly_TvsC$Down_regulated <- as.character(LowOnly_TvsC$Down_regulated)
# load DGE analysis
DGE = readRDS("20210804_202201_DGE.rds")
normcounts = counts(DGE[["deseq"]][["ds"]], normalized=T)
score_matrix_LowOnly_TvsC <- score_terms(enrichment_table = LowOnly_TvsC,
exp_mat = normcounts[,GFPlowsamples],
cases = intersect(GFPlowsamples, Treatedsamples),
use_description = TRUE, # default FALSE
label_samples = FALSE, # default = TRUE
case_title = "Treated", # default = "Case"
control_title = "Control", # default = "Control"
low = "#f7797d", # default = "green"
mid = "#fffde4", # default = "black"
high = "#1f4037") # default = "red"
Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
colorsZ2bis <- c(seq(-2,2,by=0.1))
my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
png(filename = "Figure_4D.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_LowOnly_TvsC, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 1, cutree_rows = 1)
dev.off()
library(pathfindR)
library(pathfindR.data)
library(dplyr)
library(DESeq2)
rawdata <- read.table(file = "Full-gene-counts.txt") # GEO data
rawdata_m <- as.matrix(rawdata)
samples <- colnames(rawdata)
# Sample info
GFPhighsamples <- grep(".+H$", samples, value = T)
GFPlowsamples <- grep(".+L$", samples, value = T)
Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
dge <- readRDS("20210804_233137_DGE.rds") # TvsC analysis
normcountsTvsC = counts(dge[["deseq"]][["ds"]], normalized=T)
Post_analysis_Enrichment_TvsC_senescence_Full <- read_excel("Figure_4E_data.xlsx")
Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated <- NA
Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated <- as.character(Post_analysis_Enrichment_TvsC_senescence_Full$Down_regulated)
score_matrix_FULL_norm_TvsC <- score_terms(enrichment_table = Post_analysis_Enrichment_TvsC_senescence_Full,
exp_mat = normcountsTvsC,
cases = Treatedsamples,
use_description = TRUE, # default FALSE
label_samples = FALSE, # default = TRUE
case_title = "Treated", # default = "Case"
control_title = "Control", # default = "Control"
low = "#f7797d", # default = "green"
mid = "#fffde4", # default = "black"
high = "#1f4037") # default = "red"
png(filename = "Figure_4E.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_FULL_norm_TvsC, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 1, cutree_rows = 1)
dev.off()
callback = function(hc, score_matrix_FULL_norm_TvsC){
sv = svd(t(score_matrix_FULL_norm_TvsC))$v[,1]
dend = reorder(as.dendrogram(hc), wts = sv)
as.hclust(dend)
}
png(filename = "Figure_4E_callback.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_FULL_norm_TvsC, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 1, cutree_rows = 1, clustering_callback = callback)
dev.off()
score_matrix_FULL_norm_TvsC_filt <- score_terms(enrichment_table = Post_analysis_Enrichment_TvsC_senescence_Full %>% filter(!Term_Description %in% c("RHEIN_ALL_GLUCOCORTICOID_THERAPY_UP",
"KAN_RESPONSE_TO_ARSENIC_TRIOXIDE",
"CHIARETTI_T_ALL_REFRACTORY_TO_THERAPY")),
exp_mat = normcountsTvsC,
cases = Treatedsamples,
use_description = TRUE, # default FALSE
label_samples = FALSE, # default = TRUE
case_title = "Treated", # default = "Case"
control_title = "Control", # default = "Control"
low = "#f7797d", # default = "green"
mid = "#fffde4", # default = "black"
high = "#1f4037") # default = "red"
png(filename = "Figure_4E_filtered.png", width = 16, height = 9, res = 700, units = "in")
pheatmap::pheatmap(score_matrix_FULL_norm_TvsC_filt, scale = "none",
border_color = NA, fontsize_row = 8,
color = my_palette, breaks = colorsZ2bis,
show_rownames = T, annotation_names_col = T,
annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
cellwidth = 9, cellheight = 7,
cluster_rows = T, cutree_cols = 1, cutree_rows = 1, clustering_callback = callback)
dev.off()
suppressPackageStartupMessages(library('GOSemSim')) # embedd the simplify function for performing recuction
suppressPackageStartupMessages(library('ade4')) # for further development
suppressPackageStartupMessages(library('pheatmap'))
suppressPackageStartupMessages(library('RColorBrewer'))
suppressPackageStartupMessages(library('clusterProfiler'))
suppressPackageStartupMessages(library('ComplexHeatmap')) # allows to simply create existence matrix
postAnalysisClustering <- function(postanalysisrds, simplify.cutoff = 0.7, analysis, outfolder){
brewerpalettecode = "YlGnBu"
# brewerpalettecode = "RdBu"
# Get data from rdsfile if set
myres <- list()
if(!is.null(postanalysisrds)) {
gset_df <- NULL
postanalysis <- readRDS(postanalysisrds) # load rds from post analysis folder (pipeline)
# collect all enrichments optionally fetched by pattern estabilished with enrichset variable
# default: perform for all enrichments
if(analysis == "enrichment"){
mypattern = "enr"
}
else{
mypattern = "gsea"
}
gset_val <- grep(x = names(postanalysis$results[[analysis]]), pattern = mypattern, value = T)
# print all sets
print(gset_val)
# cycling over all sets
for(eset in gset_val) {
simplify.cutoff.bis <- NULL
simplyfication <- FALSE
refining <- FALSE
print(paste0("Analyzing: ", eset))
gset_obj <- postanalysis$results[[analysis]][[eset]]
print(paste0("Size: ", nrow(as.data.frame(gset_obj))))
if(nrow(as.data.frame(gset_obj)) > 80) {
if(length(grep(pattern = "GO", x = eset, value = T)) > 0) {
simplyfication <- TRUE
gset_df <- as.data.frame(simplify(x = gset_obj, cutoff = simplify.cutoff, by = "p.adjust", measure = "Wang"))
simplify.cutoff.bis <- simplify.cutoff
while(nrow(gset_df) > 70 & simplify.cutoff.bis > 0.2) {
refining <- TRUE
simplify.cutoff.bis <- simplify.cutoff.bis - 0.05
print(paste0("Reducing kappa cut-off to: ", simplify.cutoff.bis))
gset_df <- as.data.frame(simplify(x = gset_obj, cutoff = simplify.cutoff.bis, by = "p.adjust", measure = "Wang"))
}
print(paste0("test ", simplify.cutoff))
if(nrow(as.data.frame(gset_df)) > 90) {
print(paste0("Too many terms in ", eset))
next
}
} else {
if(nrow(as.data.frame(gset_obj)) > 90) {
print(paste0("Too many terms in ", eset))
next
}
}
} else {
if(nrow(as.data.frame(gset_obj)) > 2) {
gset_df <- as.data.frame(gset_obj)
} else {
print(paste0("No terms in ", eset))
next
}
}
gset_df$extra <- paste(gset_df$ID, gset_df$Description, sep = "_")
if(analysis == "enrichment") {
gset <- subset(gset_df, select = c("extra","geneID", "qvalue"))
gset$geneID <- gsub(pattern = "\\/", replacement = " ", perl = T, x = gset$geneID)
} else {
gset <- subset(gset_df, select = c("extra","core_enrichment", "qvalues"))
gset$core_enrichment <- gsub(pattern = "\\/", replacement = " ", perl = T, x = gset$core_enrichment)
}
colnames(gset) <- c("GeneSet", "Genelist", "q-values")
genelist <- NULL
for(i in 1:nrow(gset)) {
genelist[[rownames(gset)[i]]] <- strsplit(gset$Genelist[[i]], " ")[[1]]
}
names(genelist) <- gset$GeneSet
# creating existence matrix
existence_matrix <- list_to_matrix(lt = genelist)
colnames(existence_matrix) <- unlist(gset$GeneSet)
# transpose it
existence_matrix_t <- t(existence_matrix)
# initialize lists for indicence matrices and kappa score matrix
xtab <- list()
kscore <- list()
# create incidence matrices (term vs term)
for(i in 1:nrow(existence_matrix_t)) {
for(j in 1:nrow(existence_matrix_t)) {
if(sum(abs(existence_matrix_t[i,] - existence_matrix_t[j,])) == 0) {
kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- 1
} else {
xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- as.table(table(factor(existence_matrix_t[i,], c(0,1)),
factor(existence_matrix_t[j,], c(0,1))))
if(i == j) {
kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- 1
} else {
diagonal.counts <- diag(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]])
N <- sum(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]])
row.marginal.props <- rowSums(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]]) / N
col.marginal.props <- colSums(xtab[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]]) / N
# Compute kappa (k)
Po <- sum(diagonal.counts) / N
Pe <- sum(row.marginal.props*col.marginal.props)
k <- (Po - Pe)/(1 - Pe)
kscore[[rownames(existence_matrix_t)[i]]][[rownames(existence_matrix_t)[j]]] <- k
}
}
}
}
kscore_df <- do.call(rbind.data.frame, kscore)
if(isFALSE(simplyfication) & isFALSE(refining)){
s.cutoff <- "no_simplify"
}
if(isTRUE(simplyfication) & isFALSE(refining)){
s.cutoff <- simplify.cutoff
}
if(isTRUE(simplyfication) & isTRUE(refining)){
s.cutoff <- simplify.cutoff.bis
}
if(nrow(kscore_df) > 6){
png(paste(outfolder, paste(eset, s.cutoff, "kscore", "hm.png", sep = "_"), sep = "/"), width = 12, height = 12, units = "in", res = 300)
print(pheatmap::pheatmap(mat = kscore_df, color = brewer.pal(n = 9, name = brewerpalettecode),
show_rownames = TRUE, show_colnames = FALSE,
fontsize = 5 ,cellwidth = 5, cellheight = 5,
width = 12, height = 12, main = paste0("kscore heatmap ", eset, "\nCut-off: ", s.cutoff)))
dev.off()
}
myres[[eset]] <- list(genelist = genelist,
existence_mt = existence_matrix_t,
kappascore_mt = kscore_df,
incidence_mt = xtab,
geneset = gset_df,
simplify_cutoff = s.cutoff)
}
}
return(myres)
}
#####################################
### post-analysis-clustering.R
### Arg1: *-post-analysis.rds -> post-analysis rds file
### Arg2: out.dir -> output directory
### Arg3: contrast -> contrast in the format cond1-cond2
####################################
args <- commandArgs(trailingOnly = TRUE)
## Post-analysis rds file
post_obj_file <- args[1]
## Output root directory
out.dir <- args[2]
## Contrast
contrast <- args[3]
cat("#################################", "\n")
cat("### Cluster-analysis: ", contrast, "\n")
cat("#################################", "\n")
cat("# Output directory:", out.dir, "\n")
myres_enrich <- postAnalysisClustering(postanalysisrds = post_obj_file,
simplify.cutoff = 0.8,
analysis = "enrichment",
outfolder = out.dir)
myres_gsea <- postAnalysisClustering(postanalysisrds = post_obj_file,
simplify.cutoff = 0.8,
analysis = "gsea",
outfolder = out.dir)
myres <- list("enrichment" = myres_enrich, "gsea" = myres_gsea)
saveRDS(object = myres, file = paste(out.dir, paste(contrast, "cluster", "analysis.rds", sep = "-"), sep = "/"))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment