Commit 5b7884f3 authored by Matteo Barcella's avatar Matteo Barcella
Browse files

upload DGE script

parent 2cf381d4
# DEGs analysis customized for 1vs1
# 72h vs 24h
library(edgeR)
library(ggplot2)
library(dplyr)
library(openxlsx)
library(ggrepel)
library(biomaRt)
cur.dir <- "DGE/"
data = read.table("featureCounts_results.txt",
header=T, row.names = 1)
colnames(data) <- gsub(x = gsub(x = colnames(data), pattern = "_Aligned.sortedByCoord.out.bam", replacement = ""),
pattern = "X.beegfs.scratch.ric.gentner.ric.gentner.HSC.Exp2155.results.03.aln.", replacement = "")
data <- data[,6:ncol(data)]
bcv <- 0.1 # i campioni in PCA sul totale sembrano molto vicini, quindi imposto bassa dispersione
counts <- data
comparisons <- list(T172h_vs_T124h_rd062 = c("TI72h_rd062","TI24h_rd062"),
T172h_vs_T124h_202205 = c("TI72h_202205","TI24h_202205")
)
normalized_data <- list()
# filtering out very low counts genes
comparisons_results <- list()
for(i in names(comparisons)){
if(i == "T172h_vs_T124h_202205"){
bcv <- 0.3
}else{
bcv <- 0.1
}
ids <- comparisons[[i]]
counts.ss <- subset(x = counts, select = ids)
rsums <- rowSums(counts.ss)
counts.ss <- subset(counts.ss, subset = rsums > 31) # at least 30 counts as sum in the 2
y <- DGEList(counts=counts.ss, group = ids)
TMM <- calcNormFactors(y, method = "TMM")
normdata <- cpm(y = TMM, normalized.lib.sizes = TRUE)
TMM <- exactTest(TMM, dispersion=bcv^2, pair = rev(ids))
table(p.adjust(TMM$table$PValue, method="BH")<0.05)
DGEresults <- TMM$table
DGEresults$FDR <- p.adjust(DGEresults$PValue, method="BH")
DGEresults$isDEG <- DGEresults$FDR < 0.05
# retrieve counts from the counts table
DGEresults_with_counts_tmp <- merge.data.frame(DGEresults, counts.ss, by = 0, all.x = T, sort = F)
rownames(DGEresults_with_counts_tmp) <- DGEresults_with_counts_tmp$Row.names
DGEresults_with_counts_tmp$Row.names <- NULL
comparison.dir <- paste0(cur.dir, "/", i, "/")
dir.create(comparison.dir, recursive = T, showWarnings = F)
volcanodir <- paste0(comparison.dir, "/volcano/")
dir.create(volcanodir, recursive = T,showWarnings = F)
saveRDS(object = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".rds"))
write.table(x = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".txt"),
sep = "\t", row.names = T)
comparisons_results[[i]] <- DGEresults_with_counts_tmp
colnames(normdata) <- paste0(colnames(normdata), ".cpm")
normalized_data[[i]] <- normdata
# Volcano plot + goi in all comparisons ####
logfc.thr <- 0.6
adj.pval.thr <- 0.05
ntop <- 15
df <- DGEresults_with_counts_tmp
df$gene <- rownames(DGEresults_with_counts_tmp)
df$FDR <- -log10(df$FDR)
print(head(df))
df <- mutate(df, color = case_when(df$logFC > logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Overexpressed",
df$logFC < -logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Underexpressed",
abs(df$logFC) < logfc.thr | df$FDR < -log10(adj.pval.thr) ~ "NonSignificant")
)
df <- df[order(df$logFC, decreasing = TRUE),]
under <- tail(subset(x = df, subset = color == "Underexpressed"),ntop)
over <- head(subset(x = df, subset = color == "Overexpressed"),ntop)
#genesofinterest <- subset(x = df, subset = color %in% c("Overexpressed","Underexpressed") & gene %in% goi)
hightext <- rbind(under, over)
max_x <- ceiling(max(abs(df$logFC)))
max_y <- ceiling(max(abs(na.omit(df$FDR))))
vol <- ggplot(df, aes(x = logFC, y = FDR, colour = color)) +
#ggtitle(paste(c1, "vs.", c2, sep = " ")) +
theme_bw(base_size = 12) +
theme(legend.position = "right") +
xlim(c(-max_x,max_x)) +
ylim(c(0,max_y)) +
#xlab(paste("log2(", paste(c1, c2, sep = " / "), ")", sep = "")) +
xlab("log2 Fold Change") +
ylab(expression(-log[10]("adjusted p-value"))) +
labs(title = i) +
geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray")
if (logfc.thr > 0) {
vol <- vol +
geom_vline(xintercept = logfc.thr, colour = "darkgray") +
geom_vline(xintercept = -logfc.thr, colour = "darkgray")
}
vol <- vol +
geom_point(size = 1, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Expression",
values = c(Overexpressed = "#CD4F39",
Underexpressed = "#0B5394",
NonSignificant = "darkgray")) +
geom_label_repel(data = hightext, aes(label = gene), show.legend = FALSE) +
#geom_label_repel(data = genesofinterest, aes(label = gene, fill = color), show.legend = FALSE) +
theme(legend.position = c(0.5,0.9), plot.title = element_text(hjust = 0.5))
#scale_y_continuous(trans = "log2")
ggsave(filename = paste(volcanodir,
paste(i,
"volcano.png",
sep = "-"),
sep = "/"),
plot = vol,
width = 10, height = 9, dpi = 600)
}
significant <- list()
for (k in names(comparisons_results)) {
significant[[paste0(k,"_UP")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC > 0)
significant[[paste0(k,"_DOWN")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC < 0)
significant[[paste0(k,"_UP")]] <- rownames(significant[[paste0(k,"_UP")]])
significant[[paste0(k,"_DOWN")]] <- rownames(significant[[paste0(k,"_DOWN")]])
}
# Connect to the database
ensembl = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Define the attributes to retrieve
attributes = c("external_gene_name", "description")
# Retrieve the information
gene_info = getBM(attributes, mart = ensembl)
comparisons_results[["intersection_sign_concordant_UP"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_UP,
significant$T172h_vs_T124h_202205_UP))
colnames(comparisons_results[["intersection_sign_concordant_UP"]]) <- "Concordant_UP"
rownames(comparisons_results[["intersection_sign_concordant_UP"]]) <- comparisons_results[["intersection_sign_concordant_UP"]]$Concordant_UP
tmpdf <- comparisons_results[["intersection_sign_concordant_UP"]]
tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_UP", by.y = "external_gene_name", all.x = T, sort = F)
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[3] <- "logFC_rd062"
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[6] <- "logFC_202205"
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
comparisons_results[["intersection_sign_concordant_UP"]] <- tmpdf.annot
comparisons_results[["intersection_sign_concordant_DW"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_DOWN,
significant$T172h_vs_T124h_202205_DOWN))
colnames(comparisons_results[["intersection_sign_concordant_DW"]]) <- "Concordant_DOWN"
rownames(comparisons_results[["intersection_sign_concordant_DW"]]) <- comparisons_results[["intersection_sign_concordant_DW"]]$Concordant_DOWN
tmpdf <- comparisons_results[["intersection_sign_concordant_DW"]]
tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_DOWN", by.y = "external_gene_name", all.x = T, sort = F)
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[3] <- "logFC_rd062"
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[6] <- "logFC_202205"
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
comparisons_results[["intersection_sign_concordant_DW"]] <- tmpdf.annot
saveRDS(comparisons_results, file = "Comparisons_exp1_1vs1.rds")
write.xlsx(comparisons_results, file = "Comparisons_exp1_1vs1.xlsx",
asTable = T, rowNames = T, overwrite = T, firstRow = T)
saveRDS(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.rds")
normalized_data$T172h_vs_T124h_rd062 <- as.data.frame(normalized_data$T172h_vs_T124h_rd062)
normalized_data$T172h_vs_T124h_202205 <- as.data.frame(normalized_data$T172h_vs_T124h_202205)
write.xlsx(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.xlsx",
asTable = T, rowNames = T, overwrite = T, firstRow = T)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment