upload DGE script

5b7884f3 · Matteo Barcella · 2cf381d4 · 5b7884f3
Commit 5b7884f3 authored Aug 04, 2025 by Matteo Barcella
--- a/DGE/DGE.R
+++ b/DGE/DGE.R
+# DEGs analysis customized for 1vs1
+# 72h vs 24h
+
+library(edgeR)
+library(ggplot2)
+library(dplyr)
+library(openxlsx)
+library(ggrepel)
+library(biomaRt)
+
+cur.dir <- "DGE/"
+
+data = read.table("featureCounts_results.txt", 
+                  header=T, row.names = 1)
+colnames(data) <- gsub(x = gsub(x = colnames(data), pattern = "_Aligned.sortedByCoord.out.bam", replacement = ""), 
+                       pattern = "X.beegfs.scratch.ric.gentner.ric.gentner.HSC.Exp2155.results.03.aln.", replacement = "")
+data <- data[,6:ncol(data)]
+bcv <- 0.1 # i campioni in PCA sul totale sembrano molto vicini, quindi imposto bassa dispersione
+
+counts <- data
+comparisons <- list(T172h_vs_T124h_rd062 = c("TI72h_rd062","TI24h_rd062"),
+                    T172h_vs_T124h_202205 = c("TI72h_202205","TI24h_202205")
+                    )
+normalized_data <- list()
+
+# filtering out very low counts genes
+
+comparisons_results <- list()
+for(i in names(comparisons)){
+  
+  if(i == "T172h_vs_T124h_202205"){
+    bcv <- 0.3
+  }else{
+    bcv <- 0.1
+  }
+  
+  ids <- comparisons[[i]]
+  
+  counts.ss <- subset(x = counts, select = ids)
+  rsums <- rowSums(counts.ss)
+  counts.ss <- subset(counts.ss, subset = rsums > 31) # at least 30 counts as sum in the 2
+  
+  y <- DGEList(counts=counts.ss, group = ids)
+  TMM <- calcNormFactors(y, method = "TMM") 
+  normdata <- cpm(y = TMM, normalized.lib.sizes = TRUE)
+  TMM <- exactTest(TMM, dispersion=bcv^2, pair = rev(ids))
+  table(p.adjust(TMM$table$PValue, method="BH")<0.05)
+  
+  DGEresults <- TMM$table
+  DGEresults$FDR <- p.adjust(DGEresults$PValue, method="BH")
+  DGEresults$isDEG <- DGEresults$FDR < 0.05
+  
+  # retrieve counts from the counts table
+  
+  DGEresults_with_counts_tmp <- merge.data.frame(DGEresults, counts.ss, by = 0, all.x = T, sort = F)
+  rownames(DGEresults_with_counts_tmp) <- DGEresults_with_counts_tmp$Row.names
+  DGEresults_with_counts_tmp$Row.names <- NULL
+  
+  comparison.dir <- paste0(cur.dir, "/", i, "/")
+  dir.create(comparison.dir, recursive = T, showWarnings = F)
+  volcanodir <- paste0(comparison.dir, "/volcano/")
+  dir.create(volcanodir, recursive = T,showWarnings = F)
+  
+  saveRDS(object = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".rds"))
+  write.table(x = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".txt"), 
+              sep = "\t", row.names = T)
+  comparisons_results[[i]] <- DGEresults_with_counts_tmp
+  colnames(normdata) <- paste0(colnames(normdata), ".cpm")
+  normalized_data[[i]] <- normdata
+  
+  # Volcano plot + goi in all comparisons ####
+  
+  logfc.thr <- 0.6
+  adj.pval.thr <- 0.05
+  ntop <- 15
+  
+  df <- DGEresults_with_counts_tmp
+  df$gene <- rownames(DGEresults_with_counts_tmp)
+  df$FDR <- -log10(df$FDR)
+  print(head(df))
+  df <- mutate(df, color = case_when(df$logFC > logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Overexpressed",
+                                     df$logFC < -logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Underexpressed",
+                                     abs(df$logFC) < logfc.thr | df$FDR < -log10(adj.pval.thr) ~ "NonSignificant")
+  )
+  df <- df[order(df$logFC, decreasing = TRUE),]
+  
+  under <- tail(subset(x = df, subset = color == "Underexpressed"),ntop)
+  over <- head(subset(x = df, subset = color == "Overexpressed"),ntop)
+  #genesofinterest <- subset(x = df, subset = color %in% c("Overexpressed","Underexpressed") & gene %in% goi)
+  hightext <- rbind(under, over)
+  
+  max_x <- ceiling(max(abs(df$logFC)))
+  max_y <- ceiling(max(abs(na.omit(df$FDR))))
+  
+  vol <- ggplot(df, aes(x = logFC, y = FDR, colour = color)) +
+    #ggtitle(paste(c1, "vs.", c2, sep = " ")) +
+    theme_bw(base_size = 12) +
+    theme(legend.position = "right") +
+    xlim(c(-max_x,max_x)) + 
+    ylim(c(0,max_y)) +
+    #xlab(paste("log2(", paste(c1, c2, sep = " / "), ")", sep = "")) +
+    xlab("log2 Fold Change") + 
+    ylab(expression(-log[10]("adjusted p-value"))) +
+    labs(title = i) +
+    geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray")
+  
+  if (logfc.thr > 0) {
+    vol <- vol +
+      geom_vline(xintercept = logfc.thr, colour = "darkgray") +
+      geom_vline(xintercept = -logfc.thr, colour = "darkgray")
+  }
+  vol <- vol +
+    geom_point(size = 1, alpha = 0.8, na.rm = T) + 
+    scale_color_manual(name = "Expression",
+                       values = c(Overexpressed = "#CD4F39",
+                                  Underexpressed = "#0B5394",
+                                  NonSignificant = "darkgray")) +
+    geom_label_repel(data = hightext, aes(label = gene), show.legend = FALSE) +
+    #geom_label_repel(data = genesofinterest, aes(label = gene, fill = color), show.legend = FALSE) + 
+    theme(legend.position = c(0.5,0.9), plot.title = element_text(hjust = 0.5))  
+  #scale_y_continuous(trans = "log2")
+  
+  ggsave(filename = paste(volcanodir,
+                          paste(i,
+                                "volcano.png",
+                                sep = "-"),
+                          sep = "/"),
+         plot = vol,
+         width = 10, height = 9, dpi = 600)
+}
+
+significant <- list()
+for (k in names(comparisons_results)) {
+  significant[[paste0(k,"_UP")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC > 0)
+  significant[[paste0(k,"_DOWN")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC < 0)
+  significant[[paste0(k,"_UP")]] <- rownames(significant[[paste0(k,"_UP")]])
+  significant[[paste0(k,"_DOWN")]] <- rownames(significant[[paste0(k,"_DOWN")]])
+}
+
+# Connect to the database
+ensembl = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
+
+# Define the attributes to retrieve
+attributes = c("external_gene_name", "description")
+
+# Retrieve the information
+gene_info = getBM(attributes, mart = ensembl)
+
+comparisons_results[["intersection_sign_concordant_UP"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_UP,
+                                                                      significant$T172h_vs_T124h_202205_UP))
+colnames(comparisons_results[["intersection_sign_concordant_UP"]]) <- "Concordant_UP"
+rownames(comparisons_results[["intersection_sign_concordant_UP"]]) <- comparisons_results[["intersection_sign_concordant_UP"]]$Concordant_UP
+tmpdf <- comparisons_results[["intersection_sign_concordant_UP"]]
+tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_UP", by.y = "external_gene_name", all.x = T, sort = F)
+tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
+colnames(tmpdf.annot)[3] <- "logFC_rd062"
+tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
+colnames(tmpdf.annot)[6] <- "logFC_202205"
+
+tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
+tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
+
+comparisons_results[["intersection_sign_concordant_UP"]] <- tmpdf.annot
+
+comparisons_results[["intersection_sign_concordant_DW"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_DOWN,
+                                                                                      significant$T172h_vs_T124h_202205_DOWN))
+
+colnames(comparisons_results[["intersection_sign_concordant_DW"]]) <- "Concordant_DOWN"
+rownames(comparisons_results[["intersection_sign_concordant_DW"]]) <- comparisons_results[["intersection_sign_concordant_DW"]]$Concordant_DOWN
+
+tmpdf <- comparisons_results[["intersection_sign_concordant_DW"]]
+tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_DOWN", by.y = "external_gene_name", all.x = T, sort = F)
+tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
+colnames(tmpdf.annot)[3] <- "logFC_rd062"
+tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
+colnames(tmpdf.annot)[6] <- "logFC_202205"
+
+tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
+tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
+
+comparisons_results[["intersection_sign_concordant_DW"]] <- tmpdf.annot
+
+saveRDS(comparisons_results, file = "Comparisons_exp1_1vs1.rds")
+write.xlsx(comparisons_results, file = "Comparisons_exp1_1vs1.xlsx", 
+           asTable = T, rowNames = T, overwrite = T, firstRow = T)
+
+saveRDS(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.rds")
+normalized_data$T172h_vs_T124h_rd062 <- as.data.frame(normalized_data$T172h_vs_T124h_rd062)
+normalized_data$T172h_vs_T124h_202205 <- as.data.frame(normalized_data$T172h_vs_T124h_202205)
+write.xlsx(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.xlsx", 
+           asTable = T, rowNames = T, overwrite = T, firstRow = T)
+