# DEGs analysis customized for 1vs1
# 72h vs 24h

library(edgeR)
library(ggplot2)
library(dplyr)
library(openxlsx)
library(ggrepel)
library(biomaRt)

cur.dir <- "DGE/"

data = read.table("featureCounts_results.txt", 
                  header=T, row.names = 1)
colnames(data) <- gsub(x = gsub(x = colnames(data), pattern = "_Aligned.sortedByCoord.out.bam", replacement = ""), 
                       pattern = "X.beegfs.scratch.ric.gentner.ric.gentner.HSC.Exp2155.results.03.aln.", replacement = "")
data <- data[,6:ncol(data)]
bcv <- 0.1 # i campioni in PCA sul totale sembrano molto vicini, quindi imposto bassa dispersione

counts <- data
comparisons <- list(T172h_vs_T124h_rd062 = c("TI72h_rd062","TI24h_rd062"),
                    T172h_vs_T124h_202205 = c("TI72h_202205","TI24h_202205")
                    )
normalized_data <- list()

# filtering out very low counts genes

comparisons_results <- list()
for(i in names(comparisons)){
  
  if(i == "T172h_vs_T124h_202205"){
    bcv <- 0.3
  }else{
    bcv <- 0.1
  }
  
  ids <- comparisons[[i]]
  
  counts.ss <- subset(x = counts, select = ids)
  rsums <- rowSums(counts.ss)
  counts.ss <- subset(counts.ss, subset = rsums > 31) # at least 30 counts as sum in the 2
  
  y <- DGEList(counts=counts.ss, group = ids)
  TMM <- calcNormFactors(y, method = "TMM") 
  normdata <- cpm(y = TMM, normalized.lib.sizes = TRUE)
  TMM <- exactTest(TMM, dispersion=bcv^2, pair = rev(ids))
  table(p.adjust(TMM$table$PValue, method="BH")<0.05)
  
  DGEresults <- TMM$table
  DGEresults$FDR <- p.adjust(DGEresults$PValue, method="BH")
  DGEresults$isDEG <- DGEresults$FDR < 0.05
  
  # retrieve counts from the counts table
  
  DGEresults_with_counts_tmp <- merge.data.frame(DGEresults, counts.ss, by = 0, all.x = T, sort = F)
  rownames(DGEresults_with_counts_tmp) <- DGEresults_with_counts_tmp$Row.names
  DGEresults_with_counts_tmp$Row.names <- NULL
  
  comparison.dir <- paste0(cur.dir, "/", i, "/")
  dir.create(comparison.dir, recursive = T, showWarnings = F)
  volcanodir <- paste0(comparison.dir, "/volcano/")
  dir.create(volcanodir, recursive = T,showWarnings = F)
  
  saveRDS(object = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".rds"))
  write.table(x = DGEresults_with_counts_tmp, file = paste0(comparison.dir,"edgeR-",ids[1],"-",ids[2],".txt"), 
              sep = "\t", row.names = T)
  comparisons_results[[i]] <- DGEresults_with_counts_tmp
  colnames(normdata) <- paste0(colnames(normdata), ".cpm")
  normalized_data[[i]] <- normdata
  
  # Volcano plot + goi in all comparisons ####
  
  logfc.thr <- 0.6
  adj.pval.thr <- 0.05
  ntop <- 15
  
  df <- DGEresults_with_counts_tmp
  df$gene <- rownames(DGEresults_with_counts_tmp)
  df$FDR <- -log10(df$FDR)
  print(head(df))
  df <- mutate(df, color = case_when(df$logFC > logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Overexpressed",
                                     df$logFC < -logfc.thr & df$FDR > -log10(adj.pval.thr) ~ "Underexpressed",
                                     abs(df$logFC) < logfc.thr | df$FDR < -log10(adj.pval.thr) ~ "NonSignificant")
  )
  df <- df[order(df$logFC, decreasing = TRUE),]
  
  under <- tail(subset(x = df, subset = color == "Underexpressed"),ntop)
  over <- head(subset(x = df, subset = color == "Overexpressed"),ntop)
  #genesofinterest <- subset(x = df, subset = color %in% c("Overexpressed","Underexpressed") & gene %in% goi)
  hightext <- rbind(under, over)
  
  max_x <- ceiling(max(abs(df$logFC)))
  max_y <- ceiling(max(abs(na.omit(df$FDR))))
  
  vol <- ggplot(df, aes(x = logFC, y = FDR, colour = color)) +
    #ggtitle(paste(c1, "vs.", c2, sep = " ")) +
    theme_bw(base_size = 12) +
    theme(legend.position = "right") +
    xlim(c(-max_x,max_x)) + 
    ylim(c(0,max_y)) +
    #xlab(paste("log2(", paste(c1, c2, sep = " / "), ")", sep = "")) +
    xlab("log2 Fold Change") + 
    ylab(expression(-log[10]("adjusted p-value"))) +
    labs(title = i) +
    geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray")
  
  if (logfc.thr > 0) {
    vol <- vol +
      geom_vline(xintercept = logfc.thr, colour = "darkgray") +
      geom_vline(xintercept = -logfc.thr, colour = "darkgray")
  }
  vol <- vol +
    geom_point(size = 1, alpha = 0.8, na.rm = T) + 
    scale_color_manual(name = "Expression",
                       values = c(Overexpressed = "#CD4F39",
                                  Underexpressed = "#0B5394",
                                  NonSignificant = "darkgray")) +
    geom_label_repel(data = hightext, aes(label = gene), show.legend = FALSE) +
    #geom_label_repel(data = genesofinterest, aes(label = gene, fill = color), show.legend = FALSE) + 
    theme(legend.position = c(0.5,0.9), plot.title = element_text(hjust = 0.5))  
  #scale_y_continuous(trans = "log2")
  
  ggsave(filename = paste(volcanodir,
                          paste(i,
                                "volcano.png",
                                sep = "-"),
                          sep = "/"),
         plot = vol,
         width = 10, height = 9, dpi = 600)
}

significant <- list()
for (k in names(comparisons_results)) {
  significant[[paste0(k,"_UP")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC > 0)
  significant[[paste0(k,"_DOWN")]] <- subset(comparisons_results[[k]], subset = isDEG == T & logFC < 0)
  significant[[paste0(k,"_UP")]] <- rownames(significant[[paste0(k,"_UP")]])
  significant[[paste0(k,"_DOWN")]] <- rownames(significant[[paste0(k,"_DOWN")]])
}

# Connect to the database
ensembl = useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Define the attributes to retrieve
attributes = c("external_gene_name", "description")

# Retrieve the information
gene_info = getBM(attributes, mart = ensembl)

comparisons_results[["intersection_sign_concordant_UP"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_UP,
                                                                      significant$T172h_vs_T124h_202205_UP))
colnames(comparisons_results[["intersection_sign_concordant_UP"]]) <- "Concordant_UP"
rownames(comparisons_results[["intersection_sign_concordant_UP"]]) <- comparisons_results[["intersection_sign_concordant_UP"]]$Concordant_UP
tmpdf <- comparisons_results[["intersection_sign_concordant_UP"]]
tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_UP", by.y = "external_gene_name", all.x = T, sort = F)
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[3] <- "logFC_rd062"
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[6] <- "logFC_202205"

tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_UP", by.y = 0, all.x = T, sort = F)

comparisons_results[["intersection_sign_concordant_UP"]] <- tmpdf.annot

comparisons_results[["intersection_sign_concordant_DW"]] <- as.data.frame(intersect(significant$T172h_vs_T124h_rd062_DOWN,
                                                                                      significant$T172h_vs_T124h_202205_DOWN))

colnames(comparisons_results[["intersection_sign_concordant_DW"]]) <- "Concordant_DOWN"
rownames(comparisons_results[["intersection_sign_concordant_DW"]]) <- comparisons_results[["intersection_sign_concordant_DW"]]$Concordant_DOWN

tmpdf <- comparisons_results[["intersection_sign_concordant_DW"]]
tmpdf.annot <- merge(tmpdf, gene_info, by.x = "Concordant_DOWN", by.y = "external_gene_name", all.x = T, sort = F)
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_rd062[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[3] <- "logFC_rd062"
tmpdf.annot <- merge(tmpdf.annot, comparisons_results$T172h_vs_T124h_202205[,c(1,6:7)], by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
colnames(tmpdf.annot)[6] <- "logFC_202205"

tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_rd062, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)
tmpdf.annot <- merge.data.frame(tmpdf.annot, normalized_data$T172h_vs_T124h_202205, by.x = "Concordant_DOWN", by.y = 0, all.x = T, sort = F)

comparisons_results[["intersection_sign_concordant_DW"]] <- tmpdf.annot

saveRDS(comparisons_results, file = "Comparisons_exp1_1vs1.rds")
write.xlsx(comparisons_results, file = "Comparisons_exp1_1vs1.xlsx", 
           asTable = T, rowNames = T, overwrite = T, firstRow = T)

saveRDS(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.rds")
normalized_data$T172h_vs_T124h_rd062 <- as.data.frame(normalized_data$T172h_vs_T124h_rd062)
normalized_data$T172h_vs_T124h_202205 <- as.data.frame(normalized_data$T172h_vs_T124h_202205)
write.xlsx(normalized_data, file = "SizeFactors_norm_cpm_exp1_1vs1.xlsx", 
           asTable = T, rowNames = T, overwrite = T, firstRow = T)