From cad0b7de8de240280a1dafeadbb079c08da4c6b4 Mon Sep 17 00:00:00 2001 From: mmonti Date: Tue, 25 Mar 2025 23:33:54 +0100 Subject: [PATCH] I updated the scripts --- CB2025_figure_1_RNAseq.R | 157 ++++++++------- CB2025_figure_3_RNAseq.R | 393 ++++++++++++++++++++----------------- CB2025_figure_5_scRNAseq.R | 23 ++- TCGA_analysis.R | 79 +++++--- 4 files changed, 368 insertions(+), 284 deletions(-) diff --git a/CB2025_figure_1_RNAseq.R b/CB2025_figure_1_RNAseq.R index f06f822..5d41ec6 100644 --- a/CB2025_figure_1_RNAseq.R +++ b/CB2025_figure_1_RNAseq.R @@ -1,69 +1,88 @@ -library("readxl") -library("dplyr") - -#####Directories##### -us <- "/Users/Squadrito/" -us <-"C:/Users/bresesti.chiara/" -wdir1509<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis") -wdir1510<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX") -fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures") - - -#### Figure 1B #### -setwd(wdir1509) -df1 <- as.data.frame(read_excel("QIAseqUltraplexRNA_181342.xlsx",sheet=3,col_names = T,skip = 1))#sheet: umis.genes.polyA-mouse -df1 <-df1[,-c(1,3:6)] #keep only gene name and UMI counts -df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) #UMI normalized by CPM -gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax", - "Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1", - "Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1", - "Tek", "Lyve1", "Stab2") -df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector)) -rownames(df2) <- df2[,1] -df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform -setwd(fdir) -openxlsx::write.xlsx(df2.scaled,"Figure_1B_table.xlsx", rowNames=T) -#the df was exported and used to create a heatmap using Graphpad - -#### Figure 1C #### -# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator -# and merged with the plot generated by the code below -setwd(wdir1510) -df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA -df1 <-df1[,1:7]#UMI -rownames(df1)<-df1$miRNA -df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts -df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization -colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns -upper.panel<-function(x, y, ...){ - points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey") - above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5 - points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19) - below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5 - points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19) -} #function for MA plot -pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel, - ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) #pairwise plot - -#### Figure 1D #### -setwd(wdir1510) -df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA -df1 <- df1[-which(grepl("piR",df1[,1])),1:7]#UMI and piRNA removing -df1[,1] <- gsub("/.*","",df1[,1]) #leave only first miRNA for ambiguous entries -df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000})#UMI normalized by CPM -df.families <- as.data.frame(read_excel("Analysis CB/miRNA Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase -df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries -df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1]) -df2 <- merge(df.families,df1,by=1) -df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family -gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1", - "miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p", - "miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p", - "miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p", - "miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p") -df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector)) -rownames(df3) <- df3[,1] -df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform -setwd(fdir) -openxlsx::write.xlsx(df2.scaled,"Figure_1D_table.xlsx", rowNames=T) -#the df was exported and used to create a heatmap using Graphpad +library("readxl") +library("dplyr") + +#####Directories##### +# us <-"C:/Users/bresesti.chiara/" +# wdir1509<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis") +# wdir1510<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX") +# fdir <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures") +# +# input_d <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis") +# wdir1510 <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX") +# output_d <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures") + +input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference" +output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output" + +################################################################################ +# R script to generate data and figures for manuscript figures 1B, 1C, and 1D. +# +# This script reads RNA-seq and miRNA-seq data from Excel files, performs +# normalization and data transformation, and generates output tables and plots. +# +# Figures generated/data produced: +# - Figure 1B: Heatmap data for selected marker genes (Excel table) +# - Figure 1C: Pairwise MA plots for miRNA data +# - Figure 1D: Heatmap data for selected miRNA families (Excel table) +# +# Input data files: +# - Figure 1B: "QIAseqUltraplexRNA_181342.xlsx" (Sheet 3: umis.genes.polyA-mouse) +# - Figure 1C & 1D: "173308.all_samples.summary.xlsx" (Sheet 2: miRNA_piRNA) +# - Figure 1D: "miRNA_Family.xlsx" (Sheet 1) +# +################################################################################ + + +#### Figure 1B #### +df1 <- as.data.frame(read_excel(paste0(input_dir, "/QIAseqUltraplexRNA_181342.xlsx"), sheet=3, col_names = T, skip=1)) #sheet: umis.genes.polyA-mouse +df1 <-df1[,-c(1,3:6)] # keep only gene name and UMI counts +df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) # UMI normalized by CPM +gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax", + "Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1", + "Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1", + "Tek", "Lyve1", "Stab2") +df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector)) +rownames(df2) <- df2[,1] +df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) # Zscore normalization. Scaled only works on columns, so need to transform +openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1B_table.xlsx"), rowNames=T) +# the df was exported and used to create a heatmap using Graphpad + +#### Figure 1C #### +# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator +# and merged with the plot generated by the code below +df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA +df1 <-df1[,1:7] #UMI +rownames(df1)<-df1$miRNA +df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts +df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization +colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns +upper.panel<-function(x, y, ...){ + points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey") + above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5 + points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19) + below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5 + points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19) +} # function for MA plot +pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel, + ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) # pairwise plot + +#### Figure 1D #### +df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA +df1 <- df1[-which(grepl("piR",df1[,1])),1:7] # UMI and piRNA removing +df1[,1] <- gsub("/.*","",df1[,1]) # leave only first miRNA for ambiguous entries +df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000}) #UMI normalized by CPM +df.families <- as.data.frame(read_excel("Analysis CB/miRNA_Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase +df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries +df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1]) +df2 <- merge(df.families,df1,by=1) +df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family +gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1", + "miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p", + "miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p", + "miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p", + "miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p") +df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector)) +rownames(df3) <- df3[,1] +df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform +openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1D_table.xlsx"), rowNames=T) +#the df was exported and used to create a heatmap using Graphpad diff --git a/CB2025_figure_3_RNAseq.R b/CB2025_figure_3_RNAseq.R index 75d05bb..b96d05d 100644 --- a/CB2025_figure_3_RNAseq.R +++ b/CB2025_figure_3_RNAseq.R @@ -1,185 +1,208 @@ -library(readxl) -library(ggplot2) -library(ggrepel) -library(dplyr) -library(fgsea) -library(clusterProfiler) -library(enrichplot) - -##### Directories ##### -#us <- "/Users/Squadrito/" -us <-"C:/Users/bresesti.chiara/" -wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr") -wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2") -fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures") - -#### Figure 3A&B #### - -#Import df -setwd(wdir) -miR_ctrl <- read_excel("edgeR_results.xlsx", sheet = "miR342-control") -sponge_ctrl <- read_excel("edgeR_results.xlsx", sheet = "spongeBT-control") - -#Add DEG color and label for volcano plot -miR_ctrl$DEG <- "NO" -miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP" -miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN" -miR_ctrl$DEG_label <- NA -miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"] - -sponge_ctrl$DEG <- "NO" -sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP" -sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN" -sponge_ctrl$DEG_label <- NA -sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"] - -#Volcano plot (left panel) -data <- miR_ctrl -ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) + - geom_point() + - theme_minimal() + - geom_text_repel() + - scale_color_manual(values=c("blue", "grey", "red")) + - geom_vline(xintercept=c(-1, 1), col="black") + - geom_hline(yintercept=-log10(0.05), col="black") + - scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) + - scale_y_continuous(limits = c(0,20)) + - xlab(bquote(Log[2](FC))) + - ylab(bquote(-Log[10](PVal))) - -data <- sponge_ctrl -ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) + - geom_point() + - theme_minimal() + - geom_text_repel() + - scale_color_manual(values=c("blue", "grey", "red")) + - geom_vline(xintercept=c(-1, 1), col="black") + - geom_hline(yintercept=-log10(0.05), col="black") + - scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) + - scale_y_continuous(limits = c(0,20)) + - xlab(bquote(Log[2](FC))) + - ylab(bquote(-Log[10](PVal))) - -#Import miR-342-3p target list (from TargetScan) -setwd(wdir_CB) -miR342_targets <- read_excel("TargetScan8.0__miR-342-3p.predicted_targets.xlsx") -miR342_targets <- filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3)) - -#ecdf plot (right panel) -data <- miR_ctrl -test <-filter(data, data$...1 %in% miR342_targets$'Target gene') -plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T, - ylab = "Fraction of DEG", - xlab = bquote(Log[2](FC)), - xlim=c(-0.5,0.5), - ylim=c(0,1), - main="ECDF of gene expression: miR vs mut") + - lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) + - abline(v=0, col="black") + - abline(h=0.5, col="black") + - legend("bottomright", c("All genes","miR-342-3p targets"), - col = c("black","blue"), lwd=2, cex = 0.7) -ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data - -data <- sponge_ctrl -test <-filter(data, data$...1 %in% miR342_targets$'Target gene') -plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T, - ylab = "Fraction of DEG", - xlab = bquote(Log[2](FC)), - xlim=c(-0.3,0.3), - ylim=c(0,1), - main="ECDF of gene expression: sponge vs scr") + - lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) + - abline(v=0, col="black") + - abline(h=0.5, col="black") + - legend("bottomright", c("All genes","miR-342-3p targets"), - col = c("black","red"), lwd=2, cex = 0.7) -ks.test(test$logFC, data$logFC, alternative = "l") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data - -#### Figure 3C&D #### - -#Upload df -setwd(wdir) -df1<- read_excel("EdgeR_results.xlsx",, sheet = "miR342-control") -setwd(wdir_CB) -miDB_sig5 <- readRDS("miDB_sig5.MLS.rds") #GO terms db - -#Rename pathways of interest in miDB_sig5 -names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process" -names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa" -names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1" -names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation" -names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production" -names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis" -names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity" -names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes" - -#Filter low expression, NA and order by FC -df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1)) -df1<-df1[order(df1$logFC),] -df2<-df1$logFC -names(df2)<-df1$...1 - -#Run GSEA with fgsea and filter by PVal -sig <- miDB_sig5 -test1<-fgsea(sig, df2,minSize = 7,maxSize =500, nproc=1) -PvalResult<-filter(test1, test1$padj<= 0.05) - -#DotPlot pathways of interest (Fig.3C) -pathways <- c("Oxydative phosphorylation", - "Regulation of cholesterol metabolic process", - "Response to IL12", - "Regulation of lipid biosynthetic process", - "MHC-I protein binding", - "TNFa mediated signalling pathway", - "Negative regulation of cell cycle progression", - "Fatty acyl-CoA biosynthetic process", - "Response to TNFa", - "G2M checkpoint", - "Response to type I interferon", - "IL1 mediated signalling pathway", - "Response to IL1", - "LPS response genes", - "Cytokine mediated signalling pathway", - "Myeloid cell differentiation", - "IL10 production", - "Regulation of endothelial cell proliferation", - "TNFa superfamily cytokine production", - "Angiogenesis", - "Pattern recognition receptor activity", - "PGE2 response genes") - -genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,]) -ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + - geom_point() + - scale_size_area(limits=c(10,450), max_size = 15) + - scale_colour_gradient(low="red",high="blue") + - labs(y='Pathway',x='NES') - -#Run GSEA with ClusterProfiler -genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)), - gene = unlist(miDB_sig5)) -df2 = sort(df2, decreasing = TRUE) -test2 <- GSEA(df2,TERM2GENE = genelist) - -#Enrichment map (Fig.3D) -test3 <- filter(test2, ID %in% pathways) -test3 <- pairwise_termsim(test3) -emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T)+ - scale_fill_gradient2(name=bquote(NES),low="blue", high="red") -#N.B. Produces slightly different plot every time, -#but connection between pathways stays the same \ No newline at end of file +library(readxl) +library(ggplot2) +library(ggrepel) +library(dplyr) +library(fgsea) +library(clusterProfiler) +library(enrichplot) + +##### Directories ##### +# us <-"C:/Users/bresesti.chiara/" +# wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr") +# wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2") +# fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures") + +input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference" +output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output" + +################################################################################ +# +# R script to generate data and figures for manuscript figures 3A, 3B, 3C, and 3D. +# This script performs differential gene expression analysis and gene set enrichment +# analysis to generate volcano plots, empirical cumulative distribution function (ECDF) plots, +# dot plots, and enrichment maps. +# +# Figures generated: +# - Figure 3A & 3B: Volcano plots of differentially expressed genes in miR-342-3p +# overexpression and sponge experiments, and ECDF plots comparing +# logFC distributions of all genes vs. miR-342-3p target genes. +# - Figure 3C: Dot plot visualizing enriched pathways from gene set enrichment analysis (GSEA). +# - Figure 3D: Enrichment map visualizing relationships between enriched pathways. +# +# Input data files: +# - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control") +# (Output from differential gene expression analysis, likely using edgeR) +# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1) +# (List of predicted miR-342-3p target genes from TargetScan) +# - Figure 3C & 3D: "EdgeR_results.xlsx" (Sheet "miR342-control") +# (Same as input for Figure 3A & 3B, used for GSEA) +# - Figure 3C & 3D: "miDB_sig5.MLS.rds" (RDS file containing gene sets for gene set enrichment analysis) +# +################################################################################ + + + +#### Figure 3A&B #### + +#Import df +miR_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control") +sponge_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "spongeBT-control") + +#Add DEG color and label for volcano plot +miR_ctrl$DEG <- "NO" +miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP" +miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN" +miR_ctrl$DEG_label <- NA +miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"] + +sponge_ctrl$DEG <- "NO" +sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP" +sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN" +sponge_ctrl$DEG_label <- NA +sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"] + +#Volcano plot (left panel) +data <- miR_ctrl +ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) + + geom_point() + + theme_minimal() + + geom_text_repel() + + scale_color_manual(values=c("blue", "grey", "red")) + + geom_vline(xintercept=c(-1, 1), col="black") + + geom_hline(yintercept=-log10(0.05), col="black") + + scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) + + scale_y_continuous(limits = c(0,20)) + + xlab(bquote(Log[2](FC))) + + ylab(bquote(-Log[10](PVal))) + +data <- sponge_ctrl +ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) + + geom_point() + + theme_minimal() + + geom_text_repel() + + scale_color_manual(values=c("blue", "grey", "red")) + + geom_vline(xintercept=c(-1, 1), col="black") + + geom_hline(yintercept=-log10(0.05), col="black") + + scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) + + scale_y_continuous(limits = c(0,20)) + + xlab(bquote(Log[2](FC))) + + ylab(bquote(-Log[10](PVal))) + +# Import miR-342-3p target list (from TargetScan) +miR342_targets <- read_excel(paste0(input_dir, "/TargetScan8.0__miR-342-3p.predicted_targets.xlsx")) +miR342_targets <- filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3)) + +#ecdf plot (right panel) +data <- miR_ctrl +test <-filter(data, data$...1 %in% miR342_targets$'Target gene') +plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T, + ylab = "Fraction of DEG", + xlab = bquote(Log[2](FC)), + xlim=c(-0.5,0.5), + ylim=c(0,1), + main="ECDF of gene expression: miR vs mut") + + lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) + + abline(v=0, col="black") + + abline(h=0.5, col="black") + + legend("bottomright", c("All genes","miR-342-3p targets"), + col = c("black","blue"), lwd=2, cex = 0.7) +ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data + +data <- sponge_ctrl +test <-filter(data, data$...1 %in% miR342_targets$'Target gene') +plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T, + ylab = "Fraction of DEG", + xlab = bquote(Log[2](FC)), + xlim=c(-0.3,0.3), + ylim=c(0,1), + main="ECDF of gene expression: sponge vs scr") + + lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) + + abline(v=0, col="black") + + abline(h=0.5, col="black") + + legend("bottomright", c("All genes","miR-342-3p targets"), + col = c("black","red"), lwd=2, cex = 0.7) +ks.test(test$logFC, data$logFC, alternative = "l") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data + +#### Figure 3C&D #### + +# Load df +df1<- read_excel(paste0(input_dir, "/EdgeR_results.xlsx"), sheet = "miR342-control") +miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) #GO terms db + +#Rename pathways of interest in miDB_sig5 +names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process" +names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa" +names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1" +names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation" +names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production" +names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis" +names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity" +names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes" + +# Filter low expression, NA and order by FC +df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1)) +df1<-df1[order(df1$logFC),] +df2<-df1$logFC +names(df2)<-df1$...1 + +# Run GSEA with fgsea and filter by PVal +sig <- miDB_sig5 +test1 <- fgsea(sig, df2, minSize=7, maxSize=500, nproc=1) +PvalResult <- filter(test1, test1$padj <= 0.05) + +# DotPlot pathways of interest (Fig.3C) +pathways <- c("Oxydative phosphorylation", + "Regulation of cholesterol metabolic process", + "Response to IL12", + "Regulation of lipid biosynthetic process", + "MHC-I protein binding", + "TNFa mediated signalling pathway", + "Negative regulation of cell cycle progression", + "Fatty acyl-CoA biosynthetic process", + "Response to TNFa", + "G2M checkpoint", + "Response to type I interferon", + "IL1 mediated signalling pathway", + "Response to IL1", + "LPS response genes", + "Cytokine mediated signalling pathway", + "Myeloid cell differentiation", + "IL10 production", + "Regulation of endothelial cell proliferation", + "TNFa superfamily cytokine production", + "Angiogenesis", + "Pattern recognition receptor activity", + "PGE2 response genes") + +genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,]) +ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + + geom_point() + + scale_size_area(limits=c(10,450), max_size = 15) + + scale_colour_gradient(low="red",high="blue") + + labs(y='Pathway', x='NES') + +#Run GSEA with ClusterProfiler +genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)), + gene = unlist(miDB_sig5)) +df2 = sort(df2, decreasing = TRUE) +test2 <- GSEA(df2,TERM2GENE = genelist) + +#Enrichment map (Fig.3D) +test3 <- filter(test2, ID %in% pathways) +test3 <- pairwise_termsim(test3) +emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T) + + scale_fill_gradient2(name=bquote(NES),low="blue", high="red") \ No newline at end of file diff --git a/CB2025_figure_5_scRNAseq.R b/CB2025_figure_5_scRNAseq.R index ad39882..0e7a83e 100644 --- a/CB2025_figure_5_scRNAseq.R +++ b/CB2025_figure_5_scRNAseq.R @@ -29,7 +29,28 @@ dir.create(plot_dir, showWarnings=F, recursive=T) sig <- readRDS("/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds") -############################################################################### +################################################################################ +# R script for single-cell RNA-seq data analysis and figure generation. +# +# This script performs cell annotation, visualization, differential gene expression +# analysis, and gene set enrichment analysis (GSEA) on single-cell RNA-seq data. +# +# Figures generated/data produced: +# - Figure 5A: UMAP visualization of cell clusters with final annotation +# - Figure 5B: UMAP plot with overlayed density of mOrange+ cells +# - Figure 5C: Dotplot of Slc7a11 gene expression across cell types and groups +# - Figure 5D: Barplot of GSEA results for selected gene sets +# - Figure 5E: Heatmap of cytokine signature GSEA results +# - Supplementary Figure 5A: Dotplot of marker gene expression across cell types +# - Supplementary Figure 5B: CSV tables for cell distribution per cluster and sample +# - Supplementary Figure 5C: CSV tables for mOrange+ cell distribution per cluster and sample +# +# Input data files: +# - RDS object: "CB1_CB3_CB4_final.rds" (Seurat object containing scRNA-seq data) +# - RDS object: "miDB_sig5.MLS.rds" (GO terms database for GSEA) +# +################################################################################ + set.seed(42) diff --git a/TCGA_analysis.R b/TCGA_analysis.R index a2a086d..05a3eea 100644 --- a/TCGA_analysis.R +++ b/TCGA_analysis.R @@ -7,44 +7,73 @@ library(readxl) library(survminer) library(gridExtra) library(ggplot2) + ##Load dataset -#username <- "C:/Users/notaro.marco/" -username <- "/Users/bresesti.chiara/" -wdir<- paste0(username, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al Cell Reports/TCGA_analysis") +wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts" setwd(wdir) -metaD<-data.frame(fread("TCGA_phenotype_denseDataOnlyDownload.tsv.gz", sep='\t',colClasses=c("character"),data.table=FALSE)) -Surv01<-data.frame(fread("Survival_SupplementalTable_S1_20171025_xena_sp", sep='\t',colClasses=c("character"),data.table=FALSE)) + +input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference" +output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output" + + +################################################################################ +# R script for TCGA survival analysis of miR-342-3p expression. +# +# This script analyzes TCGA pancancer miRNA expression and survival data to assess +# the prognostic value of hsa-miR-342-3p across different tumor types. +# It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves +# to visualize the association between miR-342-3p expression levels and patient survival. +# +# Figures generated: +# - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and +# significance of miR-342-3p expression on overall survival across various tumor types. +# - Survival_***_342 (Survival curves - multiple tumors): Set of Kaplan-Meier survival plots +# for tumor types showing significant hazard ratios, illustrating survival differences +# between patients with high and low miR-342-3p expression. +# - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot +# specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival +# in this patient subgroup. +# +# Input data files: +# - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata), +# downloaded from TCGA or Xena, contains clinical and demographic information. +# - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status. +# - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz: +# TCGA pancancer miRNA expression data (FPKM values), downloaded from Xena, +# contains miRNA expression levels across different tumor samples. +# +# Note: +# - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves) +# and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script. +################################################################################ + + +metaD<-data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE)) +Surv01<-data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE)) # FPKM01<-fread("tcga_RSEM_Hugo_norm_count") -FPKM01<-fread("pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz") +FPKM01<-fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz")) FPKM01<-as.data.frame(FPKM01) -# Annot<-data.frame(fread("probeMap_gencode.v23.annotation.gene.probemap", sep='\t',colClasses=c("character"),data.table=FALSE)) +# Annot<-data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE)) allGenes <- FPKM01[,1] - - - ####################################################################### #####PLOT HR Score of signature for multiple tumor types############### ##################################################################### humanGenes<-list(c("hsa-miR-342-3p")) - ####create xlsx file with genes, patients and tumor types Pan.data3 <- metaD[,] Pan.data4 <- merge(Surv01,Pan.data3,by=1) Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,] - rownames(Genes_selected)<-Genes_selected[,1] Genes_selected<-Genes_selected[,-1] Genes_selected <- data.frame(t(Genes_selected)) - Genes_selected$sample<-rownames(Genes_selected) Pan.data5 <- merge(Genes_selected,Pan.data4,by.y="sample") colnames(Pan.data5) -#setwd(wdir) -#write.xlsx(Pan.data5, "miR342_patients_survival.xlsx") +write.xlsx(Pan.data5, paste0(ourdir, "/miR342_patients_survival.xlsx")) #############Regression with defined k2 @@ -63,7 +92,7 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){ df2 <- rbind(df2,df1) df2<-df2[(order(df2$p)),] }else{}} -#write.xlsx(df2, "miR342_HR_by_tumortype.xlsx") +write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx")) ###Filters selected tumors @@ -75,21 +104,13 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label = geom_point(size = 4) + geom_errorbar(aes(ymin = pmax(HR-SE, 0), ymax = HR + SE), width = 0.2) + scale_color_manual(name = "Color", values = c("red", "blue")) + - labs(title = "HR by tumor type", - x = "Tumor Type", - y = "HR") + + labs(title = "HR by tumor type", x = "Tumor Type", y = "HR") + geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 1), fill = "lightblue", alpha = 0.01) + geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = 1, ymax = Inf), fill = "pink", alpha = 0.01)+ theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust= 1, color = "Black")) + - geom_hline(yintercept = 1, linetype = "dashed", color = "gray") - - -# pdf("HR_alltumors_342.pdf",width=9,height = 4) -a -# dev.off() - - + geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4) ####Plot survival of chosen tumors @@ -98,7 +119,7 @@ df3<-df3[order(df3$HR),] df3<-df3[df3$p<0.05,] -# pdf("Survival_***_342",width=10,height = 6) +#pdf(paste0(output_dir, "/Survival_***_342"), width=10,height = 6) par(mfrow = c(2,4)) survival<-list() for (i in df3$Tumor){ @@ -141,7 +162,7 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_val par(las = 0) -pdf("Survival_metastatic_342.pdf",width=5,height =5) +pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5,height =5) plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n", main = "Metastatic tumors") #Add legend -- GitLab