I updated the scripts

cad0b7de · Marco Monti · a78a5561 · cad0b7de · cad0b7de · cad0b7de
Commit cad0b7de authored Mar 25, 2025 by Marco Monti
--- a/CB2025_figure_1_RNAseq.R
+++ b/CB2025_figure_1_RNAseq.R
 library("readxl")
 library("dplyr")
 #####Directories#####
-us <- "/Users/Squadrito/"
+# us <-"C:/Users/bresesti.chiara/"
-us <-"C:/Users/bresesti.chiara/"
+# wdir1509<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
-wdir1509<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
+# wdir1510<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
-wdir1510<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
+# fdir <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
-fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
+# 
+# input_d <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
+# wdir1510 <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
-#### Figure 1B ####
+# output_d <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
-setwd(wdir1509)
-df1 <- as.data.frame(read_excel("QIAseqUltraplexRNA_181342.xlsx",sheet=3,col_names = T,skip = 1))#sheet: umis.genes.polyA-mouse
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
-df1 <-df1[,-c(1,3:6)] #keep only gene name and UMI counts
+output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
-df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) #UMI normalized by CPM
-gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax", 
+################################################################################
-                 "Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1", 
+# R script to generate data and figures for manuscript figures 1B, 1C, and 1D.
-                 "Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1", 
+#
-                 "Tek", "Lyve1", "Stab2")
+# This script reads RNA-seq and miRNA-seq data from Excel files, performs
-df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector))
+# normalization and data transformation, and generates output tables and plots.
-rownames(df2) <- df2[,1]
+#
-df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
+# Figures generated/data produced:
-setwd(fdir)
+#   - Figure 1B: Heatmap data for selected marker genes (Excel table)
-openxlsx::write.xlsx(df2.scaled,"Figure_1B_table.xlsx", rowNames=T)
+#   - Figure 1C: Pairwise MA plots for miRNA data
-#the df was exported and used to create a heatmap using Graphpad
+#   - Figure 1D: Heatmap data for selected miRNA families (Excel table)
+#
-#### Figure 1C ####
+# Input data files:
-# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator 
+#   - Figure 1B: "QIAseqUltraplexRNA_181342.xlsx" (Sheet 3: umis.genes.polyA-mouse)
-# and merged with the plot generated by the code below
+#   - Figure 1C & 1D: "173308.all_samples.summary.xlsx" (Sheet 2: miRNA_piRNA)
-setwd(wdir1510)
+#   - Figure 1D: "miRNA_Family.xlsx" (Sheet 1)
-df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA
+#
-df1 <-df1[,1:7]#UMI 
+################################################################################
-rownames(df1)<-df1$miRNA
-df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts
-df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization
+#### Figure 1B ####
-colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/QIAseqUltraplexRNA_181342.xlsx"), sheet=3, col_names = T, skip=1)) #sheet: umis.genes.polyA-mouse
-upper.panel<-function(x, y, ...){
+df1 <-df1[,-c(1,3:6)] # keep only gene name and UMI counts
-  points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey")
+df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) # UMI normalized by CPM
-  above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5
+gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax", 
-  points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19)
+                 "Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1", 
-  below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5
+                 "Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1", 
-  points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19)
+                 "Tek", "Lyve1", "Stab2")
-} #function for MA plot
+df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector))
-pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel, 
+rownames(df2) <- df2[,1]
-      ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) #pairwise plot
+df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) # Zscore normalization. Scaled only works on columns, so need to transform
+openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1B_table.xlsx"), rowNames=T)
-#### Figure 1D ####
+# the df was exported and used to create a heatmap using Graphpad
-setwd(wdir1510)
-df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA
+#### Figure 1C ####
-df1 <- df1[-which(grepl("piR",df1[,1])),1:7]#UMI and piRNA removing
+# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator 
-df1[,1] <- gsub("/.*","",df1[,1]) #leave only first miRNA for ambiguous entries
+# and merged with the plot generated by the code below
-df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000})#UMI normalized by CPM
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA
-df.families <- as.data.frame(read_excel("Analysis CB/miRNA Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase
+df1 <-df1[,1:7] #UMI 
-df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries
+rownames(df1)<-df1$miRNA
-df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1])
+df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts
-df2 <- merge(df.families,df1,by=1)
+df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization
-df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family
+colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns
-gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1",
+upper.panel<-function(x, y, ...){
-                 "miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p",
+  points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey")
-                 "miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p",
+  above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5
-                 "miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p",
+  points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19)
-                 "miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p")
+  below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5
-df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector))
+  points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19)
-rownames(df3) <- df3[,1]
+} # function for MA plot
-df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
+pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel, 
-setwd(fdir)
+      ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) # pairwise plot
-openxlsx::write.xlsx(df2.scaled,"Figure_1D_table.xlsx", rowNames=T)
-#the df was exported and used to create a heatmap using Graphpad
+#### Figure 1D ####
+# Load the miRNA/piRNA UMI table (sheet 2: miRNA_piRNA) used for the Figure 1D family heatmap.
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=TRUE)) #sheet: miRNA_piRNA
+# Keep columns 1-7 (ID + UMI counts) and drop piRNA rows. A logical mask is used instead of
+# -which(): when no ID matches "piR", -which() yields integer(0) and would drop every row.
+df1 <- df1[!grepl("piR",df1[,1]),1:7]
+df1[,1] <- gsub("/.*","",df1[,1]) # leave only first miRNA for ambiguous entries
+df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=TRUE)*1000000}) #UMI normalized by CPM
+# This script has no active setwd() call, so a bare relative path would resolve against whatever
+# the current directory happens to be; read the family table from input_dir like the other inputs.
+# NOTE(review): assumes miRNA_Family.xlsx sits directly in input_dir (not in an "Analysis CB" subfolder) - confirm.
+df.families <- as.data.frame(read_excel(paste0(input_dir, "/miRNA_Family.xlsx"),sheet=1,col_names = TRUE))#miRNA families from miRBase
+df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries
+df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1])
+df2 <- merge(df.families,df1,by=1)
+df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family
+gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1",
+                 "miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p",
+                 "miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p",
+                 "miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p",
+                 "miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p")
+# Keep the selected miRNA families, in the display order given by gene_vector.
+df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector))
+rownames(df3) <- df3[,1]
+df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
+# Export the family table (df3.scaled); df2.scaled still holds the Figure 1B gene table.
+openxlsx::write.xlsx(df3.scaled, paste0(output_dir, "/Figure_1D_table.xlsx"), rowNames=TRUE)
+#the df was exported and used to create a heatmap using Graphpad
--- a/CB2025_figure_3_RNAseq.R
+++ b/CB2025_figure_3_RNAseq.R
 library(readxl)
 library(ggplot2)
 library(ggrepel)
 library(dplyr)
 library(fgsea)
 library(clusterProfiler)
 library(enrichplot)
 ##### Directories #####
-#us <- "/Users/Squadrito/"
+# us <-"C:/Users/bresesti.chiara/"
-us <-"C:/Users/bresesti.chiara/"
+# wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
-wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
+# wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
-wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
+# fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
-fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
-#### Figure 3A&B ####
+output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
-#Import df
+################################################################################
-setwd(wdir)
+# 
-miR_ctrl <- read_excel("edgeR_results.xlsx", sheet = "miR342-control")
+# R script to generate data and figures for manuscript figures 3A, 3B, 3C, and 3D.
-sponge_ctrl <- read_excel("edgeR_results.xlsx", sheet = "spongeBT-control")
+# This script visualizes precomputed differential gene expression results and runs gene set enrichment
+# analysis to generate volcano plots, empirical cumulative distribution function (ECDF) plots, 
-#Add DEG color and label for volcano plot
+# dot plots, and enrichment maps.
-miR_ctrl$DEG <- "NO"
+#
-miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
+# Figures generated:
-miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
+# - Figure 3A & 3B: Volcano plots of differentially expressed genes in miR-342-3p
-miR_ctrl$DEG_label <- NA
+# overexpression and sponge experiments, and ECDF plots comparing
-miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"]
+# logFC distributions of all genes vs. miR-342-3p target genes.
+# - Figure 3C: Dot plot visualizing enriched pathways from gene set enrichment analysis (GSEA).
-sponge_ctrl$DEG <- "NO"
+# - Figure 3D: Enrichment map visualizing relationships between enriched pathways.
-sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP"
+#
-sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
+# Input data files:
-sponge_ctrl$DEG_label <- NA
+# - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control")
-sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]
+# (Output from differential gene expression analysis, likely using edgeR)
+# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1)
-#Volcano plot (left panel)
+# (List of predicted miR-342-3p target genes from TargetScan)
-data <- miR_ctrl
+# - Figure 3C & 3D: "edgeR_results.xlsx" (Sheet "miR342-control")
-ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
+# (Same as input for Figure 3A & 3B, used for GSEA)
-  geom_point() + 
+# - Figure 3C & 3D: "miDB_sig5.MLS.rds" (RDS file containing gene sets for gene set enrichment analysis)
-  theme_minimal() +
+#
-  geom_text_repel() +
+################################################################################
-  scale_color_manual(values=c("blue", "grey", "red")) +
-  geom_vline(xintercept=c(-1, 1), col="black") +
-  geom_hline(yintercept=-log10(0.05), col="black") +
-  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
+#### Figure 3A&B ####
-  scale_y_continuous(limits = c(0,20)) +
-  xlab(bquote(Log[2](FC))) +
+#Import df
-  ylab(bquote(-Log[10](PVal)))
+miR_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
+sponge_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "spongeBT-control")
-data <- sponge_ctrl
-ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
+#Add DEG color and label for volcano plot
-  geom_point() + 
+miR_ctrl$DEG <- "NO"
-  theme_minimal() +
+miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
-  geom_text_repel() +
+miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
-  scale_color_manual(values=c("blue", "grey", "red")) +
+miR_ctrl$DEG_label <- NA
-  geom_vline(xintercept=c(-1, 1), col="black") +
+miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"]
-  geom_hline(yintercept=-log10(0.05), col="black") +
-  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
+sponge_ctrl$DEG <- "NO"
-  scale_y_continuous(limits = c(0,20)) +
+sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP"
-  xlab(bquote(Log[2](FC))) +
+sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
-  ylab(bquote(-Log[10](PVal)))
+sponge_ctrl$DEG_label <- NA
+sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]
-#Import miR-342-3p target list (from TargetScan)
-setwd(wdir_CB)
+#Volcano plot (left panel)
-miR342_targets <-  read_excel("TargetScan8.0__miR-342-3p.predicted_targets.xlsx")
+data <- miR_ctrl
-miR342_targets <-  filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))
+ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
+  geom_point() + 
-#ecdf plot (right panel)
+  theme_minimal() +
-data <- miR_ctrl
+  geom_text_repel() +
-test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
+  scale_color_manual(values=c("blue", "grey", "red")) +
-plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
+  geom_vline(xintercept=c(-1, 1), col="black") +
-     ylab = "Fraction of DEG",
+  geom_hline(yintercept=-log10(0.05), col="black") +
-     xlab = bquote(Log[2](FC)),
+  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
-     xlim=c(-0.5,0.5),
+  scale_y_continuous(limits = c(0,20)) +
-     ylim=c(0,1),
+  xlab(bquote(Log[2](FC))) +
-     main="ECDF of gene expression: miR vs mut") +
+  ylab(bquote(-Log[10](PVal)))
-  lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
-  abline(v=0, col="black") + 
+data <- sponge_ctrl
-  abline(h=0.5, col="black") +
+ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
-  legend("bottomright", c("All genes","miR-342-3p targets"),
+  geom_point() + 
-         col = c("black","blue"), lwd=2, cex = 0.7)
+  theme_minimal() +
-ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
+  geom_text_repel() +
+  scale_color_manual(values=c("blue", "grey", "red")) +
-data <- sponge_ctrl
+  geom_vline(xintercept=c(-1, 1), col="black") +
-test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
+  geom_hline(yintercept=-log10(0.05), col="black") +
-plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
+  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
-     ylab = "Fraction of DEG",
+  scale_y_continuous(limits = c(0,20)) +
-     xlab = bquote(Log[2](FC)),
+  xlab(bquote(Log[2](FC))) +
-     xlim=c(-0.3,0.3),
+  ylab(bquote(-Log[10](PVal)))
-     ylim=c(0,1),
-     main="ECDF of gene expression: sponge vs scr") +
+# Import miR-342-3p target list (from TargetScan)
-  lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
+miR342_targets <-  read_excel(paste0(input_dir, "/TargetScan8.0__miR-342-3p.predicted_targets.xlsx"))
-  abline(v=0, col="black") + 
+miR342_targets <-  filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))
-  abline(h=0.5, col="black") +
-  legend("bottomright", c("All genes","miR-342-3p targets"),
+#ecdf plot (right panel)
-         col = c("black","red"), lwd=2, cex = 0.7)
+data <- miR_ctrl
-ks.test(test$logFC, data$logFC, alternative = "l") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
+test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
+plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
-#### Figure 3C&D ####
+     ylab = "Fraction of DEG",
+     xlab = bquote(Log[2](FC)),
-#Upload df
+     xlim=c(-0.5,0.5),
-setwd(wdir)
+     ylim=c(0,1),
-df1<- read_excel("EdgeR_results.xlsx",, sheet = "miR342-control")
+     main="ECDF of gene expression: miR vs mut") +
-setwd(wdir_CB)
+  lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
-miDB_sig5 <- readRDS("miDB_sig5.MLS.rds") #GO terms db
+  abline(v=0, col="black") + 
+  abline(h=0.5, col="black") +
-#Rename pathways of interest in miDB_sig5
+  legend("bottomright", c("All genes","miR-342-3p targets"),
-names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
+         col = c("black","blue"), lwd=2, cex = 0.7)
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
+ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process"
+data <- sponge_ctrl
-names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding"
+test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway"
+plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression"
+     ylab = "Fraction of DEG",
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process"
+     xlab = bquote(Log[2](FC)),
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa"
+     xlim=c(-0.3,0.3),
-names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint"
+     ylim=c(0,1),
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon"
+     main="ECDF of gene expression: sponge vs scr") +
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway"
+  lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1"
+  abline(v=0, col="black") + 
-names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes"
+  abline(h=0.5, col="black") +
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway"
+  legend("bottomright", c("All genes","miR-342-3p targets"),
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation"
+         col = c("black","red"), lwd=2, cex = 0.7)
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production"
+ks.test(test$logFC, data$logFC, alternative = "l") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation"
-names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production"
+#### Figure 3C&D ####
-names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis"
-names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity"
+# Load df
-names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes"
+# Same workbook as Figure 3A&B; the file name must match the spelling used there
+# ("edgeR_results.xlsx") because file names are case-sensitive on Linux file systems.
+df1 <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
+miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) #GO terms db
-#Filter low expression, NA and order by FC
-df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1))
+#Rename pathways of interest in miDB_sig5
-df1<-df1[order(df1$logFC),]
+names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
-df2<-df1$logFC
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
-names(df2)<-df1$...1
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process"
-#Run GSEA with fgsea and filter by PVal
+names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding"
-sig <- miDB_sig5
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway"
-test1<-fgsea(sig, df2,minSize  = 7,maxSize  =500, nproc=1)
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression"
-PvalResult<-filter(test1, test1$padj<= 0.05)
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process"
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa"
-#DotPlot pathways of interest (Fig.3C)
+names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint"
-pathways <- c("Oxydative phosphorylation",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon"
-              "Regulation of cholesterol metabolic process",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway"
-              "Response to IL12",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1"
-              "Regulation of lipid biosynthetic process",
+names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes"
-              "MHC-I protein binding",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway"
-              "TNFa mediated signalling pathway",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation"
-              "Negative regulation of cell cycle progression",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production"
-              "Fatty acyl-CoA biosynthetic process",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation"
-              "Response to TNFa",
+names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production"
-              "G2M checkpoint",
+names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis"
-              "Response to type I interferon",
+names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity"
-              "IL1 mediated signalling pathway",
+names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes"
-              "Response to IL1",
-              "LPS response genes",
+# Filter low expression, NA and order by FC
-              "Cytokine mediated signalling pathway",
+df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1))
-              "Myeloid cell differentiation",
+df1<-df1[order(df1$logFC),]
-              "IL10 production",
+df2<-df1$logFC
-              "Regulation of endothelial cell proliferation",
+names(df2)<-df1$...1
-              "TNFa superfamily cytokine production",
-              "Angiogenesis",
+# Run GSEA with fgsea and filter by PVal
-              "Pattern recognition receptor activity",
+sig <- miDB_sig5
-              "PGE2 response genes")
+test1 <- fgsea(sig, df2, minSize=7, maxSize=500, nproc=1)
+PvalResult <- filter(test1, test1$padj <= 0.05)
-genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
-ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + 
+# DotPlot pathways of interest (Fig.3C)
-  geom_point() +
+pathways <- c("Oxydative phosphorylation",
-  scale_size_area(limits=c(10,450), max_size = 15) +
+              "Regulation of cholesterol metabolic process",
-  scale_colour_gradient(low="red",high="blue") +
+              "Response to IL12",
-  labs(y='Pathway',x='NES')
+              "Regulation of lipid biosynthetic process",
+              "MHC-I protein binding",
-#Run GSEA with ClusterProfiler
+              "TNFa mediated signalling pathway",
-genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)),
+              "Negative regulation of cell cycle progression",
-                       gene = unlist(miDB_sig5))
+              "Fatty acyl-CoA biosynthetic process",
-df2 = sort(df2, decreasing = TRUE)
+              "Response to TNFa",
-test2 <- GSEA(df2,TERM2GENE = genelist)
+              "G2M checkpoint",
+              "Response to type I interferon",
-#Enrichment map (Fig.3D)
+              "IL1 mediated signalling pathway",
-test3 <- filter(test2, ID %in% pathways)
+              "Response to IL1",
-test3 <- pairwise_termsim(test3)
+              "LPS response genes",
-emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T)+
+              "Cytokine mediated signalling pathway",
-  scale_fill_gradient2(name=bquote(NES),low="blue", high="red") 
+              "Myeloid cell differentiation",
-#N.B. Produces slightly different plot every time, 
+              "IL10 production",
-#but connection between pathways stays the same
+              "Regulation of endothelial cell proliferation",
\ No newline at end of file
+              "TNFa superfamily cytokine production",
+              "Angiogenesis",
+              "Pattern recognition receptor activity",
+              "PGE2 response genes")
+# Keep only the significant fgsea results for the pathways of interest.
+genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
+ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + 
+  geom_point() +
+  scale_size_area(limits=c(10,450), max_size = 15) +
+  scale_colour_gradient(low="red",high="blue") +
+  labs(y='Pathway', x='NES')
+#Run GSEA with ClusterProfiler
+# TERM2GENE table: one row per (gene set, gene) pair.
+genelist <- data.frame(term = rep(names(miDB_sig5), lengths(miDB_sig5)),
+                       gene = unlist(miDB_sig5))
+df2 <- sort(df2, decreasing = TRUE) # GSEA() takes the ranked gene list in decreasing order
+test2 <- GSEA(df2,TERM2GENE = genelist)
+#Enrichment map (Fig.3D)
+test3 <- filter(test2, ID %in% pathways)
+test3 <- pairwise_termsim(test3)
+emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =TRUE) +
+  scale_fill_gradient2(name=bquote(NES),low="blue", high="red") 
+#N.B. Produces slightly different plot every time, 
+#but connection between pathways stays the same
\ No newline at end of file
--- a/CB2025_figure_5_scRNAseq.R
+++ b/CB2025_figure_5_scRNAseq.R
@@ -29,7 +29,28 @@ dir.create(plot_dir, showWarnings=F, recursive=T)
 sig <- readRDS("/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds")
-###############################################################################
+################################################################################
+# R script for single-cell RNA-seq data analysis and figure generation.
+#
+# This script performs cell annotation, visualization, differential gene expression
+# analysis, and gene set enrichment analysis (GSEA) on single-cell RNA-seq data.
+#
+# Figures generated/data produced:
+#   - Figure 5A: UMAP visualization of cell clusters with final annotation
+#   - Figure 5B: UMAP plot with overlayed density of mOrange+ cells
+#   - Figure 5C: Dotplot of Slc7a11 gene expression across cell types and groups
+#   - Figure 5D: Barplot of GSEA results for selected gene sets
+#   - Figure 5E: Heatmap of cytokine signature GSEA results
+#   - Supplementary Figure 5A: Dotplot of marker gene expression across cell types
+#   - Supplementary Figure 5B: CSV tables for cell distribution per cluster and sample
+#   - Supplementary Figure 5C: CSV tables for mOrange+ cell distribution per cluster and sample
+#
+# Input data files:
+#   - RDS object: "CB1_CB3_CB4_final.rds" (Seurat object containing scRNA-seq data)
+#   - RDS object: "miDB_sig5.MLS.rds" (GO terms database for GSEA)
+#
+################################################################################
 set.seed(42)

--- a/TCGA_analysis.R
+++ b/TCGA_analysis.R
@@ -7,44 +7,73 @@ library(readxl)
 library(survminer)
 library(gridExtra)
 library(ggplot2)
 ##Load dataset
-#username <- "C:/Users/notaro.marco/"
+wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts"
-username <- "/Users/bresesti.chiara/"
-wdir<- paste0(username, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al Cell Reports/TCGA_analysis")
 setwd(wdir)
-metaD<-data.frame(fread("TCGA_phenotype_denseDataOnlyDownload.tsv.gz", sep='\t',colClasses=c("character"),data.table=FALSE))
-Surv01<-data.frame(fread("Survival_SupplementalTable_S1_20171025_xena_sp", sep='\t',colClasses=c("character"),data.table=FALSE))
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
+output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
+################################################################################
+# R script for TCGA survival analysis of miR-342-3p expression.
+#
+# This script analyzes TCGA pancancer miRNA expression and survival data to assess
+# the prognostic value of hsa-miR-342-3p across different tumor types.
+# It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves
+# to visualize the association between miR-342-3p expression levels and patient survival.
+#
+# Figures generated:
+#   - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and
+#     significance of miR-342-3p expression on overall survival across various tumor types.
+#   - Survival_***_342 (Survival curves - multiple tumors; NOTE: the pdf() call for this file is currently commented out in the script): Set of Kaplan-Meier survival plots
+#     for tumor types showing significant hazard ratios, illustrating survival differences
+#     between patients with high and low miR-342-3p expression.
+#   - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot
+#     specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival
+#     in this patient subgroup.
+#
+# Input data files:
+#   - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata),
+#     downloaded from TCGA or Xena, contains clinical and demographic information.
+#   - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status.
+#   - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz:
+#     TCGA pancancer miRNA expression data (batch-adjusted values per the file name; loaded into `FPKM01`, units not verified here), downloaded from Xena,
+#     contains miRNA expression levels across different tumor samples.
+#
+# Note:
+#   - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves)
+#     and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script.
+################################################################################
+metaD<-data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE))
+Surv01<-data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE))
 # FPKM01<-fread("tcga_RSEM_Hugo_norm_count")
-FPKM01<-fread("pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz")
+FPKM01<-fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"))
 FPKM01<-as.data.frame(FPKM01)
-# Annot<-data.frame(fread("probeMap_gencode.v23.annotation.gene.probemap", sep='\t',colClasses=c("character"),data.table=FALSE))
+# Annot<-data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE))
 allGenes <- FPKM01[,1]
 #######################################################################
 #####PLOT HR Score of signature for multiple tumor types###############
 #####################################################################
 humanGenes<-list(c("hsa-miR-342-3p"))
 ####create xlsx file with genes, patients and tumor types
 Pan.data3 <- metaD[,]
 Pan.data4 <- merge(Surv01,Pan.data3,by=1)
 Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,]
 rownames(Genes_selected)<-Genes_selected[,1]
 Genes_selected<-Genes_selected[,-1]
 Genes_selected <- data.frame(t(Genes_selected))
 Genes_selected$sample<-rownames(Genes_selected)
 Pan.data5 <- merge(Genes_selected,Pan.data4,by.y="sample")
 colnames(Pan.data5)
-#setwd(wdir)
+write.xlsx(Pan.data5, paste0(output_dir, "/miR342_patients_survival.xlsx"))  # merged expression + survival + phenotype table
-#write.xlsx(Pan.data5, "miR342_patients_survival.xlsx")
 #############Regression with defined k2
@@ -63,7 +92,7 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){
    df2 <- rbind(df2,df1)
    df2<-df2[(order(df2$p)),]
  }else{}}
-#write.xlsx(df2, "miR342_HR_by_tumortype.xlsx")
+write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx"))
 ###Filters selected tumors
@@ -75,21 +104,13 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label =
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = pmax(HR-SE, 0), ymax = HR + SE), width = 0.2) +
  scale_color_manual(name = "Color", values = c("red", "blue")) +
-  labs(title = "HR by tumor type",
+  labs(title = "HR by tumor type", x = "Tumor Type", y = "HR") +
-       x = "Tumor Type",
-       y = "HR") +
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 1), fill = "lightblue", alpha = 0.01) +
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = 1, ymax = Inf), fill = "pink", alpha = 0.01)+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust= 1, color = "Black")) +
-  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") 
+  geom_hline(yintercept = 1, linetype = "dashed", color = "gray")
+ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4)
-# pdf("HR_alltumors_342.pdf",width=9,height = 4)
-a
-# dev.off()
 ####Plot survival of chosen tumors
@@ -98,7 +119,7 @@ df3<-df3[order(df3$HR),]
 df3<-df3[df3$p<0.05,]
-# pdf("Survival_***_342",width=10,height = 6)
+#pdf(paste0(output_dir, "/Survival_***_342"), width=10,height = 6)
 par(mfrow = c(2,4))
 survival<-list()
 for (i in df3$Tumor){
@@ -141,7 +162,7 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001",  as.numeric(round(p_val
 par(las = 0)
-pdf("Survival_metastatic_342.pdf",width=5,height =5)
+pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5,height =5)
 plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n",
     main = "Metastatic tumors") 
 #Add legend