Commit cad0b7de authored by Marco Monti's avatar Marco Monti
Browse files

I updated the scripts

parent a78a5561
library("readxl")
library("dplyr")
#####Directories#####
us <- "/Users/Squadrito/"
us <-"C:/Users/bresesti.chiara/"
wdir1509<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
wdir1510<-paste0(us,"/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
#### Figure 1B ####
setwd(wdir1509)
df1 <- as.data.frame(read_excel("QIAseqUltraplexRNA_181342.xlsx",sheet=3,col_names = T,skip = 1))#sheet: umis.genes.polyA-mouse
df1 <-df1[,-c(1,3:6)] #keep only gene name and UMI counts
df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) #UMI normalized by CPM
gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax",
"Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1",
"Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1",
"Tek", "Lyve1", "Stab2")
df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector))
rownames(df2) <- df2[,1]
df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
setwd(fdir)
openxlsx::write.xlsx(df2.scaled,"Figure_1B_table.xlsx", rowNames=T)
#the df was exported and used to create a heatmap using Graphpad
#### Figure 1C ####
# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator
# and merged with the plot generated by the code below
setwd(wdir1510)
df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA
df1 <-df1[,1:7]#UMI
rownames(df1)<-df1$miRNA
df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts
df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization
colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns
upper.panel<-function(x, y, ...){
points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey")
above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5
points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19)
below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5
points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19)
} #function for MA plot
pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel,
ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) #pairwise plot
#### Figure 1D ####
setwd(wdir1510)
df1 <- as.data.frame(read_excel("173308.all_samples.summary.xlsx",sheet=2,col_names = T))#sheet: miRNA_piRNA
df1 <- df1[-which(grepl("piR",df1[,1])),1:7]#UMI and piRNA removing
df1[,1] <- gsub("/.*","",df1[,1]) #leave only first miRNA for ambiguous entries
df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000})#UMI normalized by CPM
df.families <- as.data.frame(read_excel("Analysis CB/miRNA Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase
df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries
df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1])
df2 <- merge(df.families,df1,by=1)
df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family
gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1",
"miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p",
"miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p",
"miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p",
"miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p")
df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector))
rownames(df3) <- df3[,1]
df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
setwd(fdir)
openxlsx::write.xlsx(df2.scaled,"Figure_1D_table.xlsx", rowNames=T)
#the df was exported and used to create a heatmap using Graphpad
library("readxl")
library("dplyr")
#####Directories#####
# us <-"C:/Users/bresesti.chiara/"
# wdir1509<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
# wdir1510<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
# fdir <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
#
# input_d <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
# wdir1510 <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
# output_d <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
################################################################################
# R script to generate data and figures for manuscript figures 1B, 1C, and 1D.
#
# This script reads RNA-seq and miRNA-seq data from Excel files, performs
# normalization and data transformation, and generates output tables and plots.
#
# Figures generated/data produced:
# - Figure 1B: Heatmap data for selected marker genes (Excel table)
# - Figure 1C: Pairwise MA plots for miRNA data
# - Figure 1D: Heatmap data for selected miRNA families (Excel table)
#
# Input data files:
# - Figure 1B: "QIAseqUltraplexRNA_181342.xlsx" (Sheet 3: umis.genes.polyA-mouse)
# - Figure 1C & 1D: "173308.all_samples.summary.xlsx" (Sheet 2: miRNA_piRNA)
# - Figure 1D: "miRNA_Family.xlsx" (Sheet 1)
#
################################################################################
#### Figure 1B ####
df1 <- as.data.frame(read_excel(paste0(input_dir, "/QIAseqUltraplexRNA_181342.xlsx"), sheet=3, col_names = T, skip=1)) #sheet: umis.genes.polyA-mouse
df1 <-df1[,-c(1,3:6)] # keep only gene name and UMI counts
df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) # UMI normalized by CPM
gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax",
"Ccr2", "Itgam", "Mgl2", "Cd68", "Vcam1", "Csf1r", "Adgre1",
"Siglec1", "Hmox1", "Timd4", "Vsig4", "Clec4f", "Marco", "Pecam1",
"Tek", "Lyve1", "Stab2")
df2 <- df1[df1$gene%in%gene_vector,] %>% arrange(factor(gene, levels=gene_vector))
rownames(df2) <- df2[,1]
df2.scaled <- as.data.frame(t(scale(t(df2[-1])))) # Zscore normalization. Scaled only works on columns, so need to transform
openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1B_table.xlsx"), rowNames=T)
# the df was exported and used to create a heatmap using Graphpad
#### Figure 1C ####
# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator
# and merged with the plot generated by the code below
df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA
df1 <-df1[,1:7] #UMI
rownames(df1)<-df1$miRNA
df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts
df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization
colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns
upper.panel<-function(x, y, ...){
points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey")
above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5
points(((x+y)/2)[above],(x-y)[above], col="red", cex=0.6, pch=19)
below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5
points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19)
} # function for MA plot
pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel,
ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) # pairwise plot
#### Figure 1D ####
df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA
df1 <- df1[-which(grepl("piR",df1[,1])),1:7] # UMI and piRNA removing
df1[,1] <- gsub("/.*","",df1[,1]) # leave only first miRNA for ambiguous entries
df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000}) #UMI normalized by CPM
df.families <- as.data.frame(read_excel("Analysis CB/miRNA_Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase
df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries
df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1])
df2 <- merge(df.families,df1,by=1)
df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family
gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1",
"miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p",
"miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p",
"miR-223-3p","miR-342-3p","miR-22-3p","miR-192-5p/215-5p",
"miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p")
df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector))
rownames(df3) <- df3[,1]
df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1D_table.xlsx"), rowNames=T)
#the df was exported and used to create a heatmap using Graphpad
library(readxl)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(fgsea)
library(clusterProfiler)
library(enrichplot)
##### Directories #####
#us <- "/Users/Squadrito/"
us <-"C:/Users/bresesti.chiara/"
wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
#### Figure 3A&B ####
#Import df
setwd(wdir)
miR_ctrl <- read_excel("edgeR_results.xlsx", sheet = "miR342-control")
sponge_ctrl <- read_excel("edgeR_results.xlsx", sheet = "spongeBT-control")
#Add DEG color and label for volcano plot
miR_ctrl$DEG <- "NO"
miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
miR_ctrl$DEG_label <- NA
miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"]
sponge_ctrl$DEG <- "NO"
sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP"
sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
sponge_ctrl$DEG_label <- NA
sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]
#Volcano plot (left panel)
data <- miR_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "grey", "red")) +
geom_vline(xintercept=c(-1, 1), col="black") +
geom_hline(yintercept=-log10(0.05), col="black") +
scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
scale_y_continuous(limits = c(0,20)) +
xlab(bquote(Log[2](FC))) +
ylab(bquote(-Log[10](PVal)))
data <- sponge_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "grey", "red")) +
geom_vline(xintercept=c(-1, 1), col="black") +
geom_hline(yintercept=-log10(0.05), col="black") +
scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
scale_y_continuous(limits = c(0,20)) +
xlab(bquote(Log[2](FC))) +
ylab(bquote(-Log[10](PVal)))
#Import miR-342-3p target list (from TargetScan)
setwd(wdir_CB)
miR342_targets <- read_excel("TargetScan8.0__miR-342-3p.predicted_targets.xlsx")
miR342_targets <- filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))
#ecdf plot (right panel)
data <- miR_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
ylab = "Fraction of DEG",
xlab = bquote(Log[2](FC)),
xlim=c(-0.5,0.5),
ylim=c(0,1),
main="ECDF of gene expression: miR vs mut") +
lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
abline(v=0, col="black") +
abline(h=0.5, col="black") +
legend("bottomright", c("All genes","miR-342-3p targets"),
col = c("black","blue"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
data <- sponge_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
ylab = "Fraction of DEG",
xlab = bquote(Log[2](FC)),
xlim=c(-0.3,0.3),
ylim=c(0,1),
main="ECDF of gene expression: sponge vs scr") +
lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
abline(v=0, col="black") +
abline(h=0.5, col="black") +
legend("bottomright", c("All genes","miR-342-3p targets"),
col = c("black","red"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "l") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
#### Figure 3C&D ####
#Upload df
setwd(wdir)
df1<- read_excel("EdgeR_results.xlsx",, sheet = "miR342-control")
setwd(wdir_CB)
miDB_sig5 <- readRDS("miDB_sig5.MLS.rds") #GO terms db
#Rename pathways of interest in miDB_sig5
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1"
names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity"
names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes"
#Filter low expression, NA and order by FC
df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1))
df1<-df1[order(df1$logFC),]
df2<-df1$logFC
names(df2)<-df1$...1
#Run GSEA with fgsea and filter by PVal
sig <- miDB_sig5
test1<-fgsea(sig, df2,minSize = 7,maxSize =500, nproc=1)
PvalResult<-filter(test1, test1$padj<= 0.05)
#DotPlot pathways of interest (Fig.3C)
pathways <- c("Oxydative phosphorylation",
"Regulation of cholesterol metabolic process",
"Response to IL12",
"Regulation of lipid biosynthetic process",
"MHC-I protein binding",
"TNFa mediated signalling pathway",
"Negative regulation of cell cycle progression",
"Fatty acyl-CoA biosynthetic process",
"Response to TNFa",
"G2M checkpoint",
"Response to type I interferon",
"IL1 mediated signalling pathway",
"Response to IL1",
"LPS response genes",
"Cytokine mediated signalling pathway",
"Myeloid cell differentiation",
"IL10 production",
"Regulation of endothelial cell proliferation",
"TNFa superfamily cytokine production",
"Angiogenesis",
"Pattern recognition receptor activity",
"PGE2 response genes")
genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) +
geom_point() +
scale_size_area(limits=c(10,450), max_size = 15) +
scale_colour_gradient(low="red",high="blue") +
labs(y='Pathway',x='NES')
#Run GSEA with ClusterProfiler
genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)),
gene = unlist(miDB_sig5))
df2 = sort(df2, decreasing = TRUE)
test2 <- GSEA(df2,TERM2GENE = genelist)
#Enrichment map (Fig.3D)
test3 <- filter(test2, ID %in% pathways)
test3 <- pairwise_termsim(test3)
emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T)+
scale_fill_gradient2(name=bquote(NES),low="blue", high="red")
#N.B. Produces slightly different plot every time,
#but connection between pathways stays the same
\ No newline at end of file
library(readxl)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(fgsea)
library(clusterProfiler)
library(enrichplot)
##### Directories #####
# us <-"C:/Users/bresesti.chiara/"
# wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
# wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
# fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
################################################################################
#
# R script to generate data and figures for manuscript figures 3A, 3B, 3C, and 3D.
# This script performs differential gene expression analysis and gene set enrichment
# analysis to generate volcano plots, empirical cumulative distribution function (ECDF) plots,
# dot plots, and enrichment maps.
#
# Figures generated:
# - Figure 3A & 3B: Volcano plots of differentially expressed genes in miR-342-3p
# overexpression and sponge experiments, and ECDF plots comparing
# logFC distributions of all genes vs. miR-342-3p target genes.
# - Figure 3C: Dot plot visualizing enriched pathways from gene set enrichment analysis (GSEA).
# - Figure 3D: Enrichment map visualizing relationships between enriched pathways.
#
# Input data files:
# - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control")
# (Output from differential gene expression analysis, likely using edgeR)
# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1)
# (List of predicted miR-342-3p target genes from TargetScan)
# - Figure 3C & 3D: "EdgeR_results.xlsx" (Sheet "miR342-control")
# (Same as input for Figure 3A & 3B, used for GSEA)
# - Figure 3C & 3D: "miDB_sig5.MLS.rds" (RDS file containing gene sets for gene set enrichment analysis)
#
################################################################################
#### Figure 3A&B ####
#Import df
miR_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
sponge_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "spongeBT-control")
#Add DEG color and label for volcano plot
miR_ctrl$DEG <- "NO"
miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
miR_ctrl$DEG_label <- NA
miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"]
sponge_ctrl$DEG <- "NO"
sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP"
sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
sponge_ctrl$DEG_label <- NA
sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]
#Volcano plot (left panel)
data <- miR_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "grey", "red")) +
geom_vline(xintercept=c(-1, 1), col="black") +
geom_hline(yintercept=-log10(0.05), col="black") +
scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
scale_y_continuous(limits = c(0,20)) +
xlab(bquote(Log[2](FC))) +
ylab(bquote(-Log[10](PVal)))
data <- sponge_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "grey", "red")) +
geom_vline(xintercept=c(-1, 1), col="black") +
geom_hline(yintercept=-log10(0.05), col="black") +
scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
scale_y_continuous(limits = c(0,20)) +
xlab(bquote(Log[2](FC))) +
ylab(bquote(-Log[10](PVal)))
# Import miR-342-3p target list (from TargetScan)
miR342_targets <- read_excel(paste0(input_dir, "/TargetScan8.0__miR-342-3p.predicted_targets.xlsx"))
miR342_targets <- filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))
#ecdf plot (right panel)
data <- miR_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
ylab = "Fraction of DEG",
xlab = bquote(Log[2](FC)),
xlim=c(-0.5,0.5),
ylim=c(0,1),
main="ECDF of gene expression: miR vs mut") +
lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
abline(v=0, col="black") +
abline(h=0.5, col="black") +
legend("bottomright", c("All genes","miR-342-3p targets"),
col = c("black","blue"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
data <- sponge_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
ylab = "Fraction of DEG",
xlab = bquote(Log[2](FC)),
xlim=c(-0.3,0.3),
ylim=c(0,1),
main="ECDF of gene expression: sponge vs scr") +
lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
abline(v=0, col="black") +
abline(h=0.5, col="black") +
legend("bottomright", c("All genes","miR-342-3p targets"),
col = c("black","red"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "l") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
#### Figure 3C&D ####
# Load df
df1<- read_excel(paste0(input_dir, "/EdgeR_results.xlsx"), sheet = "miR342-control")
miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) #GO terms db
#Rename pathways of interest in miDB_sig5
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1"
names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity"
names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes"
# Filter low expression, NA and order by FC
df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1))
df1<-df1[order(df1$logFC),]
df2<-df1$logFC
names(df2)<-df1$...1
# Run GSEA with fgsea and filter by PVal
sig <- miDB_sig5
test1 <- fgsea(sig, df2, minSize=7, maxSize=500, nproc=1)
PvalResult <- filter(test1, test1$padj <= 0.05)
# DotPlot pathways of interest (Fig.3C)
pathways <- c("Oxydative phosphorylation",
"Regulation of cholesterol metabolic process",
"Response to IL12",
"Regulation of lipid biosynthetic process",
"MHC-I protein binding",
"TNFa mediated signalling pathway",
"Negative regulation of cell cycle progression",
"Fatty acyl-CoA biosynthetic process",
"Response to TNFa",
"G2M checkpoint",
"Response to type I interferon",
"IL1 mediated signalling pathway",
"Response to IL1",
"LPS response genes",
"Cytokine mediated signalling pathway",
"Myeloid cell differentiation",
"IL10 production",
"Regulation of endothelial cell proliferation",
"TNFa superfamily cytokine production",
"Angiogenesis",
"Pattern recognition receptor activity",
"PGE2 response genes")
genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) +
geom_point() +
scale_size_area(limits=c(10,450), max_size = 15) +
scale_colour_gradient(low="red",high="blue") +
labs(y='Pathway', x='NES')
#Run GSEA with ClusterProfiler
genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)),
gene = unlist(miDB_sig5))
df2 = sort(df2, decreasing = TRUE)
test2 <- GSEA(df2,TERM2GENE = genelist)
#Enrichment map (Fig.3D)
test3 <- filter(test2, ID %in% pathways)
test3 <- pairwise_termsim(test3)
emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T) +
scale_fill_gradient2(name=bquote(NES),low="blue", high="red")
\ No newline at end of file
......@@ -29,7 +29,28 @@ dir.create(plot_dir, showWarnings=F, recursive=T)
sig <- readRDS("/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds")
###############################################################################
################################################################################
# R script for single-cell RNA-seq data analysis and figure generation.
#
# This script performs cell annotation, visualization, differential gene expression
# analysis, and gene set enrichment analysis (GSEA) on single-cell RNA-seq data.
#
# Figures generated/data produced:
# - Figure 5A: UMAP visualization of cell clusters with final annotation
# - Figure 5B: UMAP plot with overlayed density of mOrange+ cells
# - Figure 5C: Dotplot of Slc7a11 gene expression across cell types and groups
# - Figure 5D: Barplot of GSEA results for selected gene sets
# - Figure 5E: Heatmap of cytokine signature GSEA results
# - Supplementary Figure 5A: Dotplot of marker gene expression across cell types
# - Supplementary Figure 5B: CSV tables for cell distribution per cluster and sample
# - Supplementary Figure 5C: CSV tables for mOrange+ cell distribution per cluster and sample
#
# Input data files:
# - RDS object: "CB1_CB3_CB4_final.rds" (Seurat object containing scRNA-seq data)
# - RDS object: "miDB_sig5.MLS.rds" (GO terms database for GSEA)
#
################################################################################
set.seed(42)
......
......@@ -7,44 +7,73 @@ library(readxl)
library(survminer)
library(gridExtra)
library(ggplot2)
##Load dataset
#username <- "C:/Users/notaro.marco/"
username <- "/Users/bresesti.chiara/"
wdir<- paste0(username, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al Cell Reports/TCGA_analysis")
wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts"
setwd(wdir)
metaD<-data.frame(fread("TCGA_phenotype_denseDataOnlyDownload.tsv.gz", sep='\t',colClasses=c("character"),data.table=FALSE))
Surv01<-data.frame(fread("Survival_SupplementalTable_S1_20171025_xena_sp", sep='\t',colClasses=c("character"),data.table=FALSE))
input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
################################################################################
# R script for TCGA survival analysis of miR-342-3p expression.
#
# This script analyzes TCGA pancancer miRNA expression and survival data to assess
# the prognostic value of hsa-miR-342-3p across different tumor types.
# It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves
# to visualize the association between miR-342-3p expression levels and patient survival.
#
# Figures generated:
# - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and
# significance of miR-342-3p expression on overall survival across various tumor types.
# - Survival_***_342 (Survival curves - multiple tumors): Set of Kaplan-Meier survival plots
# for tumor types showing significant hazard ratios, illustrating survival differences
# between patients with high and low miR-342-3p expression.
# - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot
# specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival
# in this patient subgroup.
#
# Input data files:
# - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata),
# downloaded from TCGA or Xena, contains clinical and demographic information.
# - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status.
# - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz:
# TCGA pancancer miRNA expression data (FPKM values), downloaded from Xena,
# contains miRNA expression levels across different tumor samples.
#
# Note:
# - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves)
# and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script.
################################################################################
metaD<-data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE))
Surv01<-data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE))
# FPKM01<-fread("tcga_RSEM_Hugo_norm_count")
FPKM01<-fread("pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz")
FPKM01<-fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"))
FPKM01<-as.data.frame(FPKM01)
# Annot<-data.frame(fread("probeMap_gencode.v23.annotation.gene.probemap", sep='\t',colClasses=c("character"),data.table=FALSE))
# Annot<-data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE))
allGenes <- FPKM01[,1]
#######################################################################
#####PLOT HR Score of signature for multiple tumor types###############
#####################################################################
humanGenes<-list(c("hsa-miR-342-3p"))
####create xlsx file with genes, patients and tumor types
Pan.data3 <- metaD[,]
Pan.data4 <- merge(Surv01,Pan.data3,by=1)
Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,]
rownames(Genes_selected)<-Genes_selected[,1]
Genes_selected<-Genes_selected[,-1]
Genes_selected <- data.frame(t(Genes_selected))
Genes_selected$sample<-rownames(Genes_selected)
Pan.data5 <- merge(Genes_selected,Pan.data4,by.y="sample")
colnames(Pan.data5)
#setwd(wdir)
#write.xlsx(Pan.data5, "miR342_patients_survival.xlsx")
write.xlsx(Pan.data5, paste0(ourdir, "/miR342_patients_survival.xlsx"))
#############Regression with defined k2
......@@ -63,7 +92,7 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){
df2 <- rbind(df2,df1)
df2<-df2[(order(df2$p)),]
}else{}}
#write.xlsx(df2, "miR342_HR_by_tumortype.xlsx")
write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx"))
###Filters selected tumors
......@@ -75,21 +104,13 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label =
geom_point(size = 4) +
geom_errorbar(aes(ymin = pmax(HR-SE, 0), ymax = HR + SE), width = 0.2) +
scale_color_manual(name = "Color", values = c("red", "blue")) +
labs(title = "HR by tumor type",
x = "Tumor Type",
y = "HR") +
labs(title = "HR by tumor type", x = "Tumor Type", y = "HR") +
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 1), fill = "lightblue", alpha = 0.01) +
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = 1, ymax = Inf), fill = "pink", alpha = 0.01)+
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust= 1, color = "Black")) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray")
# pdf("HR_alltumors_342.pdf",width=9,height = 4)
a
# dev.off()
geom_hline(yintercept = 1, linetype = "dashed", color = "gray")
ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4)
####Plot survival of chosen tumors
......@@ -98,7 +119,7 @@ df3<-df3[order(df3$HR),]
df3<-df3[df3$p<0.05,]
# pdf("Survival_***_342",width=10,height = 6)
#pdf(paste0(output_dir, "/Survival_***_342"), width=10,height = 6)
par(mfrow = c(2,4))
survival<-list()
for (i in df3$Tumor){
......@@ -141,7 +162,7 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_val
par(las = 0)
pdf("Survival_metastatic_342.pdf",width=5,height =5)
pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5,height =5)
plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n",
main = "Metastatic tumors")
#Add legend
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment