Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
custom
Bresesti Cell Reports 2025
Commits
cad0b7de
Commit
cad0b7de
authored
Mar 25, 2025
by
Marco Monti
Browse files
I updated the scripts
parent
a78a5561
Changes
4
Hide whitespace changes
Inline
Side-by-side
CB2025_figure_1_RNAseq.R
View file @
cad0b7de
library
(
"readxl"
)
library
(
"readxl"
)
library
(
"dplyr"
)
library
(
"dplyr"
)
#####Directories#####
#####Directories#####
us
<-
"/Users/Squadrito/"
# us <-"C:/Users/bresesti.chiara/"
us
<-
"C:/Users/bresesti.chiara/"
# wdir1509<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
wdir1509
<-
paste0
(
us
,
"/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis"
)
# wdir1510<-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
wdir1510
<-
paste0
(
us
,
"/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX"
)
# fdir <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
fdir
<-
paste0
(
us
,
"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures"
)
#
# input_d <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1509_RNASeq_QIAseq_UPX/QIAseqUltraplexRNA_181342/primary_analysis")
# wdir1510 <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
#### Figure 1B ####
# output_d <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
setwd
(
wdir1509
)
df1
<-
as.data.frame
(
read_excel
(
"QIAseqUltraplexRNA_181342.xlsx"
,
sheet
=
3
,
col_names
=
T
,
skip
=
1
))
#sheet: umis.genes.polyA-mouse
input_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
df1
<-
df1
[,
-
c
(
1
,
3
:
6
)]
#keep only gene name and UMI counts
output_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
df1
[,
-1
]
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
){
x
/
sum
(
x
)
*
1000000
})
#UMI normalized by CPM
gene_vector
<-
c
(
"Cd19"
,
"Ms4a1"
,
"Fcer2a"
,
"Ighm"
,
"Cd8a"
,
"Xcr1"
,
"Itgae"
,
"Itgax"
,
################################################################################
"Ccr2"
,
"Itgam"
,
"Mgl2"
,
"Cd68"
,
"Vcam1"
,
"Csf1r"
,
"Adgre1"
,
# R script to generate data and figures for manuscript figures 1B, 1C, and 1D.
"Siglec1"
,
"Hmox1"
,
"Timd4"
,
"Vsig4"
,
"Clec4f"
,
"Marco"
,
"Pecam1"
,
#
"Tek"
,
"Lyve1"
,
"Stab2"
)
# This script reads RNA-seq and miRNA-seq data from Excel files, performs
df2
<-
df1
[
df1
$
gene
%in%
gene_vector
,]
%>%
arrange
(
factor
(
gene
,
levels
=
gene_vector
))
# normalization and data transformation, and generates output tables and plots.
rownames
(
df2
)
<-
df2
[,
1
]
#
df2.scaled
<-
as.data.frame
(
t
(
scale
(
t
(
df2
[
-1
]))))
#Zscore normalization. Scaled only works on columns, so need to transform
# Figures generated/data produced:
setwd
(
fdir
)
# - Figure 1B: Heatmap data for selected marker genes (Excel table)
openxlsx
::
write.xlsx
(
df2.scaled
,
"Figure_1B_table.xlsx"
,
rowNames
=
T
)
# - Figure 1C: Pairwise MA plots for miRNA data
#the df was exported and used to create a heatmap using Graphpad
# - Figure 1D: Heatmap data for selected miRNA families (Excel table)
#
#### Figure 1C ####
# Input data files:
# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator
# - Figure 1B: "QIAseqUltraplexRNA_181342.xlsx" (Sheet 3: umis.genes.polyA-mouse)
# and merged with the plot generated by the code below
# - Figure 1C & 1D: "173308.all_samples.summary.xlsx" (Sheet 2: miRNA_piRNA)
setwd
(
wdir1510
)
# - Figure 1D: "miRNA_Family.xlsx" (Sheet 1)
df1
<-
as.data.frame
(
read_excel
(
"173308.all_samples.summary.xlsx"
,
sheet
=
2
,
col_names
=
T
))
#sheet: miRNA_piRNA
#
df1
<-
df1
[,
1
:
7
]
#UMI
################################################################################
rownames
(
df1
)
<-
df1
$
miRNA
df2
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
)
log
(
x
))
#Log counts
df2
<-
as.data.frame
(
limma
::
normalizeCyclicLoess
(
df2
,
weights
=
NULL
,
span
=
0.7
,
iterations
=
5
,
method
=
"pairs"
))
#Cyclic loess normalization
#### Figure 1B ####
colnames
(
df2
)
<-
c
(
"B cell"
,
"RPM"
,
"cDC1"
,
"cDC2"
,
"LSEC"
,
"KC"
)
#Rename columns
df1
<-
as.data.frame
(
read_excel
(
paste0
(
input_dir
,
"/QIAseqUltraplexRNA_181342.xlsx"
),
sheet
=
3
,
col_names
=
T
,
skip
=
1
))
#sheet: umis.genes.polyA-mouse
upper.panel
<-
function
(
x
,
y
,
...
){
df1
<-
df1
[,
-
c
(
1
,
3
:
6
)]
# keep only gene name and UMI counts
points
(((
x
+
y
)
/
2
),(
x
-
y
),
cex
=
0.6
,
pch
=
19
,
col
=
"grey"
)
df1
[,
-1
]
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
){
x
/
sum
(
x
)
*
1000000
})
# UMI normalized by CPM
above
<-
(
x
-
y
-1
)
*
((
x
+
y
)
/
2-5
)
>
5
&
((
x
+
y
)
/
2
)
>
5
gene_vector
<-
c
(
"Cd19"
,
"Ms4a1"
,
"Fcer2a"
,
"Ighm"
,
"Cd8a"
,
"Xcr1"
,
"Itgae"
,
"Itgax"
,
points
(((
x
+
y
)
/
2
)[
above
],(
x
-
y
)[
above
],
col
=
"red"
,
cex
=
0.6
,
pch
=
19
)
"Ccr2"
,
"Itgam"
,
"Mgl2"
,
"Cd68"
,
"Vcam1"
,
"Csf1r"
,
"Adgre1"
,
below
<-
(
x
-
y
+1
)
*
((
x
+
y
)
/
2-5
)
<
-5
&
((
x
+
y
)
/
2
)
>
5
"Siglec1"
,
"Hmox1"
,
"Timd4"
,
"Vsig4"
,
"Clec4f"
,
"Marco"
,
"Pecam1"
,
points
(((
x
+
y
)
/
2
)[
below
],(
x
-
y
)[
below
],
col
=
"blue"
,
cex
=
0.6
,
pch
=
19
)
"Tek"
,
"Lyve1"
,
"Stab2"
)
}
#function for MA plot
df2
<-
df1
[
df1
$
gene
%in%
gene_vector
,]
%>%
arrange
(
factor
(
gene
,
levels
=
gene_vector
))
pairs
(
df2
[,
1
:
6
],
lower.panel
=
NULL
,
upper.panel
=
upper.panel
,
rownames
(
df2
)
<-
df2
[,
1
]
ylim
=
c
(
-8.5
,
8.5
),
xlim
=
c
(
2
,
14
),
cex.labels
=
2
)
#pairwise plot
df2.scaled
<-
as.data.frame
(
t
(
scale
(
t
(
df2
[
-1
]))))
# Zscore normalization. Scaled only works on columns, so need to transform
openxlsx
::
write.xlsx
(
df2.scaled
,
paste0
(
output_dir
,
"/Figure_1B_table.xlsx"
),
rowNames
=
T
)
#### Figure 1D ####
# the df was exported and used to create a heatmap using Graphpad
setwd
(
wdir1510
)
df1
<-
as.data.frame
(
read_excel
(
"173308.all_samples.summary.xlsx"
,
sheet
=
2
,
col_names
=
T
))
#sheet: miRNA_piRNA
#### Figure 1C ####
df1
<-
df1
[
-
which
(
grepl
(
"piR"
,
df1
[,
1
])),
1
:
7
]
#UMI and piRNA removing
# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator
df1
[,
1
]
<-
gsub
(
"/.*"
,
""
,
df1
[,
1
])
#leave only first miRNA for ambiguous entries
# and merged with the plot generated by the code below
df1
[,
-1
]
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
){
x
/
sum
(
x
,
na.rm
=
T
)
*
1000000
})
#UMI normalized by CPM
df1
<-
as.data.frame
(
read_excel
(
paste0
(
input_dir
,
"/173308.all_samples.summary.xlsx"
),
sheet
=
2
,
col_names
=
T
))
#sheet: miRNA_piRNA
df.families
<-
as.data.frame
(
read_excel
(
"Analysis CB/miRNA Family.xlsx"
,
sheet
=
1
,
col_names
=
T
))
#miRNA families from miRBase
df1
<-
df1
[,
1
:
7
]
#UMI
df.families
<-
df.families
[
df.families
[,
3
]
==
10090
,
c
(
4
,
1
)]
#select mouse entries
rownames
(
df1
)
<-
df1
$
miRNA
df.families
[,
1
]
<-
sub
(
pattern
=
"p.*"
,
replacement
=
"p"
,
x
=
df.families
[,
1
])
df2
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
)
log
(
x
))
#Log counts
df2
<-
merge
(
df.families
,
df1
,
by
=
1
)
df2
<-
as.data.frame
(
limma
::
normalizeCyclicLoess
(
df2
,
weights
=
NULL
,
span
=
0.7
,
iterations
=
5
,
method
=
"pairs"
))
#Cyclic loess normalization
df2
<-
aggregate
(
df2
[,
-
c
(
1
:
2
)],
by
=
df2
[
"miR family"
],
FUN
=
sum
,
na.rm
=
T
)
#sum counts by family
colnames
(
df2
)
<-
c
(
"B cell"
,
"RPM"
,
"cDC1"
,
"cDC2"
,
"LSEC"
,
"KC"
)
#Rename columns
gene_vector
<-
c
(
"miR-150-5p"
,
"miR-25-3p/32-5p/92-3p/363-3p/367-3p"
,
"miR-142-3p.1"
,
upper.panel
<-
function
(
x
,
y
,
...
){
"miR-17-5p/20-5p/93-5p/106-5p"
,
"miR-191-5p"
,
points
(((
x
+
y
)
/
2
),(
x
-
y
),
cex
=
0.6
,
pch
=
19
,
col
=
"grey"
)
"miR-15-5p/16-5p/195-5p/322-5p/497-5p"
,
"miR-26-5p"
,
"miR-138-5p"
,
above
<-
(
x
-
y
-1
)
*
((
x
+
y
)
/
2-5
)
>
5
&
((
x
+
y
)
/
2
)
>
5
"miR-223-3p"
,
"miR-342-3p"
,
"miR-22-3p"
,
"miR-192-5p/215-5p"
,
points
(((
x
+
y
)
/
2
)[
above
],(
x
-
y
)[
above
],
col
=
"red"
,
cex
=
0.6
,
pch
=
19
)
"miR-125-5p/351-5p"
,
"miR-126-3p.1"
,
"miR-199-3p"
)
below
<-
(
x
-
y
+1
)
*
((
x
+
y
)
/
2-5
)
<
-5
&
((
x
+
y
)
/
2
)
>
5
df3
<-
subset
(
df2
,
df2
$
`miR family`
%in%
gene_vector
)
%>%
arrange
(
factor
(
`miR family`
,
levels
=
gene_vector
))
points
(((
x
+
y
)
/
2
)[
below
],(
x
-
y
)[
below
],
col
=
"blue"
,
cex
=
0.6
,
pch
=
19
)
rownames
(
df3
)
<-
df3
[,
1
]
}
# function for MA plot
df3.scaled
<-
as.data.frame
(
t
(
scale
(
t
(
df3
[
-1
]))))
#Zscore normalization. Scaled only works on columns, so need to transform
pairs
(
df2
[,
1
:
6
],
lower.panel
=
NULL
,
upper.panel
=
upper.panel
,
setwd
(
fdir
)
ylim
=
c
(
-8.5
,
8.5
),
xlim
=
c
(
2
,
14
),
cex.labels
=
2
)
# pairwise plot
openxlsx
::
write.xlsx
(
df2.scaled
,
"Figure_1D_table.xlsx"
,
rowNames
=
T
)
#the df was exported and used to create a heatmap using Graphpad
#### Figure 1D ####
df1
<-
as.data.frame
(
read_excel
(
paste0
(
input_dir
,
"/173308.all_samples.summary.xlsx"
),
sheet
=
2
,
col_names
=
T
))
#sheet: miRNA_piRNA
df1
<-
df1
[
-
which
(
grepl
(
"piR"
,
df1
[,
1
])),
1
:
7
]
# UMI and piRNA removing
df1
[,
1
]
<-
gsub
(
"/.*"
,
""
,
df1
[,
1
])
# leave only first miRNA for ambiguous entries
df1
[,
-1
]
<-
apply
(
df1
[,
-1
],
2
,
function
(
x
){
x
/
sum
(
x
,
na.rm
=
T
)
*
1000000
})
#UMI normalized by CPM
df.families
<-
as.data.frame
(
read_excel
(
"Analysis CB/miRNA_Family.xlsx"
,
sheet
=
1
,
col_names
=
T
))
#miRNA families from miRBase
df.families
<-
df.families
[
df.families
[,
3
]
==
10090
,
c
(
4
,
1
)]
#select mouse entries
df.families
[,
1
]
<-
sub
(
pattern
=
"p.*"
,
replacement
=
"p"
,
x
=
df.families
[,
1
])
df2
<-
merge
(
df.families
,
df1
,
by
=
1
)
df2
<-
aggregate
(
df2
[,
-
c
(
1
:
2
)],
by
=
df2
[
"miR family"
],
FUN
=
sum
,
na.rm
=
T
)
#sum counts by family
gene_vector
<-
c
(
"miR-150-5p"
,
"miR-25-3p/32-5p/92-3p/363-3p/367-3p"
,
"miR-142-3p.1"
,
"miR-17-5p/20-5p/93-5p/106-5p"
,
"miR-191-5p"
,
"miR-15-5p/16-5p/195-5p/322-5p/497-5p"
,
"miR-26-5p"
,
"miR-138-5p"
,
"miR-223-3p"
,
"miR-342-3p"
,
"miR-22-3p"
,
"miR-192-5p/215-5p"
,
"miR-125-5p/351-5p"
,
"miR-126-3p.1"
,
"miR-199-3p"
)
df3
<-
subset
(
df2
,
df2
$
`miR family`
%in%
gene_vector
)
%>%
arrange
(
factor
(
`miR family`
,
levels
=
gene_vector
))
rownames
(
df3
)
<-
df3
[,
1
]
df3.scaled
<-
as.data.frame
(
t
(
scale
(
t
(
df3
[
-1
]))))
#Zscore normalization. Scaled only works on columns, so need to transform
openxlsx
::
write.xlsx
(
df2.scaled
,
paste0
(
output_dir
,
"/Figure_1D_table.xlsx"
),
rowNames
=
T
)
#the df was exported and used to create a heatmap using Graphpad
CB2025_figure_3_RNAseq.R
View file @
cad0b7de
library
(
readxl
)
library
(
readxl
)
library
(
ggplot2
)
library
(
ggplot2
)
library
(
ggrepel
)
library
(
ggrepel
)
library
(
dplyr
)
library
(
dplyr
)
library
(
fgsea
)
library
(
fgsea
)
library
(
clusterProfiler
)
library
(
clusterProfiler
)
library
(
enrichplot
)
library
(
enrichplot
)
##### Directories #####
##### Directories #####
#us <- "/Users/Squadrito/"
# us <-"C:/Users/bresesti.chiara/"
us
<-
"C:/Users/bresesti.chiara/"
# wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
wdir
<-
paste0
(
us
,
"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr"
)
# wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
wdir_CB
<-
paste0
(
us
,
"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2"
)
# fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")
fdir
<-
paste0
(
us
,
"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures"
)
input_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
#### Figure 3A&B ####
output_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
#Import df
################################################################################
setwd
(
wdir
)
#
miR_ctrl
<-
read_excel
(
"edgeR_results.xlsx"
,
sheet
=
"miR342-control"
)
# R script to generate data and figures for manuscript figures 3A, 3B, 3C, and 3D.
sponge_ctrl
<-
read_excel
(
"edgeR_results.xlsx"
,
sheet
=
"spongeBT-control"
)
# This script performs differential gene expression analysis and gene set enrichment
# analysis to generate volcano plots, empirical cumulative distribution function (ECDF) plots,
#Add DEG color and label for volcano plot
# dot plots, and enrichment maps.
miR_ctrl
$
DEG
<-
"NO"
#
miR_ctrl
$
DEG
[
miR_ctrl
$
logFC
>
1
&
miR_ctrl
$
PValue
<
0.05
]
<-
"UP"
# Figures generated:
miR_ctrl
$
DEG
[
miR_ctrl
$
logFC
<
(
-1
)
&
miR_ctrl
$
PValue
<
0.05
]
<-
"DOWN"
# - Figure 3A & 3B: Volcano plots of differentially expressed genes in miR-342-3p
miR_ctrl
$
DEG_label
<-
NA
# overexpression and sponge experiments, and ECDF plots comparing
miR_ctrl
$
DEG_label
[
miR_ctrl
$
DEG
!=
"NO"
]
<-
miR_ctrl
$
...1
[
miR_ctrl
$
DEG
!=
"NO"
]
# logFC distributions of all genes vs. miR-342-3p target genes.
# - Figure 3C: Dot plot visualizing enriched pathways from gene set enrichment analysis (GSEA).
sponge_ctrl
$
DEG
<-
"NO"
# - Figure 3D: Enrichment map visualizing relationships between enriched pathways.
sponge_ctrl
$
DEG
[
sponge_ctrl
$
logFC
>
1
&
sponge_ctrl
$
PValue
<
0.05
]
<-
"UP"
#
sponge_ctrl
$
DEG
[
sponge_ctrl
$
logFC
<
(
-1
)
&
sponge_ctrl
$
PValue
<
0.05
]
<-
"DOWN"
# Input data files:
sponge_ctrl
$
DEG_label
<-
NA
# - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control")
sponge_ctrl
$
DEG_label
[
sponge_ctrl
$
DEG
!=
"NO"
]
<-
sponge_ctrl
$
...1
[
sponge_ctrl
$
DEG
!=
"NO"
]
# (Output from differential gene expression analysis, likely using edgeR)
# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1)
#Volcano plot (left panel)
# (List of predicted miR-342-3p target genes from TargetScan)
data
<-
miR_ctrl
# - Figure 3C & 3D: "EdgeR_results.xlsx" (Sheet "miR342-control")
ggplot
(
data
=
data
,
aes
(
x
=
logFC
,
y
=-
log10
(
PValue
),
col
=
DEG
,
label
=
DEG_label
))
+
# (Same as input for Figure 3A & 3B, used for GSEA)
geom_point
()
+
# - Figure 3C & 3D: "miDB_sig5.MLS.rds" (RDS file containing gene sets for gene set enrichment analysis)
theme_minimal
()
+
#
geom_text_repel
()
+
################################################################################
scale_color_manual
(
values
=
c
(
"blue"
,
"grey"
,
"red"
))
+
geom_vline
(
xintercept
=
c
(
-1
,
1
),
col
=
"black"
)
+
geom_hline
(
yintercept
=-
log10
(
0.05
),
col
=
"black"
)
+
scale_x_continuous
(
limits
=
c
(
-4.1
,
4.1
),
breaks
=
seq
(
-4
,
4
,
1
))
+
#### Figure 3A&B ####
scale_y_continuous
(
limits
=
c
(
0
,
20
))
+
xlab
(
bquote
(
Log
[
2
](
FC
)))
+
#Import df
ylab
(
bquote
(
-
Log
[
10
](
PVal
)))
miR_ctrl
<-
read_excel
(
paste0
(
input_dir
,
"/edgeR_results.xlsx"
),
sheet
=
"miR342-control"
)
sponge_ctrl
<-
read_excel
(
paste0
(
input_dir
,
"/edgeR_results.xlsx"
),
sheet
=
"spongeBT-control"
)
data
<-
sponge_ctrl
ggplot
(
data
=
data
,
aes
(
x
=
logFC
,
y
=-
log10
(
PValue
),
col
=
DEG
,
label
=
DEG_label
))
+
#Add DEG color and label for volcano plot
geom_point
()
+
miR_ctrl
$
DEG
<-
"NO"
theme_minimal
()
+
miR_ctrl
$
DEG
[
miR_ctrl
$
logFC
>
1
&
miR_ctrl
$
PValue
<
0.05
]
<-
"UP"
geom_text_repel
()
+
miR_ctrl
$
DEG
[
miR_ctrl
$
logFC
<
(
-1
)
&
miR_ctrl
$
PValue
<
0.05
]
<-
"DOWN"
scale_color_manual
(
values
=
c
(
"blue"
,
"grey"
,
"red"
))
+
miR_ctrl
$
DEG_label
<-
NA
geom_vline
(
xintercept
=
c
(
-1
,
1
),
col
=
"black"
)
+
miR_ctrl
$
DEG_label
[
miR_ctrl
$
DEG
!=
"NO"
]
<-
miR_ctrl
$
...1
[
miR_ctrl
$
DEG
!=
"NO"
]
geom_hline
(
yintercept
=-
log10
(
0.05
),
col
=
"black"
)
+
scale_x_continuous
(
limits
=
c
(
-4.1
,
4.1
),
breaks
=
seq
(
-4
,
4
,
1
))
+
sponge_ctrl
$
DEG
<-
"NO"
scale_y_continuous
(
limits
=
c
(
0
,
20
))
+
sponge_ctrl
$
DEG
[
sponge_ctrl
$
logFC
>
1
&
sponge_ctrl
$
PValue
<
0.05
]
<-
"UP"
xlab
(
bquote
(
Log
[
2
](
FC
)))
+
sponge_ctrl
$
DEG
[
sponge_ctrl
$
logFC
<
(
-1
)
&
sponge_ctrl
$
PValue
<
0.05
]
<-
"DOWN"
ylab
(
bquote
(
-
Log
[
10
](
PVal
)))
sponge_ctrl
$
DEG_label
<-
NA
sponge_ctrl
$
DEG_label
[
sponge_ctrl
$
DEG
!=
"NO"
]
<-
sponge_ctrl
$
...1
[
sponge_ctrl
$
DEG
!=
"NO"
]
#Import miR-342-3p target list (from TargetScan)
setwd
(
wdir_CB
)
#Volcano plot (left panel)
miR342_targets
<-
read_excel
(
"TargetScan8.0__miR-342-3p.predicted_targets.xlsx"
)
data
<-
miR_ctrl
miR342_targets
<-
filter
(
miR342_targets
,
miR342_targets
$
`Cumulative weighted context++ score`
<
(
-0.3
))
ggplot
(
data
=
data
,
aes
(
x
=
logFC
,
y
=-
log10
(
PValue
),
col
=
DEG
,
label
=
DEG_label
))
+
geom_point
()
+
#ecdf plot (right panel)
theme_minimal
()
+
data
<-
miR_ctrl
geom_text_repel
()
+
test
<-
filter
(
data
,
data
$
...1
%in%
miR342_targets
$
'Target gene'
)
scale_color_manual
(
values
=
c
(
"blue"
,
"grey"
,
"red"
))
+
plot
(
ecdf
(
data
$
logFC
),
lwd
=
2
,
do.points
=
F
,
verticals
=
T
,
geom_vline
(
xintercept
=
c
(
-1
,
1
),
col
=
"black"
)
+
ylab
=
"Fraction of DEG"
,
geom_hline
(
yintercept
=-
log10
(
0.05
),
col
=
"black"
)
+
xlab
=
bquote
(
Log
[
2
](
FC
)),
scale_x_continuous
(
limits
=
c
(
-4.1
,
4.1
),
breaks
=
seq
(
-4
,
4
,
1
))
+
xlim
=
c
(
-0.5
,
0.5
),
scale_y_continuous
(
limits
=
c
(
0
,
20
))
+
ylim
=
c
(
0
,
1
),
xlab
(
bquote
(
Log
[
2
](
FC
)))
+
main
=
"ECDF of gene expression: miR vs mut"
)
+
ylab
(
bquote
(
-
Log
[
10
](
PVal
)))
lines
(
ecdf
(
test
$
logFC
),
col
=
"blue"
,
do.points
=
F
,
lwd
=
2
,
verticals
=
T
)
+
abline
(
v
=
0
,
col
=
"black"
)
+
data
<-
sponge_ctrl
abline
(
h
=
0.5
,
col
=
"black"
)
+
ggplot
(
data
=
data
,
aes
(
x
=
logFC
,
y
=-
log10
(
PValue
),
col
=
DEG
,
label
=
DEG_label
))
+
legend
(
"bottomright"
,
c
(
"All genes"
,
"miR-342-3p targets"
),
geom_point
()
+
col
=
c
(
"black"
,
"blue"
),
lwd
=
2
,
cex
=
0.7
)
theme_minimal
()
+
ks.test
(
test
$
logFC
,
data
$
logFC
,
alternative
=
"g"
)
#Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
geom_text_repel
()
+
scale_color_manual
(
values
=
c
(
"blue"
,
"grey"
,
"red"
))
+
data
<-
sponge_ctrl
geom_vline
(
xintercept
=
c
(
-1
,
1
),
col
=
"black"
)
+
test
<-
filter
(
data
,
data
$
...1
%in%
miR342_targets
$
'Target gene'
)
geom_hline
(
yintercept
=-
log10
(
0.05
),
col
=
"black"
)
+
plot
(
ecdf
(
data
$
logFC
),
lwd
=
2
,
do.points
=
F
,
verticals
=
T
,
scale_x_continuous
(
limits
=
c
(
-4.1
,
4.1
),
breaks
=
seq
(
-4
,
4
,
1
))
+
ylab
=
"Fraction of DEG"
,
scale_y_continuous
(
limits
=
c
(
0
,
20
))
+
xlab
=
bquote
(
Log
[
2
](
FC
)),
xlab
(
bquote
(
Log
[
2
](
FC
)))
+
xlim
=
c
(
-0.3
,
0.3
),
ylab
(
bquote
(
-
Log
[
10
](
PVal
)))
ylim
=
c
(
0
,
1
),
main
=
"ECDF of gene expression: sponge vs scr"
)
+
# Import miR-342-3p target list (from TargetScan)
lines
(
ecdf
(
test
$
logFC
),
col
=
"red"
,
do.points
=
F
,
lwd
=
2
,
verticals
=
T
)
+
miR342_targets
<-
read_excel
(
paste0
(
input_dir
,
"/TargetScan8.0__miR-342-3p.predicted_targets.xlsx"
))
abline
(
v
=
0
,
col
=
"black"
)
+
miR342_targets
<-
filter
(
miR342_targets
,
miR342_targets
$
`Cumulative weighted context++ score`
<
(
-0.3
))
abline
(
h
=
0.5
,
col
=
"black"
)
+
legend
(
"bottomright"
,
c
(
"All genes"
,
"miR-342-3p targets"
),
#ecdf plot (right panel)
col
=
c
(
"black"
,
"red"
),
lwd
=
2
,
cex
=
0.7
)
data
<-
miR_ctrl
ks.test
(
test
$
logFC
,
data
$
logFC
,
alternative
=
"l"
)
#Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
test
<-
filter
(
data
,
data
$
...1
%in%
miR342_targets
$
'Target gene'
)
plot
(
ecdf
(
data
$
logFC
),
lwd
=
2
,
do.points
=
F
,
verticals
=
T
,
#### Figure 3C&D ####
ylab
=
"Fraction of DEG"
,
xlab
=
bquote
(
Log
[
2
](
FC
)),
#Upload df
xlim
=
c
(
-0.5
,
0.5
),
setwd
(
wdir
)
ylim
=
c
(
0
,
1
),
df1
<-
read_excel
(
"EdgeR_results.xlsx"
,,
sheet
=
"miR342-control"
)
main
=
"ECDF of gene expression: miR vs mut"
)
+
setwd
(
wdir_CB
)
lines
(
ecdf
(
test
$
logFC
),
col
=
"blue"
,
do.points
=
F
,
lwd
=
2
,
verticals
=
T
)
+
miDB_sig5
<-
readRDS
(
"miDB_sig5.MLS.rds"
)
#GO terms db
abline
(
v
=
0
,
col
=
"black"
)
+
abline
(
h
=
0.5
,
col
=
"black"
)
+
#Rename pathways of interest in miDB_sig5
legend
(
"bottomright"
,
c
(
"All genes"
,
"miR-342-3p targets"
),
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_OXIDATIVE_PHOSPHORYLATION"
]
<-
"Oxydative phosphorylation"
col
=
c
(
"black"
,
"blue"
),
lwd
=
2
,
cex
=
0.7
)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"
]
<-
"Regulation of cholesterol metabolic process"
ks.test
(
test
$
logFC
,
data
$
logFC
,
alternative
=
"g"
)
#Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_INTERLEUKIN_12"
]
<-
"Response to IL12"
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"
]
<-
"Regulation of lipid biosynthetic process"
data
<-
sponge_ctrl
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOMF_MHC_CLASS_I_PROTEIN_BINDING"
]
<-
"MHC-I protein binding"
test
<-
filter
(
data
,
data
$
...1
%in%
miR342_targets
$
'Target gene'
)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"
]
<-
"TNFa mediated signalling pathway"
plot
(
ecdf
(
data
$
logFC
),
lwd
=
2
,
do.points
=
F
,
verticals
=
T
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"
]
<-
"Negative regulation of cell cycle progression"
ylab
=
"Fraction of DEG"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"
]
<-
"Fatty acyl-CoA biosynthetic process"
xlab
=
bquote
(
Log
[
2
](
FC
)),
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"
]
<-
"Response to TNFa"
xlim
=
c
(
-0.3
,
0.3
),
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_G2M_CHECKPOINT"
]
<-
"G2M checkpoint"
ylim
=
c
(
0
,
1
),
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_TYPE_I_INTERFERON"
]
<-
"Response to type I interferon"
main
=
"ECDF of gene expression: sponge vs scr"
)
+
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"
]
<-
"IL1 mediated signalling pathway"
lines
(
ecdf
(
test
$
logFC
),
col
=
"red"
,
do.points
=
F
,
lwd
=
2
,
verticals
=
T
)
+
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_INTERLEUKIN_1"
]
<-
"Response to IL1"
abline
(
v
=
0
,
col
=
"black"
)
+
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"LPS_RO"
]
<-
"LPS response genes"
abline
(
h
=
0.5
,
col
=
"black"
)
+
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"
]
<-
"Cytokine mediated signalling pathway"
legend
(
"bottomright"
,
c
(
"All genes"
,
"miR-342-3p targets"
),
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_MYELOID_CELL_DIFFERENTIATION"
]
<-
"Myeloid cell differentiation"
col
=
c
(
"black"
,
"red"
),
lwd
=
2
,
cex
=
0.7
)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_INTERLEUKIN_10_PRODUCTION"
]
<-
"IL10 production"
ks.test
(
test
$
logFC
,
data
$
logFC
,
alternative
=
"l"
)
# Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"
]
<-
"Regulation of endothelial cell proliferation"
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"
]
<-
"TNFa superfamily cytokine production"
#### Figure 3C&D ####
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_ANGIOGENESIS"
]
<-
"Angiogenesis"
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"
]
<-
"Pattern recognition receptor activity"
# Load df
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"PGE2_RO"
]
<-
"PGE2 response genes"
df1
<-
read_excel
(
paste0
(
input_dir
,
"/EdgeR_results.xlsx"
),
sheet
=
"miR342-control"
)
miDB_sig5
<-
readRDS
(
paste0
(
input_dir
,
"/miDB_sig5.MLS.rds"
))
#GO terms db
#Filter low expression, NA and order by FC
df1
<-
filter
(
df1
,
df1
$
logCPM
>
5
&
!
is.na
(
df1
$
...1
))
#Rename pathways of interest in miDB_sig5
df1
<-
df1
[
order
(
df1
$
logFC
),]
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_OXIDATIVE_PHOSPHORYLATION"
]
<-
"Oxydative phosphorylation"
df2
<-
df1
$
logFC
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"
]
<-
"Regulation of cholesterol metabolic process"
names
(
df2
)
<-
df1
$
...1
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_INTERLEUKIN_12"
]
<-
"Response to IL12"
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"
]
<-
"Regulation of lipid biosynthetic process"
#Run GSEA with fgsea and filter by PVal
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOMF_MHC_CLASS_I_PROTEIN_BINDING"
]
<-
"MHC-I protein binding"
sig
<-
miDB_sig5
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"
]
<-
"TNFa mediated signalling pathway"
test1
<-
fgsea
(
sig
,
df2
,
minSize
=
7
,
maxSize
=
500
,
nproc
=
1
)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"
]
<-
"Negative regulation of cell cycle progression"
PvalResult
<-
filter
(
test1
,
test1
$
padj
<=
0.05
)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"
]
<-
"Fatty acyl-CoA biosynthetic process"
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"
]
<-
"Response to TNFa"
#DotPlot pathways of interest (Fig.3C)
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_G2M_CHECKPOINT"
]
<-
"G2M checkpoint"
pathways
<-
c
(
"Oxydative phosphorylation"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_TYPE_I_INTERFERON"
]
<-
"Response to type I interferon"
"Regulation of cholesterol metabolic process"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"
]
<-
"IL1 mediated signalling pathway"
"Response to IL12"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_RESPONSE_TO_INTERLEUKIN_1"
]
<-
"Response to IL1"
"Regulation of lipid biosynthetic process"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"LPS_RO"
]
<-
"LPS response genes"
"MHC-I protein binding"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"
]
<-
"Cytokine mediated signalling pathway"
"TNFa mediated signalling pathway"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_MYELOID_CELL_DIFFERENTIATION"
]
<-
"Myeloid cell differentiation"
"Negative regulation of cell cycle progression"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_INTERLEUKIN_10_PRODUCTION"
]
<-
"IL10 production"
"Fatty acyl-CoA biosynthetic process"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"
]
<-
"Regulation of endothelial cell proliferation"
"Response to TNFa"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"
]
<-
"TNFa superfamily cytokine production"
"G2M checkpoint"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"HALLMARK_ANGIOGENESIS"
]
<-
"Angiogenesis"
"Response to type I interferon"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"
]
<-
"Pattern recognition receptor activity"
"IL1 mediated signalling pathway"
,
names
(
miDB_sig5
)[
names
(
miDB_sig5
)
==
"PGE2_RO"
]
<-
"PGE2 response genes"
"Response to IL1"
,
"LPS response genes"
,
# Filter low expression, NA and order by FC
"Cytokine mediated signalling pathway"
,
df1
<-
filter
(
df1
,
df1
$
logCPM
>
5
&
!
is.na
(
df1
$
...1
))
"Myeloid cell differentiation"
,
df1
<-
df1
[
order
(
df1
$
logFC
),]
"IL10 production"
,
df2
<-
df1
$
logFC
"Regulation of endothelial cell proliferation"
,
names
(
df2
)
<-
df1
$
...1
"TNFa superfamily cytokine production"
,
"Angiogenesis"
,
# Run GSEA with fgsea and filter by PVal
"Pattern recognition receptor activity"
,
sig
<-
miDB_sig5
"PGE2 response genes"
)
test1
<-
fgsea
(
sig
,
df2
,
minSize
=
7
,
maxSize
=
500
,
nproc
=
1
)
PvalResult
<-
filter
(
test1
,
test1
$
padj
<=
0.05
)
genelist
<-
as.data.frame
(
PvalResult
[
PvalResult
$
pathway
%in%
pathways
,])
ggplot
(
genelist
,
aes
(
x
=
NES
,
y
=
reorder
(
pathway
,
NES
),
size
=
size
,
color
=
padj
))
+
# DotPlot pathways of interest (Fig.3C)
geom_point
()
+
pathways
<-
c
(
"Oxydative phosphorylation"
,
scale_size_area
(
limits
=
c
(
10
,
450
),
max_size
=
15
)
+
"Regulation of cholesterol metabolic process"
,
scale_colour_gradient
(
low
=
"red"
,
high
=
"blue"
)
+
"Response to IL12"
,
labs
(
y
=
'Pathway'
,
x
=
'NES'
)
"Regulation of lipid biosynthetic process"
,
"MHC-I protein binding"
,
#Run GSEA with ClusterProfiler
"TNFa mediated signalling pathway"
,
genelist
<-
data.frame
(
term
=
rep
(
names
(
miDB_sig5
),
sapply
(
miDB_sig5
,
length
)),
"Negative regulation of cell cycle progression"
,
gene
=
unlist
(
miDB_sig5
))
"Fatty acyl-CoA biosynthetic process"
,
df2
=
sort
(
df2
,
decreasing
=
TRUE
)
"Response to TNFa"
,
test2
<-
GSEA
(
df2
,
TERM2GENE
=
genelist
)
"G2M checkpoint"
,
"Response to type I interferon"
,
#Enrichment map (Fig.3D)
"IL1 mediated signalling pathway"
,
test3
<-
filter
(
test2
,
ID
%in%
pathways
)
"Response to IL1"
,
test3
<-
pairwise_termsim
(
test3
)
"LPS response genes"
,
emapplot
(
test3
,
min_edge
=
0.01
,
color
=
"NES"
,
layout
=
"fr"
,
repel
=
T
)
+
"Cytokine mediated signalling pathway"
,
scale_fill_gradient2
(
name
=
bquote
(
NES
),
low
=
"blue"
,
high
=
"red"
)
"Myeloid cell differentiation"
,
#N.B. Produces slightly different plot every time,
"IL10 production"
,
#but connection between pathways stays the same
"Regulation of endothelial cell proliferation"
,
\ No newline at end of file
"TNFa superfamily cytokine production"
,
"Angiogenesis"
,
"Pattern recognition receptor activity"
,
"PGE2 response genes"
)
genelist
<-
as.data.frame
(
PvalResult
[
PvalResult
$
pathway
%in%
pathways
,])
ggplot
(
genelist
,
aes
(
x
=
NES
,
y
=
reorder
(
pathway
,
NES
),
size
=
size
,
color
=
padj
))
+
geom_point
()
+
scale_size_area
(
limits
=
c
(
10
,
450
),
max_size
=
15
)
+
scale_colour_gradient
(
low
=
"red"
,
high
=
"blue"
)
+
labs
(
y
=
'Pathway'
,
x
=
'NES'
)
#Run GSEA with ClusterProfiler
genelist
<-
data.frame
(
term
=
rep
(
names
(
miDB_sig5
),
sapply
(
miDB_sig5
,
length
)),
gene
=
unlist
(
miDB_sig5
))
df2
=
sort
(
df2
,
decreasing
=
TRUE
)
test2
<-
GSEA
(
df2
,
TERM2GENE
=
genelist
)
#Enrichment map (Fig.3D)
test3
<-
filter
(
test2
,
ID
%in%
pathways
)
test3
<-
pairwise_termsim
(
test3
)
emapplot
(
test3
,
min_edge
=
0.01
,
color
=
"NES"
,
layout
=
"fr"
,
repel
=
T
)
+
scale_fill_gradient2
(
name
=
bquote
(
NES
),
low
=
"blue"
,
high
=
"red"
)
\ No newline at end of file
CB2025_figure_5_scRNAseq.R
View file @
cad0b7de
...
@@ -29,7 +29,28 @@ dir.create(plot_dir, showWarnings=F, recursive=T)
...
@@ -29,7 +29,28 @@ dir.create(plot_dir, showWarnings=F, recursive=T)
sig
<-
readRDS
(
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds"
)
sig
<-
readRDS
(
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds"
)
###############################################################################
################################################################################
# R script for single-cell RNA-seq data analysis and figure generation.
#
# This script performs cell annotation, visualization, differential gene expression
# analysis, and gene set enrichment analysis (GSEA) on single-cell RNA-seq data.
#
# Figures generated/data produced:
# - Figure 5A: UMAP visualization of cell clusters with final annotation
# - Figure 5B: UMAP plot with overlaid density of mOrange+ cells
# - Figure 5C: Dotplot of Slc7a11 gene expression across cell types and groups
# - Figure 5D: Barplot of GSEA results for selected gene sets
# - Figure 5E: Heatmap of cytokine signature GSEA results
# - Supplementary Figure 5A: Dotplot of marker gene expression across cell types
# - Supplementary Figure 5B: CSV tables for cell distribution per cluster and sample
# - Supplementary Figure 5C: CSV tables for mOrange+ cell distribution per cluster and sample
#
# Input data files:
# - RDS object: "CB1_CB3_CB4_final.rds" (Seurat object containing scRNA-seq data)
# - RDS object: "miDB_sig5.MLS.rds" (GO terms database for GSEA)
#
################################################################################
set.seed
(
42
)
set.seed
(
42
)
...
...
TCGA_analysis.R
View file @
cad0b7de
...
@@ -7,44 +7,73 @@ library(readxl)
...
@@ -7,44 +7,73 @@ library(readxl)
library
(
survminer
)
library
(
survminer
)
library
(
gridExtra
)
library
(
gridExtra
)
library
(
ggplot2
)
library
(
ggplot2
)
##Load dataset
##Load dataset
#username <- "C:/Users/notaro.marco/"
wdir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts"
username
<-
"/Users/bresesti.chiara/"
wdir
<-
paste0
(
username
,
"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al Cell Reports/TCGA_analysis"
)
setwd
(
wdir
)
setwd
(
wdir
)
metaD
<-
data.frame
(
fread
(
"TCGA_phenotype_denseDataOnlyDownload.tsv.gz"
,
sep
=
'\t'
,
colClasses
=
c
(
"character"
),
data.table
=
FALSE
))
Surv01
<-
data.frame
(
fread
(
"Survival_SupplementalTable_S1_20171025_xena_sp"
,
sep
=
'\t'
,
colClasses
=
c
(
"character"
),
data.table
=
FALSE
))
input_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
output_dir
<-
"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"
################################################################################
# R script for TCGA survival analysis of miR-342-3p expression.
#
# This script analyzes TCGA pancancer miRNA expression and survival data to assess
# the prognostic value of hsa-miR-342-3p across different tumor types.
# It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves
# to visualize the association between miR-342-3p expression levels and patient survival.
#
# Figures generated:
# - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and
# significance of miR-342-3p expression on overall survival across various tumor types.
# - Survival_***_342 (Survival curves - multiple tumors): Set of Kaplan-Meier survival plots
# for tumor types showing significant hazard ratios, illustrating survival differences
# between patients with high and low miR-342-3p expression.
# - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot
# specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival
# in this patient subgroup.
#
# Input data files:
# - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata),
# downloaded from TCGA or Xena, contains clinical and demographic information.
# - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status.
# - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz:
# TCGA pancancer miRNA expression data (FPKM values), downloaded from Xena,
# contains miRNA expression levels across different tumor samples.
#
# Note:
# - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves)
# and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script.
################################################################################
metaD
<-
data.frame
(
fread
(
paste0
(
input_dir
,
"/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"
),
sep
=
'\t'
,
colClasses
=
c
(
"character"
),
data.table
=
FALSE
))
Surv01
<-
data.frame
(
fread
(
paste0
(
input_dir
,
"/Survival_SupplementalTable_S1_20171025_xena_sp"
),
sep
=
'\t'
,
colClasses
=
c
(
"character"
),
data.table
=
FALSE
))
# FPKM01<-fread("tcga_RSEM_Hugo_norm_count")
# FPKM01<-fread("tcga_RSEM_Hugo_norm_count")
FPKM01
<-
fread
(
"
pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"
)
FPKM01
<-
fread
(
paste0
(
input_dir
,
"/
pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"
)
)
FPKM01
<-
as.data.frame
(
FPKM01
)
FPKM01
<-
as.data.frame
(
FPKM01
)
# Annot<-data.frame(fread(
"
probeMap_gencode.v23.annotation.gene.probemap", sep='\t',colClasses=c("character"),data.table=FALSE))
# Annot<-data.frame(fread(
paste0(input_dir, "/
probeMap_gencode.v23.annotation.gene.probemap"
)
, sep='\t',colClasses=c("character"),data.table=FALSE))
allGenes
<-
FPKM01
[,
1
]
allGenes
<-
FPKM01
[,
1
]
#######################################################################
#######################################################################
#####PLOT HR Score of signature for multiple tumor types###############
#####PLOT HR Score of signature for multiple tumor types###############
#####################################################################
#####################################################################
humanGenes
<-
list
(
c
(
"hsa-miR-342-3p"
))
humanGenes
<-
list
(
c
(
"hsa-miR-342-3p"
))
####create xlsx file with genes, patients and tumor types
####create xlsx file with genes, patients and tumor types
Pan.data3
<-
metaD
[,]
Pan.data3
<-
metaD
[,]
Pan.data4
<-
merge
(
Surv01
,
Pan.data3
,
by
=
1
)
Pan.data4
<-
merge
(
Surv01
,
Pan.data3
,
by
=
1
)
Genes_selected
<-
FPKM01
[
FPKM01
[,
1
]
%in%
humanGenes
,]
Genes_selected
<-
FPKM01
[
FPKM01
[,
1
]
%in%
humanGenes
,]
rownames
(
Genes_selected
)
<-
Genes_selected
[,
1
]
rownames
(
Genes_selected
)
<-
Genes_selected
[,
1
]
Genes_selected
<-
Genes_selected
[,
-1
]
Genes_selected
<-
Genes_selected
[,
-1
]
Genes_selected
<-
data.frame
(
t
(
Genes_selected
))
Genes_selected
<-
data.frame
(
t
(
Genes_selected
))
Genes_selected
$
sample
<-
rownames
(
Genes_selected
)
Genes_selected
$
sample
<-
rownames
(
Genes_selected
)
Pan.data5
<-
merge
(
Genes_selected
,
Pan.data4
,
by.y
=
"sample"
)
Pan.data5
<-
merge
(
Genes_selected
,
Pan.data4
,
by.y
=
"sample"
)
colnames
(
Pan.data5
)
colnames
(
Pan.data5
)
#setwd(wdir)
# Export the merged expression + survival + phenotype table (Pan.data5).
# The target directory is `output_dir`, the same variable used for every other
# output of this script; the previous `ourdir` was never defined and made this
# call fail with "object 'ourdir' not found".
write.xlsx(
  Pan.data5,
  paste0(output_dir, "/miR342_patients_survival.xlsx")
)
#write.xlsx(Pan.data5, "miR342_patients_survival.xlsx")
#############Regression with defined k2
#############Regression with defined k2
...
@@ -63,7 +92,7 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){
...
@@ -63,7 +92,7 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){
df2
<-
rbind
(
df2
,
df1
)
df2
<-
rbind
(
df2
,
df1
)
df2
<-
df2
[(
order
(
df2
$
p
)),]
df2
<-
df2
[(
order
(
df2
$
p
)),]
}
else
{}}
}
else
{}}
#
write.xlsx(df2,
"
miR342_HR_by_tumortype.xlsx")
write.xlsx
(
df2
,
paste0
(
output_dir
,
"/
miR342_HR_by_tumortype.xlsx"
)
)
###Filters selected tumors
###Filters selected tumors
...
@@ -75,21 +104,13 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label =
...
@@ -75,21 +104,13 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label =
geom_point
(
size
=
4
)
+
geom_point
(
size
=
4
)
+
geom_errorbar
(
aes
(
ymin
=
pmax
(
HR
-
SE
,
0
),
ymax
=
HR
+
SE
),
width
=
0.2
)
+
geom_errorbar
(
aes
(
ymin
=
pmax
(
HR
-
SE
,
0
),
ymax
=
HR
+
SE
),
width
=
0.2
)
+
scale_color_manual
(
name
=
"Color"
,
values
=
c
(
"red"
,
"blue"
))
+
scale_color_manual
(
name
=
"Color"
,
values
=
c
(
"red"
,
"blue"
))
+
labs
(
title
=
"HR by tumor type"
,
labs
(
title
=
"HR by tumor type"
,
x
=
"Tumor Type"
,
y
=
"HR"
)
+
x
=
"Tumor Type"
,
y
=
"HR"
)
+
geom_rect
(
aes
(
xmin
=
-
Inf
,
xmax
=
Inf
,
ymin
=
-
Inf
,
ymax
=
1
),
fill
=
"lightblue"
,
alpha
=
0.01
)
+
geom_rect
(
aes
(
xmin
=
-
Inf
,
xmax
=
Inf
,
ymin
=
-
Inf
,
ymax
=
1
),
fill
=
"lightblue"
,
alpha
=
0.01
)
+
geom_rect
(
aes
(
xmin
=
-
Inf
,
xmax
=
Inf
,
ymin
=
1
,
ymax
=
Inf
),
fill
=
"pink"
,
alpha
=
0.01
)
+
geom_rect
(
aes
(
xmin
=
-
Inf
,
xmax
=
Inf
,
ymin
=
1
,
ymax
=
Inf
),
fill
=
"pink"
,
alpha
=
0.01
)
+
theme_minimal
()
+
theme_minimal
()
+
theme
(
axis.text.x
=
element_text
(
angle
=
45
,
hjust
=
1
,
color
=
"Black"
))
+
theme
(
axis.text.x
=
element_text
(
angle
=
45
,
hjust
=
1
,
color
=
"Black"
))
+
geom_hline
(
yintercept
=
1
,
linetype
=
"dashed"
,
color
=
"gray"
)
geom_hline
(
yintercept
=
1
,
linetype
=
"dashed"
,
color
=
"gray"
)
ggsave
(
filename
=
paste0
(
output_dir
,
"/HR_alltumors_342.pdf"
),
plot
=
a
,
width
=
9
,
height
=
4
)
# pdf("HR_alltumors_342.pdf",width=9,height = 4)
a
# dev.off()
####Plot survival of chosen tumors
####Plot survival of chosen tumors
...
@@ -98,7 +119,7 @@ df3<-df3[order(df3$HR),]
...
@@ -98,7 +119,7 @@ df3<-df3[order(df3$HR),]
df3
<-
df3
[
df3
$
p
<
0.05
,]
df3
<-
df3
[
df3
$
p
<
0.05
,]
#
pdf(
"
Survival_***_342"
,
width=10,height = 6)
#pdf(
paste0(output_dir, "/
Survival_***_342"
),
width=10,height = 6)
par
(
mfrow
=
c
(
2
,
4
))
par
(
mfrow
=
c
(
2
,
4
))
survival
<-
list
()
survival
<-
list
()
for
(
i
in
df3
$
Tumor
){
for
(
i
in
df3
$
Tumor
){
...
@@ -141,7 +162,7 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_val
...
@@ -141,7 +162,7 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_val
par
(
las
=
0
)
par
(
las
=
0
)
pdf
(
"
Survival_metastatic_342.pdf"
,
width
=
5
,
height
=
5
)
pdf
(
paste0
(
output_dir
,
"/
Survival_metastatic_342.pdf"
),
width
=
5
,
height
=
5
)
plot
(
fit.score
,
lty
=
c
(
1
,
1
),
col
=
c
(
"blue"
,
"red"
),
xlab
=
"Time (d)"
,
ylab
=
"Overall Survival"
,
lwd
=
2
,
bty
=
"n"
,
plot
(
fit.score
,
lty
=
c
(
1
,
1
),
col
=
c
(
"blue"
,
"red"
),
xlab
=
"Time (d)"
,
ylab
=
"Overall Survival"
,
lwd
=
2
,
bty
=
"n"
,
main
=
"Metastatic tumors"
)
main
=
"Metastatic tumors"
)
#Add legend
#Add legend
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment