scRNAseq realted scripts

1f735baa · Matteo Barcella · 01d2c18d · 1f735baa · 1f735baa · 1f735baa
Commit 1f735baa authored Feb 06, 2023 by Matteo Barcella
--- a/scRNAseq/Figure_5H_5G_data.rds
+++ b/scRNAseq/Figure_5H_5G_data.rds
--- a/scRNAseq/Figure_5H_data.gmt
+++ b/scRNAseq/Figure_5H_data.gmt
--- a/scRNAseq/Figure_5H_term2groups.txt
+++ b/scRNAseq/Figure_5H_term2groups.txt
+TERM,GROUP
+FRIDMAN_SENESCENCE_UP,A
+REACTOME_CELLULAR_SENESCENCE,A
+REACTOME_SENESCENCE_ASSOCIATED_SECRETORY_PHENOTYPE_SASP,A
+CHICAS_RB1_TARGETS_SENESCENT,A
+WP_SENESCENCE_AND_AUTOPHAGY_IN_CANCER,A
+DEMAGALHAES_AGING_UP,A
+DEMAGALHAES_AGING_DN,A
+BIOCARTA_P53_PATHWAY,B
+INGA_TP53_TARGETS,B
+HALLMARK_INFLAMMATORY_RESPONSE,C
+REACTOME_INTERFERON_SIGNALING,C
+INTERFERON_SIGNALING (REACTOME_INTERFERON_SIGNALLING),C
+REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM,C
+WNT_SIGNALING,D
+REACTOME_SIGNALING_BY_WNT,D
+KEGG_WNT_SIGNALING_PATHWAY,D
+BIOCARTA_WNT_PATHWAY,D
+PID_WNT_CANONICAL_PATHWAY,D
+PID_WNT_SIGNALING_PATHWAY,D
+PID WNT SIGNALING PATHWAY,D
+GO_CANONICAL_WNT_SIGNALING,D
+GO_CANONICAL_WNT_SIGNALING_PATHWAY,D
+GSE26351_UNSTIM_VS_WNT_PATHWAY_STIM_HEMATOPOIETIC_PROGENITORS_UP,D
+WNT_CELL_GROWTH_AND_PROLIFERATION,D
+WNT_SIGNALING_PCR_ARRAY,D
+ST_WNT_BETA_CATENIN_PATHWAY,D
+WNT_STANFORD,D
+WNT_UP.V1_DN,D
+HALLMARK_NOTCH_SIGNALING,E
+KEGG_NOTCH_SIGNALING_PATHWAY,E
+GO_NOTCH_SIGNALING_PATHWAY,E
+PID_NOTCH_PATHWAY,E
+GO_REGULATION_OF_NOTCH_SIGNALING_PATHWAY,E
+NOTCH_TARGETS_PCR_ARRAY,E
+VILIMAS_NOTCH1_TARGETS_UP,E
+NOTCH_DN.V1_UP,E
+NOTCH_DN.V1_DN,E
+GO_NEGATIVE_REGULATION_OF_NOTCH_SIGNALING_PATHWAY,E
+NGUYEN_NOTCH1_TARGETS_DN,E
+KEGG_ECM_RECEPTOR_INTERACTION,F
+KEGG_CELL_ADHESION_MOLECULES_CAMS,F
+CELL_ADHESION_MOLECULES,F
+TSAI_RESPONSE_TO_RADIATION_THERAPY,G
+CHIARETTI_T_ALL_REFRACTORY_TO_THERAPY,G
+KAN_RESPONSE_TO_ARSENIC_TRIOXIDE,G
+KIM_RESPONSE_TO_TSA_AND_DECITABINE_UP,G
+CHIBA_RESPONSE_TO_TSA_UP,G
+RHEIN_ALL_GLUCOCORTICOID_THERAPY_UP,G
+BOROVIAK_DIAPAUSE_UP,H
+BOROVIAK_DIAPAUSE_DN,H
+REACTOME_REGULATION_OF_MITOTIC_CELL_CYCLE,H
\ No newline at end of file
--- a/scRNAseq/Figure_5K.R
+++ b/scRNAseq/Figure_5K.R
+# Volcano plot T vs C cluster 1 focus
+
+library(Seurat)
+library(ggplot2)
+library(ggrepel)
+library(RColorBrewer)
+library(dplyr)
+library(scales)
+
+TvsC_markers <- readRDS(file = "Figure_5K_data.rds")
+markers <- TvsC_markers$Cluster_1
+
+markers$p_val_adj[which(markers$p_val_adj == 0)] <- 1e-300
+
+adj.pval.thr=0.05
+logfc.thr = 0.15
+
+# Volcano plot
+data <- data.frame(gene = row.names(markers), pval = -log10(markers$p_val_adj), lfc = markers$avg_logFC)
+data <- na.omit(data)
+data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",
+                                       data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",
+                                       abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
+data <- data[order(data$pval, decreasing = TRUE),]
+data <- data[order(data$lfc, decreasing = TRUE),]
+highl <- head(subset(data, color != "NonSignificant"), 12)
+vol <- ggplot(data, aes(x = lfc, y = pval, colour = color)) +
+  theme_bw(base_size = 12) +
+  theme(legend.position = "right") +
+  ylim(c(0,350)) +
+  xlab("log2 Fold Change") + 
+  ylab(expression(-log[10]("adjusted p-value"))) +
+  geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray")
+
+tn = function() trans_new('cuberoot', transform = function(x) x^(1/3),
+                          inverse = function(x) x^3)
+
+vol <- vol +
+  geom_point(size = 2, alpha = 0.8, na.rm = T) +
+  scale_color_manual(name = "Expression",
+                     values = c(Overexpressed = "#ae0000",
+                                Underexpressed = "#095786",
+                                NonSignificant = "darkgray")) +
+  geom_text_repel(data = highl, mapping = aes(label = gene), size = 5, segment.alpha = 0.4,force = 50,show.legend = FALSE, max.overlaps = 30) + 
+  theme(panel.grid.major = element_blank(), 
+        axis.text = element_text(size = 12),
+        panel.grid.minor = element_blank())
+plot(vol)
+
+
+for (i in 1:20) {
+  png(filename = paste0("Plot_n_",i, ".png"), width = 9, height = 6, units = "in", res = 200)
+  print(plot(vol))
+  dev.off()
+}
+
+png("Figure_5K.png", units = "in", res = 700, height = 4.5, width = 9)
+plot(vol)
+dev.off()
+
--- a/scRNAseq/Figure_5K_data.rds
+++ b/scRNAseq/Figure_5K_data.rds
--- a/scRNAseq/Figure_5M.R
+++ b/scRNAseq/Figure_5M.R
+library(Seurat)
+library(ggplot2)
+library(dittoSeq)
+library(dplyr)
+library(RColorBrewer)
+library(stats)
+library(openxlsx)
+library(psych)
+
+PDX <- readRDS("PDX_object.rds")
+
+# Analysis of PDX LSC subsets in T vs C
+table(PDX$RNA_snn_h.RNA_DonorID_res.1.2, PDX$RNA_Condition)
+
+PDX <- SetIdent(PDX, value = "RNA_snn_h.RNA_DonorID_res.1.2")
+cluster6cells = WhichCells(object = PDX, idents = "6")
+cluster5cells = WhichCells(object = PDX, idents = "5")
+cluster19cells = WhichCells(object = PDX, idents = "19")
+
+PDX <- SetIdent(PDX, value = "RNA_Condition")
+
+T_cells = WhichCells(object = PDX, idents = "Treated")
+C_cells = WhichCells(object = PDX, idents = "Control")
+
+cluster6_T = intersect(cluster6cells, T_cells)
+cluster6_C = intersect(cluster6cells, C_cells)
+cluster5_C = intersect(cluster5cells, C_cells)
+Cluster6_TvsC_markers = FindMarkers(object = PDX, 
+                                    ident.1 = cluster6_T, 
+                                    ident.2 = cluster6_C)
+
+Cluster6C_vs_5C_markers = FindMarkers(object = PDX,
+                                      ident.1 = cluster6_C,
+                                      ident.2 = cluster5_C)
+
+PDX = SetIdent(PDX, value = "RNA_snn_h.RNA_DonorID_res.1.2")
+
+cluster5_vs6and19_markers = FindMarkers(object = PDX,
+                                        ident.1 = cluster5cells,
+                                        ident.2 = c(cluster6cells, cluster19cells))
+cluster6_vs5and19_markers = FindMarkers(object = PDX,
+                                        ident.1 = cluster6cells,
+                                        ident.2 = c(cluster5cells, cluster19cells))
+cluster19_vs5and6_markers = FindMarkers(object = PDX,
+                                        ident.1 = cluster19cells,
+                                        ident.2 = c(cluster5cells, cluster6cells))
+
+# These xlsx are stored in Figure_5M_data.zip
+openxlsx::write.xlsx(x = cluster5_vs6and19_markers, file = "res12_cluster5_vs6and19_markers.xlsx", row.names = T)
+openxlsx::write.xlsx(x = cluster6_vs5and19_markers, file = "res12_cluster6_vs5and19_markers.xlsx", row.names = T)
+openxlsx::write.xlsx(x = cluster19_vs5and6_markers, file = "res12_cluster19_vs5and6_markers.xlsx", row.names = T)
+
+
+top10markers_eachcluster = c(rownames(cluster5_vs6and19_markers %>% slice_max(order_by = avg_logFC, n = 10)),
+                             rownames(cluster6_vs5and19_markers %>% slice_max(order_by = avg_logFC, n = 10)),
+                             rownames(cluster19_vs5and6_markers %>% slice_max(order_by = avg_logFC, n = 10)))
+
+
+DotPlot(PDX, features = top15markers_eachcluster, idents = c("5","6","19"))
+plotinfo = dittoSeq::dittoDotPlot(object = PDX, vars = top15markers_eachcluster, 
+                                  group.by = "RNA_snn_h.RNA_DonorID_res.1.2", 
+                                  cells.use = WhichCells(PDX, idents = c("5","6","19")), 
+                                  ylab = "LSC subclusters", y.reorder = c(2,3,1), 
+                                  main = "Top 15 markers by logFC per cluster", data.out = T) 
+svg(filename = "Figure_5M_left.png", width = 9.9, height = 3)
+ggplot(data = plotinfo$data, aes(x = var, y = grouping)) + 
+  geom_point(aes(color = color, size = size)) +
+  scale_color_gradient2(high = "#b31b2c", mid = "white", low = "#2268ad") +
+  theme_bw() + theme(panel.grid = element_blank(), 
+                     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = rel(1.25)),
+                     axis.text.y = element_text(size = rel(1.5))) +
+  xlab("Top 15 cluster markers by logFC") + ylab("LSC subclusters") + 
+  labs(color = paste("Relative","expression",sep="\n"), size = paste("Percent","expression",sep="\n"))
+dev.off()
+
+plotinfo_reduced = dittoSeq::dittoDotPlot(object = PDX, vars = c("CDK6","INKA1","NECTIN2","PAK4","SIRT1"), 
+                                          group.by = "RNA_snn_h.RNA_DonorID_res.1.2", 
+                                          cells.use = WhichCells(PDX, idents = c("5","6","19")), 
+                                          ylab = "LSC subclusters", y.reorder = c(2,3,1), 
+                                          main = "Focus CDK6 INKA1", data.out = T) 
+
+svg(filename = "Figure_5M_right.svg", width = 3.5, height = 3)
+ggplot(data = plotinfo_reduced$data, aes(x = var, y = grouping)) + 
+  geom_point(aes(color = color, size = size)) +
+  scale_color_gradient2(high = "#b31b2c", mid = "white", low = "#2268ad") +
+  theme_bw() + theme(panel.grid = element_blank(), 
+                     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = rel(1.25)),
+                     axis.text.y = element_text(size = rel(1.5))) +
+  xlab("HSC latency markers") + ylab("LSC subclusters") + 
+  labs(color = paste("Relative","expression",sep="\n"), size = paste("Percent","expression",sep="\n"))
+dev.off()
--- a/scRNAseq/Figure_5M_data.zip
+++ b/scRNAseq/Figure_5M_data.zip
--- a/scRNAseq/Figure_6At_6Bl_6Er_6H.R
+++ b/scRNAseq/Figure_6At_6Bl_6Er_6H.R
+library(Seurat)
+library(ggplot2)
+library(dplyr)
+library(Hmisc)
+
+# Import AML dataset
+AML <- readRDS(file = "AML_object.rds")
+AMLcells <- rownames(AML@meta.data)
+
+# Import the new updated module scores
+MS <- readRDS(file = "Figure_6A_data.rds")
+
+# ##########################
+# FeaturePlots of 126High and 126Low signature:
+# ##########################
+
+AML <- AddMetaData(AML, metadata = MS %>% filter(PatientID %nin% c("PT19","PT20")) %>% select(-PatientID) %>% select(-Timepoint)) 
+
+FPdata <- FetchData(AML, vars = c("X126up","X126down", "UMAP_1", "UMAP_2", "Timepoint_corr", "PatientID"))
+
+png(filename = "Figure_6A_top.png", width = 3, height = 3.3, units = "in", res = 1200)
+ggplot(FPdata %>% filter(Timepoint_corr == "DX") %>% arrange(X126up))+
+  geom_point(aes(x = UMAP_1, y = UMAP_2, color = X126up), size =0.01) +
+  scale_color_viridis_c(option = "inferno", limits = c(-0.1,0.06)) +
+  guides(color = guide_colorbar(reverse = F)) + theme_void() + 
+  labs(title = "126High signature") +
+  theme(legend.position = "top", legend.title = element_blank(), plot.title = element_text(hjust = 0.5),
+        legend.key.width=unit(0.33,"in"), legend.key.height = unit(0.2, "in"))
+dev.off()
+
+# ##########################
+# ViolinPlots at diagnosis:
+# ##########################
+
+AML = SetIdent(AML, value = "Timepoint_corr")
+AML$PatientID = factor(AML$PatientID, levels = c("PT01", "PT02", "PT13", 
+                                                 "PT09","PT10","PT08","PT15",
+                                                 "PT06","PT07","PT12"))
+svg(filename = "Figure_6B_left.svg", 
+    width = 5, height = 3.3)
+VlnPlot(AML, idents = "DX", features = "X126up", 
+        group.by = "PatientID" , pt.size = 0,
+        cols = c("#c00000","#c00000","#c00000",
+                 "#ed7d31","#ed7d31","#ed7d31","#ed7d31",
+                 "#70ad47","#70ad47","#70ad47")) + 
+  geom_hline(yintercept=0, linetype="dashed", color = "black", alpha = 0.5) +
+  theme_bw(base_size = 6) + ggtitle("126High Signature") +
+  theme(legend.text=element_text(size=rel(1.5)), legend.position = "none") + 
+  theme(plot.title = element_text(size = rel(1.5), hjust = 0.5),
+        axis.title.x = element_blank(), axis.text = element_text(size = rel(1.5)), 
+        panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank())
+dev.off()
+
+round(prop.table(table(AML$X126up>0,AML$PatientID,AML$Timepoint_corr)[,,"DX"], margin = 2)*100, digits = 2)
+#       PT01  PT02  PT13  PT09  PT10  PT08  PT15  PT06  PT07  PT12
+# FALSE 56.10 59.67 67.44 78.45 77.86 93.29 94.38 95.84 99.86 99.22
+# TRUE  43.90 40.33 32.56 21.55 22.14  6.71  5.62  4.16  0.14  0.78
+
+# ##########################
+# Vlnplots dx and rel comparison
+# ##########################
+
+# ALL diagnosis vs Relapses single patients
+DiagnosisVLN <- MS %>% filter(Timepoint == "DX")
+DiagnosisVLN$Facet <- "Diagnosis"
+DiagnosisVLNsubset <- DiagnosisVLN %>% group_by(PatientID) %>% sample_n(1400) %>% ungroup()
+
+RelapseVLN <- MS %>% filter(Timepoint == "REL")
+RelapseVLN$Facet <- "Relapse"
+RelapseVLNsubset <- RelapseVLN %>% group_by(PatientID) %>% sample_n(1200) %>% ungroup()
+
+PT08VLN <- MS %>% filter(PatientID == "PT08" & Timepoint %in% c("DX", "REL", "REL_NR"))
+PT08VLN$Facet <- "PT08"
+PT15VLN <- MS %>% filter(PatientID == "PT15")
+PT15VLN$Facet <- "PT15"
+PT19VLN <- MS %>% filter(PatientID == "PT19" & Timepoint == "REL")
+PT19VLN$Facet <- "PT19"
+PT20VLN <- MS %>% filter(PatientID == "PT20" & Timepoint == "REL1")
+PT20VLN$Facet <- "PT20"
+PT20VLN$Timepoint <- "REL"
+
+VLN <- rbind(RelapseVLNsubset, DiagnosisVLNsubset, PT08VLN, PT15VLN, PT19VLN, PT20VLN)
+VLN$Facet <- factor(VLN$Facet, levels = c("Diagnosis", "Relapse", "PT08", "PT15", "PT19", "PT20"))
+
+# Diagnosis WITHOUT REFR AML
+VLNnoREFR <- VLN
+VLNnoREFR <- VLNnoREFR %>% filter(!PatientID %in% c("PT01", "PT02", "PT13"))
+
+VLNnoREFR_noRELpool <- VLNnoREFR
+VLNnoREFR_noRELpool <- VLNnoREFR_noRELpool %>% filter(!Facet == "Relapse")
+VLNnoREFR_noRELpool$Facet <- ifelse(VLNnoREFR_noRELpool$Facet == "Diagnosis", "non Ref. \n AML", paste0(VLNnoREFR_noRELpool$Facet))
+svg(filename = "Figure_6G.svg", height = 3.3, 
+    width = 4.4)
+ggplot(VLNnoREFR_noRELpool) +
+  geom_violin(aes(x = Timepoint, y = `126up`, fill = Timepoint)) +
+  facet_grid(~Facet, scales = "free_x", space = "free_x") + 
+  geom_hline(yintercept=0, linetype="dashed", color = "black", alpha = 0.5) +
+  scale_fill_manual(values = c(CR = "#70ad47", fut_REL = "#ed7d31", REFR = "#c00000", 
+                               DX = "#cfcfcf", REL = "#458fe8", REL_NR = "#19385c")) +
+  theme_bw(base_size = 6) + ggtitle("126High Signature") +
+  theme(legend.text=element_text(size=rel(1.5)), legend.position = "none") + 
+  theme(plot.title = element_text(size = rel(2), hjust = 0.5), axis.title.y = element_blank(),
+        axis.title.x = element_blank(), axis.text = element_text(size = rel(1.5)), strip.text = element_text(size = rel(2)),
+        panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(), strip.background = element_blank())
+dev.off()
+
+
+# ##########################
+# Vlnplot LSC subclusters
+# ##########################
+AML_0_11_14_final <- readRDS("AML_subclustering.rds")
+AML_0_11_14_final_cells <- rownames(AML_0_11_14_final@meta.data)
+MS_cl_0_11_14 <- MS[AML_0_11_14_final_cells,]
+MS_cl_0_11_14 <- MS_cl_0_11_14 %>% select(-Timepoint) %>% select(-PatientID)
+AML_0_11_14_final <- AddMetaData(AML_0_11_14_final, metadata = MS_cl_0_11_14)
+AML_0_11_14_final$Timepoint_corr <- factor(AML_0_11_14_final$Timepoint_corr, levels = c("DX","D14","D30","REL","REL_NR"))
+
+# ##########################
+# DotPlots
+# ##########################
+
+plotinfo_latency = dittoSeq::dittoDotPlot(object = AML_0_11_14_final, vars = c("X126up","CDK6","INKA1"), 
+                                          group.by = "RNA_snn_h.PatientID_res.0.6", 
+                                          ylab = "LSC subclusters", 
+                                          data.out = T) 
+
+
+svg(filename = "Figure_6E_HSC_latency.png", width = 2.5, height = 4)
+ggplot(data = plotinfo_latency$data, aes(x = var, y = grouping)) + 
+  geom_point(aes(color = color, size = size)) +
+  scale_color_gradient2(high = "#b31b2c", mid = "white", low = "#2268ad") +
+  theme_bw() + theme(panel.grid = element_blank(), 
+                     axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = rel(1.25)),
+                     axis.text.y = element_text(size = rel(1.5))) + scale_x_discrete(labels= c("126High","CDK6", "INKA1")) +
+  xlab("HSC latency") + ylab("LSC subclusters") + 
+  labs(color = paste("Relative","expression",sep="\n"), size = paste("Percent","expression",sep="\n"))
+dev.off()
+
+
+# ##########################
+# CellCycle Barplots
+# ##########################
+
+PT20_active <- readRDS("PT20_active.rds")
+PT20cells <- rownames(PT20_active@meta.data)
+PT19_active <- readRDS("PT19_active.rds")
+PT19cells <- rownames(PT19_active@meta.data)
+
+MS_pt19 <- MS[PT19cells,]
+MS_pt19 <- MS_pt19 %>% select(-PatientID) %>% select(-Timepoint)
+MS_pt20 <- MS[PT20cells,]
+MS_pt20 <- MS_pt20 %>% select(-PatientID) %>% select(-Timepoint)
+PT19_active <- AddMetaData(object = PT19_active, metadata = MS_pt19)
+PT20_active <- AddMetaData(object = PT20_active, metadata = MS_pt20)
+
+PT19data <- FetchData(object = PT19_active, vars = c("RNA_PatientID","RNA_Timepoint", "Phase", "X126up"))
+PT19data$AML_up_full_cat <- ifelse(PT19data$X126up > 0, "POS", "NEG")
+
+PT20data <- FetchData(object = PT20_active, vars = c("RNA_PatientID","RNA_Timepoint", "Phase", "X126up"))
+PT20data$AML_up_full_cat <- ifelse(PT20data$X126up > 0, "POS", "NEG")
+PT20data$RNA_Timepoint <- ifelse(PT20data$RNA_Timepoint == "REL1", "REL", paste0(PT20data$RNA_Timepoint))
+
+AMLdata <- FetchData(object = AML, vars = c("PatientID","Timepoint_corr", "Phase", "X126up"))
+AMLdata$AML_up_full_cat <- ifelse(AMLdata$X126up > 0, "POS", "NEG")
+
+colnames(AMLdata) <- colnames(PT19data)
+
+Data <- rbind(AMLdata, PT19data, PT20data)
+Data_rel <- filter(Data, RNA_Timepoint == "REL")
+Data_rel_summ <- Data_rel %>% group_by(RNA_PatientID, AML_up_full_cat, Phase) %>% summarise(n = n())
+
+png(filename = "Figure_6H_bottom_left.png", units = "in", height = 3, width = 3, res = 300)
+ggplot(Data_rel_summ) +
+  geom_col(aes(x= AML_up_full_cat, y = n, fill = Phase), position="fill", alpha = 0.8) +
+  scale_fill_manual(values = c("lightgrey","orange","red")) +
+  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
+  facet_wrap(~RNA_PatientID, nrow = 1) + theme_bw() + 
+  ggtitle("Relapse AML") + xlab("126High Signature") +
+  theme(panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(),
+        strip.background = element_blank(), axis.title.y = element_blank(), 
+        plot.title = element_text(hjust = 0.5), legend.position = "bottom", axis.text.x = element_text())
+dev.off()
+
+
+Data_dx <- filter(Data, RNA_Timepoint == "DX")
+Data_dx_summ <- Data_dx %>% group_by(RNA_PatientID, AML_up_full_cat, Phase) %>% summarise(n = n())
+
+png(filename = "Figure_6H_top.png", units = "in", height = 3, width = 3.3*2.5, res = 300)
+ggplot(Data_dx_summ) +
+  geom_col(aes(x= AML_up_full_cat, y = n, fill = Phase), position="fill") +
+  scale_fill_manual(values = c("lightgrey","orange","red")) +
+  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
+  facet_wrap(~RNA_PatientID, nrow = 1) + theme_bw() + 
+  ggtitle("Diagnosis AML") + xlab("126High Signature") +
+  theme(panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(),
+        strip.background = element_blank(), axis.title.y = element_blank(),
+        plot.title = element_text(hjust = 0.5), legend.position = "bottom")
+dev.off()
+
+Data_relnr <- filter(Data, RNA_Timepoint == "REL_NR")
+Data_relnr_summ <- Data_relnr %>% group_by(RNA_PatientID, AML_up_full_cat, Phase) %>% summarise(n = n())
+
+png(filename = "Figure_6H_bottom_right.png", units = "in", height = 3, width = 2.75, res = 300)
+ggplot(Data_relnr_summ) +
+  geom_col(aes(x= AML_up_full_cat, y = n, fill = Phase), position="fill") +
+  scale_fill_manual(values = c("lightgrey","orange","red")) +
+  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
+  facet_wrap(~RNA_PatientID, nrow = 1) + theme_bw() + 
+  ggtitle("Relapse NR AML") + xlab("126High Signature") +
+  theme(panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(),
+        strip.background = element_blank(), axis.title.y = element_blank(),
+        plot.title = element_text(hjust = 0.5), legend.position = "bottom")
+dev.off()
--- a/scRNAseq/Figure_S1F.R
+++ b/scRNAseq/Figure_S1F.R
+library(Seurat)
+library(ggplot2)
+library(ComplexHeatmap)
+library(dplyr)
+library(dittoSeq)
+
+AML <- readRDS("AML_object.rds")
+markers_res06 <- AML@misc$clustermarkers$res0.6
+AML$RNA_snn_res.0.6 <- factor(AML$RNA_snn_res.0.6, levels = c("0","11","14",
+                                                              "3","1",
+                                                              "9","7","13",
+                                                              "2","10",
+                                                              "5","6","4","8","12"))
+
+markers_res06_up <- list()
+for (x in levels(AML$RNA_snn_res.0.6)) {
+  markers_res06_up[[paste0("cl",x)]] <- markers_res06[[paste0("cl",x)]][["up"]]
+}
+
+fun <- function(x) {
+  x %>% slice_max(order_by = avg_logFC, n=20)
+}
+markers_res06_top20 <- lapply(markers_res06_up, FUN = fun)
+
+genes_to_plot <- c()
+tmp <- c()
+
+for (x in levels(AML$RNA_snn_res.0.6)) {
+  tmp <- markers_res06_top20[[paste0("cl",x)]]$gene
+  genes_to_plot <- c(genes_to_plot, tmp)
+}
+genes_to_plot_unique <- unique(genes_to_plot)
+
+genenames_to_plot <- c("CD52","FAM30A","IGHM","CD99","TNFRSF4","GUCY1A1","EGFL7","ADGRE5","GADD45A","IGFBP2","MDM2",
+                       "BST2","IFIT3","HSPA1A","HSPA1B","RAB31","HLA-A","CD34","LTB","HLA-DRB1","HOPX","IRF1","RPL3",
+                       "MSI2","RPL5","RPS2","RPS18","RPS3","MZB1","RPL10A","SELENOP","RPS5","EIF3E","MPO","ELANE","AZU1",
+                       "CXCL8","SOX4","PRSS57","CAT","AREG","HBD","FCER1A","MS4A3","TPSAB1","GATA2","HBB","HBD","HBA1","HBA2","CA1",
+                       "AHSP","BLVRB","KLF1","PRTN3","IL2RA","IGLL1","MZB1","MPO","PCLAF","TYMS","PCNA","TUBA1B","DUT","TUBB","HMGB2",
+                       "TK1","HELLS","MCM7","MCM4","HMGB1","CDT1","CENPF","TOP2A","ASPM","MKI67","TUBB4B","CTSG","LYZ","CXCR4","HLA-DQA1",
+                       "HLA-DQB1","LGALS2","HLA-DRA","HLA-DPB1","HLA-DPA1","HLA-DRB1","CD74","FCER1A","S100A9","S100A8","LYZ","S100A12",
+                       "CD14","NCF1","GRN","S100A10","FCER1G","S100A6","S100A11","FCGR3A","MS4A7","FCER1G","S100A11","CD48",
+                       "GNLY","NKG7","GZMB","GZMA","CD7")
+genenames_to_plot <- unique(genenames_to_plot)
+labelindexes <- match(genenames_to_plot,genes_to_plot_unique)
+
+AML <- SetIdent(AML, value = "Timepoint_corr")
+cellstoplot<- WhichCells(object = AML,idents = c("DX","D14","D30","REL","REL_NR"))
+
+AML$TimepointID <- AML$Timepoint_corr
+AML$TimepointID <- factor(AML$TimepointID, levels = c("DX","D14","D30","REL","REL_NR"))
+PandY <- PurpleAndYellow(k = 50)
+Set1 <- RColorBrewer::brewer.pal(9, "Set1")
+Set2 <- RColorBrewer::brewer.pal(8, "Set2")
+Set3 <- RColorBrewer::brewer.pal(12, "Set3")
+values = c(Set1, Set2, Set3)[1:length(unique(AML$RNA_snn_res.0.6))]
+valuesdonor = c(RColorBrewer::brewer.pal(12, "Paired"))[1:length(unique(AML$PatientID))]
+valuestimepoint = c("#e5e5e5","#b2b2b2","#7f7f7f","#4c4c4c","#000000")
+
+genenames_to_plot_reduced <- c("CD52","FAM30A","IGHM","CD99","TNFRSF4","GUCY1A1","EGFL7","ADGRE5","IGFBP2","MDM2",
+                               "IFIT3","HSPA1A","HSPA1B","RAB31","HLA-A","CD34","LTB","HLA-DRB1","HOPX","IRF1","RPL3",
+                               "MSI2","RPL5","RPS2","RPS18","MZB1","EIF3E","MPO","ELANE","AZU1",
+                               "CXCL8","SOX4","PRSS57","CAT","AREG","HBD","FCER1A","GATA2","HBB","HBD","CA1",
+                               "AHSP","BLVRB","KLF1","IL2RA","IGLL1","MZB1","MPO","PCNA","TUBA1B","TUBB","HMGB2",
+                               "HELLS","MCM7","MCM4","HMGB1","CENPF","TOP2A","MKI67","TUBB4B","CTSG","LYZ","CXCR4","HLA-DQA1",
+                               "HLA-DQB1","HLA-DRA","HLA-DPB1","CD74","FCER1A","S100A9","S100A8","LYZ",
+                               "CD14","NCF1","GRN","FCER1G","S100A6","FCGR3A","MS4A7","FCER1G","CD48",
+                               "GNLY","NKG7","GZMB","CD7")
+genenames_to_plot_reduced <- unique(genenames_to_plot_reduced)
+labelindexes_reduced <- match(genenames_to_plot_reduced,genes_to_plot_unique)
+
+png(filename = "Figure_S1F.png", width = 16, height = 12, res = 700, units = "in")
+dittoHeatmap(object = AML, genes = genes_to_plot_unique, complex = TRUE,
+             order.by = c("RNA_snn_res.0.6"), annot.by = c("RNA_snn_res.0.6","PatientID","TimepointID"),heatmap.colors = PandY,
+             use_raster = TRUE, cluster_cols = F, cluster_rows = F, scale = "none", slot = "scale.data", fontsize_row = 12, 
+             breaks = seq(-2, 2, length.out = 51), show_rownames = F, cells.use = cellstoplot,
+             annot.colors = c(values, valuesdonor, valuestimepoint)) +
+  rowAnnotation(link = anno_mark(at = labelindexes_reduced, 
+                                 labels = genenames_to_plot_reduced, 
+                                 labels_gp = gpar(fontsize = 8), padding = unit(1, "mm")))
+dev.off()
+
+png(filename = "Figure_1B.png", width = 4, height = 4.4, res = 700, units = "in")
+DimPlot(AML, group.by = "RNA_snn_res.0.6", label = T, 
+        pt.size = 0.1, shuffle = T, cols = values, label.size = 5) + theme_void() +
+  ggtitle("NPM1mut AML blasts") + theme(plot.title = element_text(hjust = 0.5), legend.position = "none") 
+dev.off()
+
--- a/scRNAseq/Figure_S1_G.R
+++ b/scRNAseq/Figure_S1_G.R
+
+library(Seurat)
+library(ggplot2)
+library(ComplexHeatmap)
+library(dplyr)
+library(dittoSeq)
+
+AML <- readRDS("../activeobj/Del7_active.rds")
+
+markers_res06 <- AML@misc$markers_h$RNA_snn_h.orig.ident_res.0.6
+markers_res06 <- subset.data.frame(x = markers_res06, subset = avg_logFC > 0)
+markers_res06_l <- split(markers_res06, f = markers_res06$cluster)
+names(markers_res06_l) <- paste0("cl",names(markers_res06_l))
+
+pdf("Vlnplot_signature_AML_full_ordered.pdf")
+VlnPlot(object = AML, features = "AML_up_full", sort = T, pt.size = 0, group.by = "RNA_snn_h.orig.ident_res.0.6")
+dev.off()
+
+AML$RNA_snn_res.0.6 <- factor(AML$RNA_snn_h.orig.ident_res.0.6, levels = c("10","1","0","5","9","3","4","6","2","8","7" ,"11"))
+
+total_levels <-  c("10","1","0","5","9","3","4","6","2","8","7" ,"11")
+
+markers_res06_up <- list()
+
+for (x in levels(AML$RNA_snn_res.0.6)) {
+  markers_res06_up[[paste0("cl",x)]] <- markers_res06_l[[paste0("cl",x)]]
+}
+
+for(nel in c(15,20,30,40)){
+  fun <- function(x) {
+    x %>% slice_max(order_by = avg_logFC, n = nel)
+  }
+  
+  markers_res06_top <- lapply(markers_res06_up, FUN = fun)
+  genes_to_plot <- c()
+  tmp <- c()
+  
+  for (x in total_levels) {
+    tmp <- markers_res06_top[[paste0("cl",x)]]$gene
+    genes_to_plot <- c(genes_to_plot, tmp)
+  }
+  
+  genes_to_plot_unique <- unique(genes_to_plot)
+  
+  genenames_to_plot <- c("NPDC1","HOPX","CD52","FAM30A","IGHM","PRDX1","CD99","NRIP1","SPINK2",
+                         "SNHG12","HEMGN","CD34","IGHM","CD164","ANGPT1","S100A9","S100A8","LYZ",
+                         "CD14","AREG","NEGR1","AGR2","CXCR4","FCER1A","CXCL8","GATA2","HBD","ID3",
+                         "ID1","HES1","ID2","HBD","TUBA1B","TUBB","MKI67","TOP2A","CENPF","PCNA",
+                         "MT-ND1","MT-ND2","MT-ATP6","MT-CYB","MPO","AZU1","LYZ","HLA-DPB1","HLA-DRB1",
+                         "MS4A6A","HLA-DQA1","HLA-DRA","HLA-DQB1","HLA-DPA1","LTB","SPINK2",
+                         "JUN","GNLY","GZMB","CD7","KLRD1","GZMK")
+
+  genenames_to_plot <- unique(genenames_to_plot)
+  labelindexes <- match(genenames_to_plot,genes_to_plot_unique)
+  
+  AML <- SetIdent(AML, value = "RNA_Timepoint")
+  cellstoplot <- WhichCells(object = AML,idents = c("DX","D14","D30"))
+  
+  AML$TimepointID <- AML$RNA_Timepoint
+  AML$TimepointID <- factor(AML$TimepointID, levels = c("DX","D14","D30"))
+  PandY <- PurpleAndYellow(k = 100)
+  Set1 <- RColorBrewer::brewer.pal(9, "Set1")
+  Set2 <- RColorBrewer::brewer.pal(8, "Set2")
+  Set3 <- RColorBrewer::brewer.pal(12, "Set3")
+  values = c(Set1, Set2, Set3)[1:length(total_levels)]
+  valuesdonor = c(RColorBrewer::brewer.pal(12, "Paired"))[1:length(unique(AML$RNA_PatientID))]
+  valuestimepoint = c("#e5e5e5","#b2b2b2","#7f7f7f")
+  
+  AML$Timepoint <- AML$TimepointID
+  AML$Patient <- AML$RNA_PatientID
+  AML$Cluster <- AML$RNA_snn_res.0.6
+  
+  png(filename = paste0("Top",nel,"markers_cl_res06_limitPandY.png"), width = 16, height = 12, res = 700, units = "in")
+  a <- dittoHeatmap(object = AML, genes = genes_to_plot_unique, complex = TRUE, border_color = NA, name = " ",
+                    order.by = c("Cluster"), annot.by = c("Cluster","Patient","Timepoint"),heatmap.colors = PandY,
+                    use_raster = TRUE, cluster_cols = F, cluster_rows = F, scale = "none", slot = "scale.data", fontsize_row = 12, 
+                    breaks = seq(-2, 2, length.out = 101), show_rownames = F, cells.use = cellstoplot,
+                    annot.colors = c(values, valuesdonor, valuestimepoint)) +
+    rowAnnotation(link = anno_mark(at = labelindexes, 
+                                   labels = genenames_to_plot, 
+                                   labels_gp = gpar(fontsize = 7), padding = unit(1, "mm")))
+  
+  draw(a, padding = unit(c(5, 5, 5, 5), "mm"))
+  dev.off()
+  
+}
+
--- a/scRNAseq/Figure_S4D.R
+++ b/scRNAseq/Figure_S4D.R
+library(DESeq2)
+library(openxlsx)
+library(ggplot2)
+library(reshape2)
+library(clusterProfiler)
+library(pheatmap)
+library(RColorBrewer)
+library(dplyr)
+
+dge_obj_full <- readRDS("20210804_202201_DGE.rds")
+counts_normalized <- DESeq2::counts(object = dge_obj_full$deseq$ds, normalized = T)
+
+# Gene expression heatmap of genes 
+# open rds of clustering - fetch terms and retrieve genes
+
+clustering_data <- readRDS(file = "Figure_S4D_data.rds")
+
+# genes2plot
+
+genes2plot <- c(clustering_data$genelist$`GO:0006911_phagocytosis, engulfment`,
+                clustering_data$genelist$`GO:0099024_plasma membrane invagination`,
+                clustering_data$genelist$`GO:0006958_complement activation, classical pathway`,
+                clustering_data$genelist$`GO:0006910_phagocytosis, recognition`,
+                clustering_data$genelist$`GO:0008037_cell recognition`,
+                clustering_data$genelist$`GO:0006959_humoral immune response`,
+                clustering_data$genelist$`GO:0042742_defense response to bacterium`,
+                clustering_data$genelist$`GO:0050853_B cell receptor signaling pathway`,
+                clustering_data$genelist$`GO:0042113_B cell activation`,
+                clustering_data$genelist$`GO:0002250_adaptive immune response`,
+                clustering_data$genelist$`GO:0002449_lymphocyte mediated immunity`,
+                clustering_data$genelist$`GO:0051249_regulation of lymphocyte activation`,
+                clustering_data$genelist$`GO:0032944_regulation of mononuclear cell proliferation`,
+                clustering_data$genelist$`GO:0070661_leukocyte proliferation`,
+                clustering_data$genelist$`GO:0046651_lymphocyte proliferation`,
+                clustering_data$genelist$`GO:0030098_lymphocyte differentiation`,
+                clustering_data$genelist$`GO:0045580_regulation of T cell differentiation`,
+                clustering_data$genelist$`GO:0050870_positive regulation of T cell activation`,
+                clustering_data$genelist$`GO:0007159_leukocyte cell-cell adhesion`,
+                clustering_data$genelist$`GO:1903039_positive regulation of leukocyte cell-cell adhesion`,
+                clustering_data$genelist$`GO:0002228_natural killer cell mediated immunity`,
+                clustering_data$genelist$`GO:0002715_regulation of natural killer cell mediated immunity`,
+                clustering_data$genelist$`GO:0001910_regulation of leukocyte mediated cytotoxicity`,
+                clustering_data$genelist$`GO:0001909_leukocyte mediated cytotoxicity`,
+                clustering_data$genelist$`GO:0031341_regulation of cell killing`)
+
+genes2plot <- base::unique(genes2plot)
+counts_rlog_corr_genes2plot <- dge_obj_full$deseq$rlog_corr[genes2plot,]
+
+counts_rlog_corr_genes2plot_df <- as.data.frame(t(counts_rlog_corr_genes2plot))
+counts_rlog_corr_genes2plot_df <- merge.data.frame(x =counts_rlog_corr_genes2plot_df, y = dge_obj_full$sample_info)
+
+counts_rlog_corr_genes2plot_df_melt <- melt(data = counts_rlog_corr_genes2plot_df, id.vars = c("patient","condition"))
+
+counts_rlog_corr_genes2plot_df_light <- counts_rlog_corr_genes2plot_df_melt %>%
+  group_by(variable,condition,patient) %>%
+  summarize(AvgExpression_ptcondition = mean(value))
+
+counts_rlog_corr_genes2plot_df_light$combo <- paste(counts_rlog_corr_genes2plot_df_light$patient, counts_rlog_corr_genes2plot_df_light$condition, sep = "_")
+counts_rlog_corr_genes2plot_df_light_casted <- dcast(data = counts_rlog_corr_genes2plot_df_light, formula = combo ~ variable, value.var = "AvgExpression_ptcondition")
+
+Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
+Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
+Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
+SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
+SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
+SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
+
+colorsZ2bis <- c(seq(-4,4,by=0.1))
+my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
+
+png("Figure_S4D.png", height = 12, width = 12, res = 700, units = "in")
+pheatmap(mat = counts_rlog_corr_genes2plot, cellwidth = 8, cellheight = 4,fontsize_col = 5,
+         clustering_distance_cols = "manhattan", annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+         annotation_colors = Annotations_colors_4_gexp_heatmap_nomir,
+         border_color = FALSE, 
+         color = my_palette, breaks = colorsZ2bis, scale = "row", fontsize_row = 4)
+
+dev.off()
+
+
--- a/scRNAseq/Figure_S4D_data.rds
+++ b/scRNAseq/Figure_S4D_data.rds
--- a/scRNAseq/Figure_S4E.R
+++ b/scRNAseq/Figure_S4E.R
+# BulkRNAseq data analysis with PathfindR:
+library(pathfindR)
+library(pathfindR.data)
+library(dplyr)
+library(DESeq2)
+
+# Load expression matrix:
+
+rawdata <- read.table(file = "Full-gene-counts.txt") # GEO data
+rawdata_m <- as.matrix(rawdata)
+samples <- colnames(rawdata)
+
+# Sample info
+GFPhighsamples <- grep(".+H$", samples, value = T)
+GFPlowsamples <- grep(".+L$", samples, value = T)
+Treatedsamples <- c(grep(pattern = ".+TH$", samples, value = T), grep(pattern = ".+TL$", samples, value = T))
+Controlsamples <- c(grep(pattern = ".+CH$", samples, value = T), grep(pattern = ".+CL$", samples, value = T))
+
+HighOnly_TvsC <- read_excel("Figure_S4E_data.xlsx") 
+HighOnly_TvsC$Down_regulated <- NA
+HighOnly_TvsC$Down_regulated <- as.character(HighOnly_TvsC$Down_regulated)
+
+# load DGE analysis
+
+DGE = readRDS("20210804_202201_DGE.rds")
+normcounts = counts(DGE[["deseq"]][["ds"]], normalized=T)
+score_matrix_HighOnly_TvsC <- score_terms(enrichment_table = HighOnly_TvsC,
+                                          exp_mat = normcounts[,GFPhighsamples],
+                                          cases = intersect(GFPhighsamples, Treatedsamples), 
+                                          use_description = TRUE, # default FALSE
+                                          label_samples = FALSE, # default = TRUE
+                                          case_title = "Treated",  # default = "Case"
+                                          control_title = "Control", # default = "Control"
+                                          low = "#f7797d", # default = "green"
+                                          mid = "#fffde4", # default = "black"
+                                          high = "#1f4037") # default = "red"
+
+# annotation vectors/colors
+Annotations_colors_4_gexp_heatmap <- readRDS("Figure_4CDE_S4DE_colors.rds")
+Annotations_colors_4_gexp_heatmap_nomir = Annotations_colors_4_gexp_heatmap
+Annotations_colors_4_gexp_heatmap_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp <- readRDS("Figure_4CDE_S4DE_annots.rds")
+SampleInfo_annotation_heatmap_gexp_nomir = SampleInfo_annotation_heatmap_gexp
+SampleInfo_annotation_heatmap_gexp_nomir$`miR-126 activity` = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$condition = NULL
+SampleInfo_annotation_heatmap_gexp_nomir$Patient = SampleInfo_annotation_heatmap_gexp_nomir$patient
+SampleInfo_annotation_heatmap_gexp_nomir$patient = NULL
+
+colorsZ2bis <- c(seq(-2,2,by=0.1))
+my_palette <- c("#021830",colorRampPalette(rev(brewer.pal(n=11,name="RdBu")))(n = length(colorsZ2bis)-3),"#33000f")
+
+png(filename = "Figure_S4E.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_HighOnly_TvsC, scale = "none", 
+                          border_color = NA, fontsize_row = 8,  
+                          color = my_palette, breaks = colorsZ2bis,
+                          show_rownames = T, annotation_names_col = T, 
+                          annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                          annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                          cellwidth = 9, cellheight = 7,
+                          cluster_rows = T, cutree_cols = 2, cutree_rows = 2)
+dev.off()
+
+callback = function(hc, score_matrix_HighOnly_TvsC){
+  sv = svd(t(score_matrix_HighOnly_TvsC))$v[,1]
+  dend = reorder(as.dendrogram(hc), wts = sv)
+  as.hclust(dend)
+}
+
+png(filename = "Figure_S4E_callback.png", width = 16, height = 9, res = 700, units = "in")
+pheatmap::pheatmap(score_matrix_HighOnly_TvsC, scale = "none", 
+                   border_color = NA, fontsize_row = 8,  
+                   color = my_palette, breaks = colorsZ2bis,
+                   show_rownames = T, annotation_names_col = T, 
+                   annotation_col = SampleInfo_annotation_heatmap_gexp_nomir,
+                   annotation_colors = Annotations_colors_4_gexp_heatmap_nomir, legend = T,
+                   cellwidth = 9, cellheight = 7,
+                   cluster_rows = T, cutree_cols = 1, cutree_rows = 2, clustering_callback = callback)
+dev.off()
+
+
--- a/scRNAseq/Figure_S4E_data.xlsx
+++ b/scRNAseq/Figure_S4E_data.xlsx
--- a/scRNAseq/Figure_S5A_B_C_D.R
+++ b/scRNAseq/Figure_S5A_B_C_D.R
+library(Seurat)
+library(SingleR)
+library(Seurat)
+library(SingleR)
+library(scuttle)
+
+# read all raw matrices and create a merge object
+# on this dataset add metadata and remove all cells that do not have the annotation field
+# Perform log normalization
+# Convert to singlecellexperiment
+# Perform singleR annotation by using custom dataset from VanGalen
+
+# Downaload all raw data from VanGalen (GSE116256)
+
+sampleids <- gsub(x = grep(x = grep(x = unlist(strsplit(x = list.files(path = "GSE116256_RAW/"), split = "_")), 
+                                    pattern = "AML.*dem\\.txt$", value = T, perl = T), 
+                           pattern = "nanopore", value = T, invert = T), pattern = ".dem.txt", replacement = "")
+
+# populated annotation and counts
+
+all.objects <- list()
+
+for (sid in sampleids) {
+  all.objects[[sid]][["annot"]] <- read.delim(file = list.files(path = "GSE116256_RAW/", pattern = paste0(sid,".anno"), full.names = T))
+  all.objects[[sid]][["matrix"]] <- read.delim(file = list.files(path = "GSE116256_RAW/", pattern = paste0(sid,".dem"), full.names = T), header = T, row.names = 1)
+}
+
+dir.create("rds_from_raw/")
+
+for(samples in sampleids){
+  
+  colnames(all.objects[[samples]][["matrix"]]) <- gsub(pattern = "\\.", replacement = "-", x = colnames(all.objects[[samples]][["matrix"]]))
+  all.objects[[samples]][["annot"]]$Cell <- as.character(all.objects[[samples]][["annot"]]$Cell)
+  rownames(all.objects[[samples]][["annot"]]) <- all.objects[[samples]][["annot"]]$Cell
+  
+  all.objects[[samples]][["rds"]] <- CreateSeuratObject(counts = all.objects[[samples]][["matrix"]], project = paste0("VG_",samples), 
+                                                        min.cells = 0, min.features = 0, meta.data = all.objects[[samples]][["annot"]] )
+  all.objects[[samples]][["rds"]][["percent.mt"]] <- PercentageFeatureSet(all.objects[[samples]][["rds"]], pattern = "^MT-")
+  saveRDS(object = all.objects[[samples]][["rds"]], file = paste0("rds_from_raw/", samples,".rds"))
+}
+
+rds_objects <- list()
+for(samples in sampleids[-1]){
+  rds_objects[[samples]] <- all.objects[[samples]][["rds"]]
+}
+
+obj <- merge(x = all.objects[[sampleids[1]]][["rds"]], y = rds_objects, project = "FullVGalen")
+
+############# Creating singleR reference dataset ################
+
+# Convert seurat object to singlecellexperiment
+
+obj.sce <- as.SingleCellExperiment(obj)
+obj.sce <- logNormCounts(obj.sce)
+
+obj <- readRDS(file = "PDX_full.rds") # PDF full seurat object
+
+obj@misc[["VGalen_CellType"]] <- SingleR(test = obj@assays$RNA@data,
+                                            ref = obj.sce,
+                                            labels = obj.sce$CellType, de.method="wilcox")
+
+obj@meta.data[["VGalen_CellType"]] <- obj@misc[["VGalen_CellType"]]$pruned.labels
+
+
+obj@misc[["VGalen_PredictionRefined"]] <- SingleR(test = obj@assays$RNA@data,
+                                            ref = obj.sce,
+                                            labels = obj.sce$PredictionRefined, de.method="wilcox")
+
+obj@meta.data[["VGalen_PredictionRefined"]] <- obj@misc[["VGalen_PredictionRefined"]]$pruned.labels
+
+obj.sce$CellPred_CellType <- paste0(obj.sce$PredictionRefined,"_",obj.sce$CellType)
+obj@misc[["CellPred_CellType"]] <- SingleR(test = obj@assays$RNA@data,
+                                            ref = obj.sce,
+                                            labels = obj.sce$CellPred_CellType, de.method="wilcox")
+
+obj@meta.data[["CellPred_CellType"]] <- obj@misc[["CellPred_CellType"]]$pruned.labels
+
+Figure5A_B_C_D_data <- FetchData(object = obj, vars(c("CellID","RNA_DonorID","UMAP_1","UMAP_2","UMAPh_1","UMAPh_2",
+                                                      "SingleRrefined_BlueprintEncodeData_labels","VGalen_CellType","VGalen_PredictionRefined",
+                                                      "RNA_snn_h.RNA_DonorID_res.1.2")))
+
+Figure5A_B_C_D_data$CellID <- rownames(Figure5A_B_C_D_data)
+colnames(Figure5A_B_C_D_data)[1] <- "Patient"
+colnames(Figure5A_B_C_D_data)[6] <- "BPE_refined"
+colnames(Figure5A_B_C_D_data)[9] <- "Cluster_res_1.2"
+ 
+write.xlsx(x = Figure5A_B_C_D_data, file = "Figure_S5_A_B_C_D_data.xlsx")
--- a/scRNAseq/Figure_S5_A_B_C_D_data.xlsx
+++ b/scRNAseq/Figure_S5_A_B_C_D_data.xlsx
--- a/scRNAseq/Figure_S6E.R
+++ b/scRNAseq/Figure_S6E.R
+library(Seurat)
+library(ggplot2)
+library(ComplexHeatmap)
+library(dplyr)
+library(dittoSeq)
+library(patchwork)
+library(RColorBrewer)
+
+
+# Load LSC subclustering object
+obj <- readRDS("AML_LSC_subclustering.rds")
+
+# Load markers from LSC subclustering object at resolution 0.6
+
+markers <- readRDS("Figure_S6E_data.rds")
+
+markers_split <- list()
+for (x in unique(markers$cluster)) {
+  markers_split[[paste0("Cluster_",x)]] <- markers %>% filter(cluster == x)
+  write.table(markers_split[[paste0("Cluster_",x)]], file = paste0("Cluster_",x,"_markers.txt"), 
+              sep = '\t', quote = FALSE)
+}
+
+fun <- function(x) {
+  x %>% slice_max(order_by = avg_logFC, n=10)
+}
+markers_split_up_top10 <- lapply(markers_split, FUN = fun)
+
+genes_to_plot <- c()
+tmp <- c()
+
+for (x in names(markers_split_up_top10)) {
+  tmp <- markers_split_up_top10[[x]]$gene
+  genes_to_plot <- c(genes_to_plot, tmp)
+}
+genes_to_plot_unique <- unique(genes_to_plot)
+
+PandY <- PurpleAndYellow(k = 50)
+Set1 <- RColorBrewer::brewer.pal(9, "Set1")
+Set2 <- RColorBrewer::brewer.pal(8, "Set2")
+Set3 <- RColorBrewer::brewer.pal(12, "Set3")
+values = c(Set1, Set2, Set3)[1:length(unique(obj$RNA_snn_h.PatientID_res.0.6))]
+valuesdonor = c(RColorBrewer::brewer.pal(12, "Paired"))[1:length(unique(obj$PatientID))]
+valuestimepoint = c("#e5e5e5","#b2b2b2","#7f7f7f","#4c4c4c","#000000")
+obj$TimepointID <- obj$Timepoint_corr
+
+png(filename = "Figure_S6E.png", width = 8, height = 8, res = 700, units = "in")
+dittoHeatmap(object = obj, genes = genes_to_plot_unique, complex = TRUE,
+             order.by = "RNA_snn_h.PatientID_res.0.6", annot.by = c("RNA_snn_h.PatientID_res.0.6","PatientID", "TimepointID"), heatmap.colors = PandY,
+             use_raster = TRUE, cluster_cols = F, cluster_rows = F, scale = "none", slot = "scale.data", fontsize_row = 12, 
+             breaks = seq(-2, 2, length.out = 51), show_rownames = T,
+             annot.colors = c(values, valuesdonor, valuestimepoint))
+dev.off()
+
+
--- a/scRNAseq/Figure_S6E_data.rds
+++ b/scRNAseq/Figure_S6E_data.rds
--- a/scRNAseq/Figure_S7ABC_a.R
+++ b/scRNAseq/Figure_S7ABC_a.R
+
+NMF <- function(inbam, sampleid, statsfile, Chromium_10X_CB_indexes, tags.of.interest = c("CB","UB","GN","RE")){
+  
+  # load libraries
+  require(Rsamtools)
+  require(plyr)
+  require(ggplot2)
+  
+  # define interval for fetching reads
+  
+  which <- IRangesList("5"=IRanges(171410538L, 171410543L))
+  
+  # Extract interesting tags
+  
+  param <- ScanBamParam(which=which,tag=tags.of.interest, what=scanBamWhat())
+  filteredbam <- scanBam(inbam, param=param)
+  
+  # Defining NPM1 mutated patterns
+  
+  NPM1_patterns <- c("^CTCTGTCTGGCAG","^TCTGTCTGGCAG", "^CTGTCTGGCAG", 
+                     "^TGTCTGGCAG", "^GTCTGGCAG", "TCTCTGTCTGGC", 
+                     "GATCTCTGTCTGG$","GATCTCTGTCTG$", "GATCTCTGTCT$", "GATCTCTGTC$")
+  
+  WT_pattern <- "TCTCTGGCAG"
+  
+  # Reads matching NPM1 patterns
+  
+  matches_NPM1_patterns_index <- grep(x = filteredbam$`5:171410538-171410543`$seq, pattern = paste(NPM1_patterns,collapse="|"), perl = T)
+  
+  #cat(paste("Sampleid","Variable", "Value", sep=","), file=paste0(sampleid,"_stats.txt"), append=FALSE, sep = "\n")
+  cat(paste(sampleid,"N_Reads_matching_NPM1_patterns", length(matches_NPM1_patterns_index), sep=","), file=statsfile, append=TRUE, sep = "\n")
+  
+  # Reads not matchingNPM1 pattern
+  
+  matches_NO_NPM1_patterns_index <- grep(x = filteredbam$`5:171410538-171410543`$seq, 
+                                       pattern = paste(NPM1_patterns,collapse="|"), perl = T, invert = T)
+  
+  cat(paste(sampleid, "N_Reads_NOT_matching_NPM1_patterns", length(matches_NO_NPM1_patterns_index), sep=","), file=statsfile, append=TRUE, sep = "\n")
+  
+  # Create temporal data.frame to retrieve WT patterns on original dataset
+  
+  WT_tmp_df <- cbind(as.data.frame(filteredbam$`5:171410538-171410543`$seq[matches_NO_NPM1_patterns_index]), matches_NO_NPM1_patterns_index)
+  base::colnames(WT_tmp_df) <- c("seq","index_no_NPM1_pattern_main_bam")
+  
+  matches_WT_patterns_index <- WT_tmp_df[grep(x = WT_tmp_df$seq, pattern = "TCTCTGGCAG"),]$index_no_NPM1_pattern_main_bam
+  
+  cat(paste(sampleid,"N_Reads_NOT_matching_NPM1_patterns_but_matching_WT_pattern", length(matches_WT_patterns_index), sep=","), file=statsfile, append=TRUE, sep = "\n")
+  
+  # subsetting values according to index from matches
+  
+  mydata <- list()
+  for(i in names(filteredbam$`5:171410538-171410543`$tag)){
+    if(length(matches_NPM1_patterns_index) > 0){
+      mydata[["MUT"]][[i]] <- filteredbam$`5:171410538-171410543`$tag[[i]][matches_NPM1_patterns_index]
+    }
+    if(length(matches_WT_patterns_index) > 0){
+      mydata[["WT"]][[i]] <- filteredbam$`5:171410538-171410543`$tag[[i]][matches_WT_patterns_index]
+    }
+  }
+
+  # for both mut and wt:
+  
+  # 1. convert list of tags to data.frame
+  # 2. add sampled id
+  # 3. filtering out reads that have NULL CB
+  # 4. filtering out reads with BC not in the sample 4-indexes seed (non possibile con certi bams)
+  # 5. Discard UMI duplicates
+  # 6. Counting N° UMI (UB) belonging to each barcode
+  
+  df <- list()
+  counting <- list()
+  flag <- NULL
+  
+  for(k in c("MUT", "WT")){
+    
+    if(k == "MUT" & length(matches_NPM1_patterns_index) == 0){
+      flag <- "MUT"
+      cat(paste0(sampleid,",","N_Reads_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_Reads_supporting_", k, "_CB_BC_filt", ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_UMI_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_CB_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      
+      next} 
+    if(k == "WT" & length(matches_WT_patterns_index) == 0){
+      flag <- "WT"
+      cat(paste0(sampleid,",","N_Reads_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_Reads_supporting_", k, "_CB_BC_filt", ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_UMI_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      cat(paste0(sampleid,",","N_CB_supporting_", k, ",", 0), 
+          file=statsfile, append=TRUE, sep = "\n")
+      next
+    }
+    if(length(matches_NPM1_patterns_index) > 0 & length(matches_WT_patterns_index) > 0){
+      flag <- "WT_MUT"
+    }
+    # 1. convert list of tags to data.frame
+    
+    df[[k]] <- do.call(cbind, lapply(mydata[[k]], as.data.frame))
+    base::colnames(df[[k]]) <- tags.of.interest
+    
+    # 2. add sampled id
+    df[[k]]$SM <- sampleid
+    
+    cat(paste0(sampleid,",","N_Reads_supporting_", k, ",", base::nrow(df[[k]])), 
+        file=statsfile, append=TRUE, sep = "\n")
+    
+    # 3. filtering out reads that have NULL CB
+    # 4. filtering out reads with BC not in the sample 4-indexes seed
+    
+#    df[[k]] <- base::subset.data.frame(x = df[[k]], subset = !is.na(CB) & BC %in% Chromium_10X_CB_indexes)
+    df[[k]] <- base::subset.data.frame(x = df[[k]], subset = !is.na(CB))
+
+    cat(paste0(sampleid,",","N_Reads_supporting_", k, "_CB_BC_filt", ",", base::nrow(df[[k]])), 
+        file=statsfile, append=TRUE, sep = "\n")
+    
+    # 5. Discard UMI duplicates
+    
+    df[[k]] <- base::unique(df[[k]][,c("CB","UB")])
+    
+    cat(paste0(sampleid,",","N_UMI_supporting_", k, ",", base::nrow(df[[k]])), 
+        file=statsfile, append=TRUE, sep = "\n")
+    
+    # 6. Counting N° UMI (UB) belonging to each barcode
+    
+    counting[[k]] <- plyr::count(df = df[[k]], "CB")
+    
+    cat(paste0(sampleid,",","N_CB_supporting_", k, ",", base::nrow(counting[[k]])), 
+        file=statsfile, append=TRUE, sep = "\n")
+    
+    if(k == "MUT"){
+      base::colnames(counting[[k]]) <- c("CB","MUT")
+    }
+    if(k == "WT"){
+      base::colnames(counting[[k]]) <- c("CB","WT")
+    }
+  }
+
+  print(paste0("Sample: ",sampleid ,"My flag: ", flag))
+  
+  if(flag == "MUT"){
+    total <- counting[["WT"]]
+    total$MUT <- 0
+  }
+  if(flag == "WT"){
+    total <- counting[["MUT"]]
+    total$WT <- 0
+  }
+  if(!flag %in% c("WT","MUT")){
+    total <- merge.data.frame(x = counting[["MUT"]], y = counting[["WT"]], by = 'CB', all = T)
+  }
+
+  cat(paste0(sampleid,",","N_CB_supporting_both_MUT_and_WT", ",", base::nrow(total)), 
+      file=statsfile, append=TRUE, sep = "\n")
+  
+  # converting NA to 0
+  total[is.na(total)] <- 0
+  
+  # recoding Cellbarcodes in order to fit with Seurat metadata
+  
+  total$CB <- gsub(x = gsub(x = total$CB, pattern = "^", replacement = paste0(sampleid,"_"), perl = T), pattern = "-1$", replacement = "", perl = T)
+  
+  return(total)
+  
+}
--- a/scRNAseq/Figure_S7ABC_b.R
+++ b/scRNAseq/Figure_S7ABC_b.R
+# Performing analysis on all samples
+
+# loading function:
+
+source(file = "NMF.R")
+
+# write header stats.txt file
+cat(paste("Sampleid","Variable", "Value", sep=","), file="NMF_full_stats.txt", append=FALSE, sep = "\n")
+
+# read samplesheet file to match sampleid ---> 10X indexes
+
+samplesheet <- read.csv(file = "SampleSheet_PT19.csv", stringsAsFactors = F)
+
+# read 10x indexes
+
+indexes_10X <- read.csv(file = "Chromium_10X_indexes.txt", header = F, stringsAsFactors = F)
+
+# initialize list of results
+
+npm1_metadata <- list()
+
+for(sid in samplesheet$Sample){
+  
+  current_SI <- subset.data.frame(x = samplesheet, subset = Sample == sid, select = 'Index')$Index
+  current_cb_seed <- as.character(subset.data.frame(x = indexes_10X, subset = V1 == current_SI, select = c("V2","V3","V4","V5"))) 
+  
+  npm1_metadata[[sid]] <- NMF(inbam = paste0("/beegfs/scratch/ric.gentner/ric.gentner/scRNA_AML/M84toM97/results/PT19/00-NPM1_classification/input/",sid,"_possorted_genome_bam.bam"), sampleid = sid, Chromium_10X_CB_indexes = current_cb_seed, statsfile = "NMF_full_stats.txt")
+    
+}
+
+
+# Merging all data frames in a single one:
+
+NPM1_complete_data <- do.call(rbind, lapply(npm1_metadata, as.data.frame))
+base::rownames(NPM1_complete_data) <- NPM1_complete_data$CB
+NPM1_complete_data$Classification <- ifelse(NPM1_complete_data$MUT > 0, "MUT", ifelse(NPM1_complete_data$WT > 5, "WT","ND"))
+write.table(x = NPM1_complete_data, file = "NPM1_PT19_data.txt", quote = F, row.names = F)
+