Commit b2adc44d authored by Nicolò Gualandi's avatar Nicolò Gualandi
Browse files

Update files and script

parent 3b56eecc
ASXL1
ASXL2
ATM
ATRX
BCOR
BCORL1
BRAF
BRCC3
BRINP3
CALR
CBL
CBLB
CDKN2A
CEBPA
CREBBP
CSF1R
CSF3R
CTCF
CUX1
DDX3X
DHX15
DNMT3A
EP300
ETV6
EZH2
FBXW7
FLT3
GATA1
GATA2
GNAS
GNB1
HNRNPK
HRAS
IDH1
IDH2
IKZF1
JAK2
JAK3
KANSL1
KDM5A
KDM6A
KIT
KMT2A
KMT2C
KMT2D
KMT2E
KRAS
LUC7L2
MAU2
MED12
MPL
MTA2
MYC
MYD88
NF1
NIPBL
NOTCH1
NPM1
NRAS
PDGFRA
PDS5B
PHF6
PIGA
PPM1D
PRDM1
PRPF40B
PRPF8
PTEN
PTPN11
PTPRF
RAD21
RB1
RIT1
RUNX1
SETBP1
SETD2
SETDB1
SF1
SF3A1
SF3B1
SH2B3
SMC1A
SMC3
SMG1
SRSF2
STAG1
STAG2
STAT3
STAT5B
SUZ12
TET2
TP53
U2AF1
U2AF2
WAPL
WT1
ZRSR2
ID Sample Age Source Concentration (ng/μL) tot volume (uL) Total DNA (ng) Group_Age Source.1
2 PZ28 1975 Only Human 47.8 NA 1434 Young General_pop
3 P23 1929 Only Human 17 NA 510 Old General_pop
4 PZ39 1953 Only Human 20.4 NA 612 Old General_pop
17 PZ37 1940 Only Human 22 NA 660 Old General_pop
19 PZ36 1944 Only Human 22.6 NA 678 Old General_pop
26 PZ22 1960 Only Human 95 NA 2850 Young General_pop
43 34-OLD2 1951 Only Human 95.4 NA 2862 Old General_pop
44 34-OLD3 1948 Only Human 37.9 NA 1137 Old General_pop
This diff is collapsed.
library(stringr)
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(reshape2)
library(ComplexHeatmap)
#Load metadata
metadata = read.delim("Metadata.txt")
#Read clonal hematopoiesis associated genes from Jackbsen et al.
CH.genes = read.table("ClonalHema_Jakobsen_et_al.txt")
# List of samples to analyze
samples <- c("PZ22", "P23","PZ28", "PZ36","PZ37","PZ39", "34-OLD2", "34-OLD3")
######################### Part 1 ###############################################
#import and process vcf files, if vcf files are not available skip directly to part 2 and import already processed txt files
#Set working directory containing SnpSift annotated vcf files called using Mutect2
wdir <- "Mutect2"
###################### Mutect2 vcf processing ##################################
#Create dataframe with variants from vcf files
full.t <- data.frame()
for (ss in samples) {
print(ss)
orig.name = ss
t <- read.table(gzfile(paste(wdir, ss, ".mutect2.filtermutectcalls.ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
if (grepl("_", ss)){
ss = unlist(strsplit(ss, "_"))[2]
}
t$Sample <- ss
t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 7)[,4])
t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 7)[,2], ",", 2)[,1])
t$VAF <- (t$DP - t$AD) / t$DP
t$AD_alt = t$DP - t$AD
t$orig.name = orig.name
t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x)))
full.t <- rbind(full.t, t)
}
# Filter CHR only
chr <- c(paste0("chr", seq(1,22)), "chrX")
full.t <- subset(full.t, CHROM %in% chr)
full.t$CHROM <- factor(full.t$CHROM, levels = chr)
# Remove Multi
full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
# Parse snpEff Annotation
full.t.nomulti$snpEff_tmp <- lapply(full.t.nomulti$ANNOT, function(x){
tmp <- strsplit(x, ";")[[1]]
ann_pos <- length(tmp)
if (!startsWith(tmp[length(tmp)], "ANN")) {
for (i in seq(1,length(tmp))) {
if (startsWith(tmp[i], "ANN")) {
ann_pos <- i
}
}
}
tmp <- tmp[ann_pos]
})
full.t.nomulti$snpEff_GeneName <- sapply(full.t.nomulti$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][5]
}, USE.NAMES = F)
full.t.nomulti.CHGenes = full.t.nomulti[full.t.nomulti$snpEff_Gene %in% CH.genes$V1,]
full.t.nomulti.CHGenes$Var.ID = paste(full.t.nomulti.CHGenes$CHROM, full.t.nomulti.CHGenes$POS, sep = "_")
full.t.nomulti.CHGenes = merge(full.t.nomulti.CHGenes, metadata, by = "Sample")
full.t.nomulti.CHGenes$Sample = factor(full.t.nomulti.CHGenes$Sample)
#Add information from SnpEff annotations
full.t.nomulti.CHGenes$snpEff_Effect <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][2]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_Impact <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][3]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_Gene <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][4]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_BioType <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][8]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_tmp <- NULL
#Add type of variantion: Indels, Snp, Other
full.t.nomulti.CHGenes <- mutate(full.t.nomulti.CHGenes,
TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV",
nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
TRUE ~ "Other"))
#Mantain only "deleterious" variantions
deleterious = names(table(full.t.nomulti.CHGenes$snpEff_Effect))
deleterious = c("disruptive_inframe_deletion", "frameshift_variant", "frameshift_variant&stop_gained",
"missense_variant", "missense_variant&splice_region_variant", "splice_acceptor_variant&intron_variant",
"splice_acceptor_variant&splice_donor_variant&disruptive_inframe_deletion&splice_region_variant&intron_variant",
"splice_region_variant&intron_variant", "splice_region_variant&synonymous_variant" ,
"stop_gained", "stop_gained&splice_region_variant")
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$snpEff_Effect %in% deleterious,]
#Apply filters: AD_alt > 5 SNV and AD_alt > 10 INDELS and PASS or "clustered_event"
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$FILTER == "PASS" | full.t.nomulti.CHGenes$FILTER == "clustered_events", ]
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[ifelse( (full.t.nomulti.CHGenes$AD_alt >= 5 & full.t.nomulti.CHGenes$TYPE == "SNV") |
(full.t.nomulti.CHGenes$AD_alt >= 10 & (full.t.nomulti.CHGenes$TYPE == "DEL" | full.t.nomulti.CHGenes$TYPE == "INS") ),
TRUE, FALSE), ]
#Remove VAF > 0.4 & < 0.6 or > 0.9
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[!(full.t.nomulti.CHGenes$VAF > 0.4 & full.t.nomulti.CHGenes$VAF < 0.6) & !full.t.nomulti.CHGenes$VAF > 0.9,]
#Remove Common
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$Common == FALSE,]
#VAF > 0.01
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$VAF > 0.01,]
#Rename filterd dataframe
full.t.nomulti.CHGenes.Mutect2 = full.t.nomulti.CHGenes
###################### VarDict vcf processing ###################################
wdir <- "VarDict"
full.t <- data.frame()
for (ss in samples) {
print(ss)
orig.name = ss
empty <- tryCatch({
df <- read.table(gzfile(paste(wdir, ss, ".ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
nrow(df) == 0
}, error = function(e) {
message("Error reading file, skipping: ", e$message)
TRUE
})
if (empty == TRUE){
print("Skipping")
next
}
t <- read.table(gzfile(paste(wdir, ss, ".ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
if (grepl("_", ss)){
ss = unlist(strsplit(ss, "_"))[2]
}
t$Sample <- ss
t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
t$MQ = as.numeric(gsub("MQ=", "", str_split_fixed(t$ANNOT, ";", 20)[,15]))
t$SBF = as.numeric(gsub("SBF=", "", str_split_fixed(t$ANNOT, ";", 20)[,13]))
t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 7)[,2])
t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 7)[,4], ",", 2)[,1])
t$VAF <- (t$DP - t$AD) / t$DP
t$AD_alt = t$DP - t$AD
t$orig.name = orig.name
t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x)))
full.t <- rbind(full.t, t)
}
# Filter CHR only
chr <- c(paste0("chr", seq(1,22)), "chrX")
full.t <- subset(full.t, CHROM %in% chr)
full.t$CHROM <- factor(full.t$CHROM, levels = chr)
# Remove Multi
full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
## Parse snpEff Annotation
full.t.nomulti$snpEff_tmp <- lapply(full.t.nomulti$ANNOT, function(x){
tmp <- strsplit(x, ";")[[1]]
ann_pos <- length(tmp)
if (!startsWith(tmp[length(tmp)], "ANN")) {
for (i in seq(1,length(tmp))) {
if (startsWith(tmp[i], "ANN")) {
ann_pos <- i
}
}
}
tmp <- tmp[ann_pos]
})
full.t.nomulti$snpEff_GeneName <- sapply(full.t.nomulti$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][5]
}, USE.NAMES = F)
#Selevct only CH genes
full.t.nomulti.CHGenes = full.t.nomulti[full.t.nomulti$snpEff_Gene %in% CH.genes$V1,]
full.t.nomulti.CHGenes$Var.ID = paste(full.t.nomulti.CHGenes$CHROM, full.t.nomulti.CHGenes$POS, sep = "_")
full.t.nomulti.CHGenes = merge(full.t.nomulti.CHGenes, metadata, by = "Sample")
full.t.nomulti.CHGenes$Sample = factor(full.t.nomulti.CHGenes$Sample)
full.t.nomulti.CHGenes$snpEff_Effect <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][2]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_Impact <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][3]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_Gene <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][4]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_BioType <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
strsplit(tmp, "|", fixed = T)[[1]][8]
}, USE.NAMES = F)
full.t.nomulti.CHGenes$snpEff_tmp <- NULL
#Add type
full.t.nomulti.CHGenes <- mutate(full.t.nomulti.CHGenes,
TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV",
nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
TRUE ~ "Other"))
#Only deleterouis mutations
deleterious = names(table(full.t.nomulti.CHGenes$snpEff_Effect))
deleterious = c("disruptive_inframe_deletion", "frameshift_variant", "frameshift_variant&stop_gained",
"missense_variant", "missense_variant&splice_region_variant", "splice_acceptor_variant&intron_variant",
"splice_acceptor_variant&splice_donor_variant&disruptive_inframe_deletion&splice_region_variant&intron_variant",
"splice_region_variant&intron_variant", "splice_region_variant&synonymous_variant" ,
"stop_gained", "stop_gained&splice_region_variant")
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$snpEff_Effect %in% deleterious,]
#Apply filters: AD_alt > 5 SNV and AD_alt > 10 INDELS and PASS or "clustered_event"
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$FILTER == "PASS", ]
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[ifelse( (full.t.nomulti.CHGenes$AD_alt >= 5 & full.t.nomulti.CHGenes$TYPE == "SNV") |
(full.t.nomulti.CHGenes$AD_alt >= 10 & (full.t.nomulti.CHGenes$TYPE == "DEL" | full.t.nomulti.CHGenes$TYPE == "INS") ),
TRUE, FALSE), ]
#Remove VAF > 0.4 & < 0.6 or > 0.9
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[!(full.t.nomulti.CHGenes$VAF > 0.4 & full.t.nomulti.CHGenes$VAF < 0.6) & !full.t.nomulti.CHGenes$VAF > 0.9,]
#Quality >= 25 and MQ >= 40 and SBF <= 0.0001
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$QUAL >= 30,]
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$MQ >= 40,]
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$SBF >= 0.0001,]
#Remove Common
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$Common == FALSE,]
#VAF > 0.01
full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$VAF > 0.01,]
full.t.nomulti.CHGenes.VarDict = full.t.nomulti.CHGenes
############################## Part 2 #########################################
################ Oncoplot for common consensus mutations #######################
full.t.nomulti.CHGenes.Mutect2 = read.delim("Mutect2_variants.txt")
full.t.nomulti.CHGenes.VarDict = read.delim("VarDict_variants.txt")
#Find common variantions called by both Mutect2 and VarDict
full.t.nomulti.CHGenes.VarDict$ID_SAMPLE = paste(full.t.nomulti.CHGenes.VarDict$Sample, full.t.nomulti.CHGenes.VarDict$Vars, sep = "_")
full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE = paste(full.t.nomulti.CHGenes.Mutect2$Sample, full.t.nomulti.CHGenes.Mutect2$Vars, sep = "_")
common = intersect(full.t.nomulti.CHGenes.VarDict$ID_SAMPLE, full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE)
full.t.nomulti.CHGenes.common = full.t.nomulti.CHGenes.Mutect2[full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE %in% common & full.t.nomulti.CHGenes.Mutect2$TYPE == "SNV",]
#Select samples
toinclude = samples
#Create Input matrix for oncoPrint
mutation_matrix = aggregate(TYPE ~ snpEff_GeneName + Sample, data = full.t.nomulti.CHGenes.common,
FUN = function(x) paste(x, collapse = ";"))
mutation_matrix <- dcast(mutation_matrix, snpEff_GeneName ~ Sample, value.var = "TYPE", fill = "")
rownames(mutation_matrix) = mutation_matrix$snpEff_GeneName
mutation_matrix$snpEff_GeneName = NULL
#Check that all samples are present in matrix, some sample may have 0 mutations, if not present add them back
tobeadded = toinclude[!(toinclude %in% colnames(mutation_matrix))]
if (length(tobeadded) > 0){
print("Add missing")
tmp.frame = data.frame(matrix(ncol = length(tobeadded)))
colnames(tmp.frame) = tobeadded
mutation_matrix = cbind(mutation_matrix, tmp.frame)
}
#Add also gene with 0 mutations
tobeadded = unique(CH.genes[!(CH.genes$V1 %in% rownames(mutation_matrix)), "V1"])
if (length(tobeadded) > 0){
print("Add missing genes with 0 mutations")
tmp.frame = data.frame(matrix(nrow = length(tobeadded), ncol = ncol(mutation_matrix)))
colnames(tmp.frame) = colnames(mutation_matrix)
rownames(tmp.frame) = tobeadded
mutation_matrix = rbind(mutation_matrix, tmp.frame)
}
#Create annotations dataframe
cohort_info = as.data.frame(metadata[metadata$Sample %in% toinclude,c ("Sample", "Source.1")])
rownames(cohort_info) = cohort_info$Sample
cohort_info = cohort_info[colnames(mutation_matrix),]
ha <- HeatmapAnnotation(
Cohort = cohort_info$Source.1,
which = "col",
col = list(Cohort = c("General_pop" = "lightgreen"))
)
#Change names of file: from fastq names to general name
v <- c(
"PZ22:HD MA 2",
"P23:HD OLD 6",
"PZ28:HD MA 1",
"PZ36:HD OLD 8",
"PZ37:HD OLD 9",
"PZ39:HD MA 3",
"34-OLD2:HD OLD 16",
"34-OLD3:HD MA 8"
)
keys <- sub(":.*$", "", v)
values <- sub("^[^:]+:", "", v)
x <- colnames(mutation_matrix)
for(i in seq_along(keys)){
x <- gsub(keys[i], values[i], x, fixed = TRUE)
}
colnames(mutation_matrix) = x
col = c("SNV" = "#008000")
alter_fun = list(
background = function(x, y, w, h) {
grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"),
gp = gpar(fill = "#CCCCCC", col = NA))
},
# big blue
INS = function(x, y, w, h) {
grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"),
gp = gpar(fill = col["INS"], col = NA))
},
# big red
DEL = function(x, y, w, h) {
grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"),
gp = gpar(fill = col["DEL"], col = NA))
},
# small green
SNV = function(x, y, w, h) {
grid.rect(x, y, w-unit(2, "pt"), h*0.33,
gp = gpar(fill = col["SNV"], col = NA))
}
)
# Do Oncoprint
oncoPrint(mutation_matrix,
row_names_gp = gpar(fontsize = 10),pct_gp = gpar(fontsize = 10),
remove_empty_rows = T,
alter_fun = alter_fun, col = col,
column_order = colnames(mutation_matrix),
show_column_names = TRUE,
show_row_names = TRUE,
show_pct = F,
bottom_annotation = ha)
Code and files for WES analysis
#Whole-Exome Sequencing (WES) Analysis
This repository contains the files and scripts used for the WES analysis presented in:
Lettera et al., 2025
Molecular and phenotypic blueprint of the hematopoietic compartment reveals proliferation stress as a driver of age-associated human stem cell dysfunctions
#Study Context
Whole-exome sequencing (WES) was performed to evaluate potential genomic alterations in human hematopoietic stem cells during aging. The analysis focused on variant detection and functional annotation.
#Repository Description
OncoPrint.R: contains the code to process, filter, and create an oncoprint plot from VCF files.
ClonalHema_Jakobsen_et_al.txt: contains the list of clonal hematopoiesis–associated genes from Jakobsen et al.
Metadata.txt: contains information about the samples.
Mutect2_variants.txt: contains the already computed and filtered matrix of variants called using the Mutect2 pipeline.
VarDict_variants.txt: contains the already computed and filtered matrix of variants called using the VarDict pipeline.
#Data Availability
Raw FASTQ files are available in the European Nucleotide Archive (ENA), accession code: PRJEB93902.
#Tools and Notes
Variants are called using both Mutect2 and VarDict to select a set of high-confidence variants.
Variants were then filtered for quality, coverage, presence in a panel of genes related to clonal hematopoiesis, and effect on the harboring gene.
The oncoprint was generated using the ComplexHeatmap R package to explore variant distributions across genes and samples.
#Citation
If you use this repository or data, please cite:
Lettera et al., Molecular and phenotypic blueprint of the hematopoietic compartment reveals proliferation stress as a driver of age-associated human stem cell dysfunctions
Thank you.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment