Update files and script

b2adc44d · Nicolò Gualandi · 3b56eecc · b2adc44d · b2adc44d · b2adc44d
Commit b2adc44d authored Nov 24, 2025 by Nicolò Gualandi
--- a/ClonalHema_Jakobsen_et_al.txt
+++ b/ClonalHema_Jakobsen_et_al.txt
+ASXL1
+ASXL2
+ATM
+ATRX
+BCOR
+BCORL1
+BRAF
+BRCC3
+BRINP3
+CALR
+CBL
+CBLB
+CDKN2A
+CEBPA
+CREBBP
+CSF1R
+CSF3R
+CTCF
+CUX1
+DDX3X
+DHX15
+DNMT3A
+EP300
+ETV6
+EZH2
+FBXW7
+FLT3
+GATA1
+GATA2
+GNAS
+GNB1
+HNRNPK
+HRAS
+IDH1
+IDH2
+IKZF1
+JAK2
+JAK3
+KANSL1
+KDM5A
+KDM6A
+KIT
+KMT2A
+KMT2C
+KMT2D
+KMT2E
+KRAS
+LUC7L2
+MAU2
+MED12
+MPL
+MTA2
+MYC
+MYD88
+NF1
+NIPBL
+NOTCH1
+NPM1
+NRAS
+PDGFRA
+PDS5B
+PHF6
+PIGA
+PPM1D
+PRDM1
+PRPF40B
+PRPF8
+PTEN
+PTPN11
+PTPRF
+RAD21
+RB1
+RIT1
+RUNX1
+SETBP1
+SETD2
+SETDB1
+SF1
+SF3A1
+SF3B1
+SH2B3
+SMC1A
+SMC3
+SMG1
+SRSF2
+STAG1
+STAG2
+STAT3
+STAT5B
+SUZ12
+TET2
+TP53
+U2AF1
+U2AF2
+WAPL
+WT1
+ZRSR2
--- a/Metadata.txt
+++ b/Metadata.txt
+ID	Sample	Age	Source	Concentration (ng/μL)	tot volume (uL)	Total DNA (ng)	Group_Age	Source.1
+2	PZ28	1975	Only Human	47.8	NA	1434	Young	General_pop
+3	P23	1929	Only Human	17	NA	510	Old	General_pop
+4	PZ39	1953	Only Human	20.4	NA	612	Old	General_pop
+17	PZ37	1940	Only Human	22	NA	660	Old	General_pop
+19	PZ36	1944	Only Human	22.6	NA	678	Old	General_pop
+26	PZ22	1960	Only Human	95	NA	2850	Young	General_pop
+43	34-OLD2	1951	Only Human	95.4	NA	2862	Old	General_pop
+44	34-OLD3	1948	Only Human	37.9	NA	1137	Old	General_pop
--- a/Mutect2_variants.txt
+++ b/Mutect2_variants.txt
--- a/OncoPrint.R
+++ b/OncoPrint.R
+library(stringr)
+library(RColorBrewer)
+library(ggplot2)
+library(dplyr)
+library(reshape2)
+library(ComplexHeatmap)
+
+#Load metadata
+metadata = read.delim("Metadata.txt")
+
+#Read clonal hematopoiesis associated genes from Jackbsen et al.
+CH.genes = read.table("ClonalHema_Jakobsen_et_al.txt") 
+
+# List of samples to analyze
+samples <- c("PZ22", "P23","PZ28", "PZ36","PZ37","PZ39", "34-OLD2", "34-OLD3")
+
+######################### Part 1 ###############################################
+#import and process vcf files, if vcf files are not available skip directly to part 2 and import already processed txt files
+#Set working directory containing SnpSift annotated vcf files called using Mutect2
+wdir <- "Mutect2"
+
+###################### Mutect2 vcf processing ##################################
+#Create dataframe with variants from vcf files
+full.t <- data.frame()
+for (ss in samples) {
+  print(ss)
+  orig.name = ss
+  t <- read.table(gzfile(paste(wdir, ss, ".mutect2.filtermutectcalls.ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
+  colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
+  if (grepl("_", ss)){
+    ss = unlist(strsplit(ss, "_"))[2]
+  }
+  t$Sample <- ss
+  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
+  t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 7)[,4])
+  t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 7)[,2], ",", 2)[,1])
+  t$VAF <- (t$DP - t$AD) / t$DP
+  t$AD_alt = t$DP - t$AD
+  t$orig.name = orig.name
+  t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x)))
+  full.t <- rbind(full.t, t)
+}
+
+# Filter CHR only
+chr <- c(paste0("chr", seq(1,22)), "chrX")
+full.t <- subset(full.t, CHROM %in% chr)
+full.t$CHROM <- factor(full.t$CHROM, levels = chr)
+
+# Remove Multi
+full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
+
+# Parse snpEff Annotation
+full.t.nomulti$snpEff_tmp <- lapply(full.t.nomulti$ANNOT, function(x){
+  tmp <- strsplit(x, ";")[[1]]
+  ann_pos <- length(tmp)
+  if (!startsWith(tmp[length(tmp)], "ANN")) {
+    for (i in seq(1,length(tmp))) {
+      if (startsWith(tmp[i], "ANN")) {
+        ann_pos <- i
+      }
+    }
+  }
+  tmp <- tmp[ann_pos]
+})
+
+full.t.nomulti$snpEff_GeneName <- sapply(full.t.nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][5]
+}, USE.NAMES = F)
+
+full.t.nomulti.CHGenes = full.t.nomulti[full.t.nomulti$snpEff_Gene %in% CH.genes$V1,]
+full.t.nomulti.CHGenes$Var.ID = paste(full.t.nomulti.CHGenes$CHROM, full.t.nomulti.CHGenes$POS, sep = "_")
+full.t.nomulti.CHGenes = merge(full.t.nomulti.CHGenes, metadata, by = "Sample")
+full.t.nomulti.CHGenes$Sample = factor(full.t.nomulti.CHGenes$Sample)
+
+#Add information from SnpEff annotations
+full.t.nomulti.CHGenes$snpEff_Effect <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][2]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_Impact <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][3]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_Gene <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][4]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_BioType <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][8]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_tmp <- NULL
+
+#Add type of variantion: Indels, Snp, Other
+full.t.nomulti.CHGenes <- mutate(full.t.nomulti.CHGenes,
+                                 TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
+                                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV",
+                                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
+                                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
+                                                  TRUE ~ "Other"))
+
+#Mantain only "deleterious" variantions
+deleterious = names(table(full.t.nomulti.CHGenes$snpEff_Effect))
+deleterious = c("disruptive_inframe_deletion", "frameshift_variant", "frameshift_variant&stop_gained", 
+                "missense_variant", "missense_variant&splice_region_variant", "splice_acceptor_variant&intron_variant", 
+                "splice_acceptor_variant&splice_donor_variant&disruptive_inframe_deletion&splice_region_variant&intron_variant", 
+                "splice_region_variant&intron_variant", "splice_region_variant&synonymous_variant" , 
+                "stop_gained", "stop_gained&splice_region_variant")
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$snpEff_Effect %in% deleterious,]
+
+
+#Apply filters: AD_alt > 5 SNV and AD_alt > 10 INDELS and PASS or "clustered_event"
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$FILTER == "PASS" | full.t.nomulti.CHGenes$FILTER == "clustered_events", ]
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[ifelse( (full.t.nomulti.CHGenes$AD_alt >= 5 & full.t.nomulti.CHGenes$TYPE == "SNV") | 
+                                                          (full.t.nomulti.CHGenes$AD_alt >= 10 & (full.t.nomulti.CHGenes$TYPE == "DEL" | full.t.nomulti.CHGenes$TYPE == "INS") ),
+                                                        TRUE, FALSE), ]
+
+#Remove VAF > 0.4 & < 0.6 or > 0.9
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[!(full.t.nomulti.CHGenes$VAF > 0.4 & full.t.nomulti.CHGenes$VAF < 0.6) & !full.t.nomulti.CHGenes$VAF > 0.9,]
+#Remove Common
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$Common == FALSE,]
+
+#VAF > 0.01
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$VAF > 0.01,]
+
+#Rename filterd dataframe
+full.t.nomulti.CHGenes.Mutect2 = full.t.nomulti.CHGenes
+
+
+###################### VarDict vcf processing ###################################
+wdir <- "VarDict"
+
+full.t <- data.frame()
+for (ss in samples) {
+  print(ss)
+  orig.name = ss
+  empty <- tryCatch({
+    df <- read.table(gzfile(paste(wdir, ss, ".ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
+    nrow(df) == 0  
+  }, error = function(e) {
+    message("Error reading file, skipping: ", e$message)  
+    TRUE  
+  })
+  
+  if (empty == TRUE){
+    print("Skipping")
+    next
+  }
+  t <- read.table(gzfile(paste(wdir, ss, ".ANNOT.dbSNP.vcf.gz", sep = "")), comment.char = "#")
+  
+  colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
+  if (grepl("_", ss)){
+    ss = unlist(strsplit(ss, "_"))[2]
+  }
+  t$Sample <- ss
+  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
+  t$MQ = as.numeric(gsub("MQ=", "", str_split_fixed(t$ANNOT, ";", 20)[,15]))
+  t$SBF = as.numeric(gsub("SBF=", "", str_split_fixed(t$ANNOT, ";", 20)[,13]))
+  t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 7)[,2])
+  t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 7)[,4], ",", 2)[,1])
+  t$VAF <- (t$DP - t$AD) / t$DP
+  t$AD_alt = t$DP - t$AD
+  t$orig.name = orig.name
+  t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x)))
+  full.t <- rbind(full.t, t)
+}
+
+# Filter CHR only
+chr <- c(paste0("chr", seq(1,22)), "chrX")
+full.t <- subset(full.t, CHROM %in% chr)
+full.t$CHROM <- factor(full.t$CHROM, levels = chr)
+
+# Remove Multi
+full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
+
+## Parse snpEff Annotation
+full.t.nomulti$snpEff_tmp <- lapply(full.t.nomulti$ANNOT, function(x){
+  tmp <- strsplit(x, ";")[[1]]
+  ann_pos <- length(tmp)
+  if (!startsWith(tmp[length(tmp)], "ANN")) {
+    for (i in seq(1,length(tmp))) {
+      if (startsWith(tmp[i], "ANN")) {
+        ann_pos <- i
+      }
+    }
+  }
+  tmp <- tmp[ann_pos]
+})
+
+full.t.nomulti$snpEff_GeneName <- sapply(full.t.nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][5]
+}, USE.NAMES = F)
+
+#Selevct only CH genes
+full.t.nomulti.CHGenes = full.t.nomulti[full.t.nomulti$snpEff_Gene %in% CH.genes$V1,]
+full.t.nomulti.CHGenes$Var.ID = paste(full.t.nomulti.CHGenes$CHROM, full.t.nomulti.CHGenes$POS, sep = "_")
+full.t.nomulti.CHGenes = merge(full.t.nomulti.CHGenes, metadata, by = "Sample")
+full.t.nomulti.CHGenes$Sample = factor(full.t.nomulti.CHGenes$Sample)
+
+full.t.nomulti.CHGenes$snpEff_Effect <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][2]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_Impact <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][3]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_Gene <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][4]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_BioType <- sapply(full.t.nomulti.CHGenes$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][8]
+}, USE.NAMES = F)
+full.t.nomulti.CHGenes$snpEff_tmp <- NULL
+
+#Add type
+full.t.nomulti.CHGenes <- mutate(full.t.nomulti.CHGenes,
+                                 TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
+                                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV",
+                                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
+                                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
+                                                  TRUE ~ "Other"))
+
+#Only deleterouis mutations
+deleterious = names(table(full.t.nomulti.CHGenes$snpEff_Effect))
+deleterious = c("disruptive_inframe_deletion", "frameshift_variant", "frameshift_variant&stop_gained", 
+                "missense_variant", "missense_variant&splice_region_variant", "splice_acceptor_variant&intron_variant", 
+                "splice_acceptor_variant&splice_donor_variant&disruptive_inframe_deletion&splice_region_variant&intron_variant", 
+                "splice_region_variant&intron_variant", "splice_region_variant&synonymous_variant" , 
+                "stop_gained", "stop_gained&splice_region_variant")
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$snpEff_Effect %in% deleterious,]
+
+#Apply filters: AD_alt > 5 SNV and AD_alt > 10 INDELS and PASS or "clustered_event"
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$FILTER == "PASS", ]
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[ifelse( (full.t.nomulti.CHGenes$AD_alt >= 5 & full.t.nomulti.CHGenes$TYPE == "SNV") | 
+                                                          (full.t.nomulti.CHGenes$AD_alt >= 10 & (full.t.nomulti.CHGenes$TYPE == "DEL" | full.t.nomulti.CHGenes$TYPE == "INS") ),
+                                                        TRUE, FALSE), ]
+
+#Remove VAF > 0.4 & < 0.6 or > 0.9
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[!(full.t.nomulti.CHGenes$VAF > 0.4 & full.t.nomulti.CHGenes$VAF < 0.6) & !full.t.nomulti.CHGenes$VAF > 0.9,]
+
+#Quality >= 25 and MQ >= 40 and SBF <= 0.0001
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$QUAL >= 30,]
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$MQ >= 40,]
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$SBF >= 0.0001,]
+#Remove Common
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$Common == FALSE,]
+
+#VAF > 0.01
+full.t.nomulti.CHGenes = full.t.nomulti.CHGenes[full.t.nomulti.CHGenes$VAF > 0.01,]
+
+full.t.nomulti.CHGenes.VarDict = full.t.nomulti.CHGenes
+
+
+############################## Part 2  #########################################
+################ Oncoplot for common consensus mutations #######################
+
+full.t.nomulti.CHGenes.Mutect2 = read.delim("Mutect2_variants.txt")
+full.t.nomulti.CHGenes.VarDict = read.delim("VarDict_variants.txt")
+
+#Find common variantions called by both Mutect2 and VarDict
+full.t.nomulti.CHGenes.VarDict$ID_SAMPLE = paste(full.t.nomulti.CHGenes.VarDict$Sample, full.t.nomulti.CHGenes.VarDict$Vars, sep = "_")
+full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE = paste(full.t.nomulti.CHGenes.Mutect2$Sample, full.t.nomulti.CHGenes.Mutect2$Vars, sep = "_")
+common = intersect(full.t.nomulti.CHGenes.VarDict$ID_SAMPLE, full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE)
+full.t.nomulti.CHGenes.common = full.t.nomulti.CHGenes.Mutect2[full.t.nomulti.CHGenes.Mutect2$ID_SAMPLE %in% common & full.t.nomulti.CHGenes.Mutect2$TYPE == "SNV",]
+
+#Select samples
+toinclude = samples
+
+#Create Input matrix for oncoPrint
+mutation_matrix = aggregate(TYPE ~ snpEff_GeneName + Sample, data = full.t.nomulti.CHGenes.common, 
+                            FUN = function(x) paste(x, collapse = ";"))
+mutation_matrix <- dcast(mutation_matrix, snpEff_GeneName ~ Sample, value.var = "TYPE", fill = "")
+rownames(mutation_matrix) = mutation_matrix$snpEff_GeneName
+mutation_matrix$snpEff_GeneName = NULL
+
+#Check that all samples are present in matrix, some sample may have 0 mutations, if not present add them back
+tobeadded = toinclude[!(toinclude %in% colnames(mutation_matrix))]
+if (length(tobeadded) > 0){
+  print("Add missing")
+  tmp.frame = data.frame(matrix(ncol = length(tobeadded)))
+  colnames(tmp.frame) = tobeadded
+  mutation_matrix = cbind(mutation_matrix, tmp.frame)
+}
+
+#Add also gene with 0 mutations 
+tobeadded = unique(CH.genes[!(CH.genes$V1 %in% rownames(mutation_matrix)), "V1"])
+if (length(tobeadded) > 0){
+  print("Add missing genes with 0 mutations")
+  tmp.frame = data.frame(matrix(nrow = length(tobeadded), ncol = ncol(mutation_matrix)))
+  colnames(tmp.frame) = colnames(mutation_matrix)
+  rownames(tmp.frame) = tobeadded
+  mutation_matrix = rbind(mutation_matrix, tmp.frame)
+}
+
+
+#Create annotations dataframe
+cohort_info = as.data.frame(metadata[metadata$Sample %in% toinclude,c ("Sample", "Source.1")])
+rownames(cohort_info) = cohort_info$Sample
+cohort_info = cohort_info[colnames(mutation_matrix),]
+
+ha <- HeatmapAnnotation(
+  Cohort = cohort_info$Source.1,
+  which = "col",
+  col = list(Cohort = c("General_pop" = "lightgreen"))
+)
+
+#Change names of file: from fastq names to general name 
+v <- c(
+  "PZ22:HD MA 2",
+  "P23:HD OLD 6",
+  "PZ28:HD MA 1",
+  "PZ36:HD OLD 8",
+  "PZ37:HD OLD 9",
+  "PZ39:HD MA 3",
+  "34-OLD2:HD OLD 16",
+  "34-OLD3:HD MA 8"
+)
+
+keys   <- sub(":.*$", "", v)
+values <- sub("^[^:]+:", "", v)
+x <- colnames(mutation_matrix)
+for(i in seq_along(keys)){
+  x <- gsub(keys[i], values[i], x, fixed = TRUE)
+}
+colnames(mutation_matrix) = x
+
+
+col = c("SNV" = "#008000")
+
+alter_fun = list(
+  background = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"), 
+              gp = gpar(fill = "#CCCCCC", col = NA))
+  },
+  # big blue
+  INS = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"), 
+              gp = gpar(fill = col["INS"], col = NA))
+  },
+  # big red
+  DEL = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(2, "pt"), h-unit(2, "pt"), 
+              gp = gpar(fill = col["DEL"], col = NA))
+  },
+  # small green
+  SNV = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(2, "pt"), h*0.33, 
+              gp = gpar(fill = col["SNV"], col = NA))
+  }
+)
+
+
+
+# Do Oncoprint
+oncoPrint(mutation_matrix,
+          row_names_gp = gpar(fontsize = 10),pct_gp = gpar(fontsize = 10),
+          remove_empty_rows = T, 
+          alter_fun = alter_fun, col = col, 
+          column_order = colnames(mutation_matrix),
+          show_column_names = TRUE,
+          show_row_names = TRUE,
+          show_pct = F,
+          bottom_annotation = ha)
--- a/README.md
+++ b/README.md
-Code and files for WES analysis
+#Whole-Exome Sequencing (WES) Analysis
+
+This repository contains the files and scripts used for the WES analysis presented in:
+
+Lettera et al., 2025
+Molecular and phenotypic blueprint of the hematopoietic compartment reveals proliferation stress as a driver of age-associated human stem cell dysfunctions
+
+#Study Context
+
+Whole-exome sequencing (WES) was performed to evaluate potential genomic alterations in human hematopoietic stem cells during aging. The analysis focused on variant detection and functional annotation.
+
+#Repository Description
+
+OncoPrint.R: contains the code to process, filter, and create an oncoprint plot from VCF files.
+
+ClonalHema_Jakobsen_et_al.txt: contains the list of clonal hematopoiesis–associated genes from Jakobsen et al.
+
+Metadata.txt: contains information about the samples.
+
+Mutect2_variants.txt: contains the already computed and filtered matrix of variants called using the Mutect2 pipeline.
+
+VarDict_variants.txt: contains the already computed and filtered matrix of variants called using the VarDict pipeline.
+
+#Data Availability
+
+Raw FASTQ files are available in the European Nucleotide Archive (ENA), accession code: PRJEB93902.
+
+#Tools and Notes
+
+Variants are called using both Mutect2 and VarDict to select a set of high-confidence variants.
+
+Variants were then filtered for quality, coverage, presence in a panel of genes related to clonal hematopoiesis, and effect on the harboring gene.
+
+The oncoprint was generated using the ComplexHeatmap R package to explore variant distributions across genes and samples.
+
+#Citation
+
+If you use this repository or data, please cite:
+
+Lettera et al., Molecular and phenotypic blueprint of the hematopoietic compartment reveals proliferation stress as a driver of age-associated human stem cell dysfunctions
+
+Thank you.
--- a/VarDict_variants.txt
+++ b/VarDict_variants.txt