Commit bbe3ab96 authored by Teresa Tavella's avatar Teresa Tavella
Browse files

plot

parent 9bc178b7
library(tidyverse)
library(VennDiagram)
library(pheatmap)
library(ggplot2)
library(dplyr)
library(readr)
# parse HOMER known results
parse_homer_known <- function(file_path) {
df <- file_path
colnames(df) <- str_trim(colnames(df))
df <- df %>%
mutate(
TF_Name = str_extract(`Motif_Name`, "^[^(]+"),
TF_Family = str_extract(`Motif_Name`, "(?<=\\()[^)]+(?=\\))")
) %>%
replace_na(list(TF_Family = "Unknown"))
df$`Percent_Target` <- as.numeric(gsub('%', ' ', df$`Percent_Target`))
df$`Percent_Background` <- as.numeric(gsub('%', ' ', df$`Percent_Background`))
return(df)
}
wdir <- "/beegfs/scratch/ric.dimicco/ric.dimicco/DiMiccoR_EL_90-1049491329_ATACseq/results/05-filter_qual_bos_qual25/multibamc" #results motif
cnames <- c('Motif_Name', 'Consensus', 'P_value', 'Log_P_value', 'q_value_Benjamini', '# of Target Sequences with Motif(of 1635)', 'Percent_Target', '# of Background Sequences with Motif(of 97946)',
'Percent_Background') #'TF_Name', 'TF_Family')
open <- read.table(paste(wdir, 'Motifs-Open_DARs.sorted.bed', 'knownResults.txt', sep = '/'), row.names = NULL, sep ='\t', skip = 1)
colnames(open) <- cnames
close <- read.table(paste(wdir, 'Motifs-Close_DARs.sorted.bed', 'knownResults.txt', sep = '/'), row.names = NULL, sep ='\t', skip = 1)
colnames(close) <- cnames
## add family to df
df_motif <- list('open' = open, 'close' = close)
## add motif family name
df <- list()
for (n in names(df_motif)) {
df[[n]] <- parse_homer_known(df_motif[[n]])
}
#Extract TF name and family
open <- df$open
close <- df$close
# comparison function for Open vs Close DARs
compare_open_close_dars <- function(open_file, close_file, pval_threshold = 0.05) {
open <- open_file
open$dataset <- 'Open_DARs'
close <- close_file
close$dataset <- 'Close_DARs'
open_sig <- open %>%
dplyr::filter(P_value < pval_threshold)
close_sig <- close %>%
dplyr::filter(P_value < pval_threshold)
# Get motif
open_motifs <- open_sig$Motif_Name
close_motifs <- close_sig$Motif_Name
# overlaps
shared_motifs <- intersect(open_motifs, close_motifs)
open_unique <- setdiff(open_motifs, close_motifs)
close_unique <- setdiff(close_motifs, open_motifs)
summary_stats <- list(
open_significant = length(open_motifs),
close_significant = length(close_motifs),
shared = length(shared_motifs),
open_unique = length(open_unique),
close_unique = length(close_unique)
)
if (length(shared_motifs) > 0) {
shared_comparison <- open_sig %>%
filter(Motif_Name %in% shared_motifs) %>%
dplyr::select(Motif_Name, TF_Name, TF_Family, open_pval = P_value, open_logpval = Log_P_value,
open_percent = Percent_Target) %>%
left_join(
close_sig %>%
filter(Motif_Name %in% shared_motifs) %>%
dplyr::select(Motif_Name, close_pval = P_value, close_logpval = Log_P_value,
close_percent = Percent_Target),
by = "Motif_Name"
) %>%
mutate(
logpval_diff = close_logpval - open_logpval,
percent_diff = close_percent - open_percent,
enrichment_bias = case_when(
abs(close_logpval) > abs(open_logpval) & abs(logpval_diff) > 0.5 ~ "Close-enriched",
abs(open_logpval) > abs(close_logpval) & abs(logpval_diff) > 0.5 ~ "Open-enriched",
TRUE ~ "Similar"
)
)
} else {
shared_comparison <- tibble()
}
all_sig_combined <- bind_rows(open_sig, close_sig)
# results
list(
summary = summary_stats,
open_significant = open_sig,
close_significant = close_sig,
all_significant = all_sig_combined,
shared_detailed = shared_comparison,
shared_motifs = shared_motifs,
open_unique_motifs = open_unique,
close_unique_motifs = close_unique
)
}
rex <- compare_open_close_dars(open, close, pval_threshold = 0.05)
motif_data <- rex$shared_detailed
md <- motif_data %>%
dplyr::select(TF_Name, open_logpval, close_logpval, enrichment_bias, percent_diff) %>%
pivot_longer(cols = c(open_logpval, close_logpval),
names_to = "condition", values_to = "logpval") %>%
mutate(condition = ifelse(condition == "open_logpval", "Open DARs", "Close DARs"),
logpval = abs(logpval))
md <- data.frame(md)
md$enrichment_bias <- factor(md$enrichment_bias, levels =c("Open-enriched", "Similar","Close-enriched" ))
md <- md[order(md$enrichment_bias),]
md$TF_Name <- factor(md$TF_Name, levels= unique(md$TF_Name))
mx <- md %>%
group_by(enrichment_bias) %>%
arrange(enrichment_bias,logpval)
l <- data.frame()
for (n in unique(md$TF_Name)) {
val <- md[md$TF_Name == n,]
val <- val[val$logpval == max(val$logpval), ]
if (nrow(l) == 0) {
l <- val
} else {
l <- rbind(l, val)
}
}
l <- l %>%
group_by(enrichment_bias) %>%
arrange(enrichment_bias,logpval)
o <- subset(l, enrichment_bias == 'Open-enriched')
final <- rbind( o[order(o$logpval, decreasing = TRUE),], subset(l, enrichment_bias != 'Open-enriched'))
md$TF_Name <- factor(md$TF_Name, levels = final$TF_Name )
md$condition <- ifelse(md$condition == 'Open DARs', 'Open in High', 'Open in Low')
md$TF_Name <- reverse(md$TF_Name)
plot1 <- ggplot(md, aes(x =TF_Name, y = logpval, fill = condition)) +
geom_col(position = 'dodge', width = 0.7, color='black') +
scale_fill_manual(values = c("Open in High" = "#0000ffff", "Open in Low" = "#e74c3c")) +
coord_flip() +
geom_hline(yintercept = -log(0.05), linetype="dashed", color='red')+
labs(title = "",#CORE Transcription Factor Binding sites enriched from Motif analysis",
subtitle = "HIGH vs Low Dose HSC Transplant",
x = "",
y = "-Log P Value Enrichment",
fill = "") +
theme_minimal() +
theme(
#
text = element_text(size = 20, family = "Arial"),
plot.title = element_text(size = 20, face = "bold", hjust = 0.5,
margin = margin(b = 20)),
axis.title.y = element_text(size = 20, face = "bold", family = "Arial"),
axis.text.x = element_text(size = 20, color = 'black'),
axis.text.y = element_text(size = 35, color = 'black', family = "Arial", angle= -45, hjust = 1, vjust = 1),
legend.position = 'right',
legend.title = element_text(size = 18, face = "bold"),
legend.text = element_text(size = 16),
legend.key.size = unit(1, "cm"),
legend.key.width = unit(1.5, "cm"),
legend.margin = margin(b = 15),
legend.box.spacing = unit(0.5, "cm"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black", size = 0.5, linetype = "solid"),
axis.ticks = element_line(colour = "black", size = 0.5),
axis.ticks.length = unit(0.25, "cm"),
strip.placement = "outside"
)+
scale_y_continuous(expand = c(0, 0), position = "right")
#ggsave("motif_coreenrichment_region_open_vs_close_sorted_v2.svg", plot = plot1, width = 10, height = 14, units = "in")
#############################
dx <- rex$all_significant
dx_subset <- dx[dx$`Motif_Name` %in% unique(c(rex$open_unique_motifs, rex$close_unique_motifs)),]
dx_subset$TF_Family <- factor(dx_subset$TF_Family, levels =rev(unique(dx_subset$TF_Family) ))
dx_subset$TF_Name <- ifelse(duplicated(dx_subset$TF_Name), paste0(dx_subset$TF_Name, '-',dx_subset$TF_Family) , dx_subset$TF_Name)
top20 <- dx_subset %>%
group_by(dataset) %>%
slice_head(n = 20) %>%
mutate(rank = row_number())
#############################
## load data with annotation
#############################
library(readxl)
top_results_close <- data.frame(read_excel('/beegfs/scratch/ric.dimicco/ric.dimicco/DiMiccoR_EL_90-1049491329_ATACseq/results/05-filter_qual_bos_qual25/multibamc/Top_20_Motif_analysis_annotated.xls', sheet = 'Open_in_Low'))
colnames(top_results_close)[14] <- 'Annotation'
top_results_open <- data.frame(read_excel('results/05-filter_qual_bos_qual25/multibamc/Top_20_Motif_analysis_annotated.xls', sheet = 'Open_in_High'))
colnames(top_results_open)[14] <- 'Annotation'
top20_annotated <- left_join(top20, top_results_close[, c('TF_Name','Annotation')], by = 'TF_Name')
top20_annotated_all <- left_join(top20_annotated, top_results_open[, c('TF_Name','Annotation')], by = 'TF_Name')
top <- data.frame(top20_annotated_all)
top$Annotation <- ifelse(is.na(top$Annotation.x), top$Annotation.y, top$Annotation.x)
my_colors <- c(
"HSC stemness" = "#a6cee3",
"HSC self-renewal" = "#33a02c",
"HSC homeostasis" = "#ffd92f",
"HSC senescence" = "#bdbdbd",
"Oxidative stress response" = "#b2df8a",
"Interferon signaling" = "#e31a1c",
"Myeloid committment" = "#1f78b4",
"Myegakaryo/erythroid committment" = "#ff7f00",
"Myeloid/erythroid committment" = "#6a3d9a",
"Lymphoid committment" = "#fb9a99"
)
top$Dataset <- ifelse(top$dataset == 'Close_DARs', 'Open in Low', 'Open in High')
p <- ggplot(top, aes( x=reorder(TF_Name,Log_P_value), y =-Log_P_value))+
geom_bar(aes(fill = Annotation), stat = "identity", position = "dodge",alpha = 0.7, size = 2)+
facet_wrap(~Dataset , scales = 'free_x',nrow =2)+ #scales = 'free',
theme_minimal()+
scale_fill_manual(values = my_colors) +
theme(
legend.position = 'right',
plot.title = element_text(hjust = 0.5),
text = element_text(size = 14),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_blank(),
plot.margin = margin(t = 20, r = 10, b = 10, l = 10, unit = 'pt'),
axis.line = element_line(colour = "black",
size = 0.3, linetype = "solid"),
axis.text.x = element_text(size=12, color='black', angle = 45, hjust = 1),
axis.text.y = element_text(size=12, color='black'),
strip.placement = "outside",
) + geom_hline(yintercept = -log(0.05), linetype = "dashed", color = "red") +
labs(title = "",
subtitle = "",
y = "-Log P Value Enrichment",
fill = "") +
guides(fill = guide_legend(ncol = 1))
ggsave("motif_enrichment_region_open_vs_close_unique_analysis_annotation_v2.svg", plot = p, width = 9, height = 5, units = "in")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment