library(tidyverse) library(VennDiagram) library(pheatmap) library(ggplot2) library(dplyr) library(readr) # parse HOMER known results parse_homer_known <- function(file_path) { df <- file_path colnames(df) <- str_trim(colnames(df)) df <- df %>% mutate( TF_Name = str_extract(`Motif_Name`, "^[^(]+"), TF_Family = str_extract(`Motif_Name`, "(?<=\\()[^)]+(?=\\))") ) %>% replace_na(list(TF_Family = "Unknown")) df$`Percent_Target` <- as.numeric(gsub('%', ' ', df$`Percent_Target`)) df$`Percent_Background` <- as.numeric(gsub('%', ' ', df$`Percent_Background`)) return(df) } wdir <- "/beegfs/scratch/ric.dimicco/ric.dimicco/DiMiccoR_EL_90-1049491329_ATACseq/results/05-filter_qual_bos_qual25/multibamc" #results motif cnames <- c('Motif_Name', 'Consensus', 'P_value', 'Log_P_value', 'q_value_Benjamini', '# of Target Sequences with Motif(of 1635)', 'Percent_Target', '# of Background Sequences with Motif(of 97946)', 'Percent_Background') #'TF_Name', 'TF_Family') open <- read.table(paste(wdir, 'Motifs-Open_DARs.sorted.bed', 'knownResults.txt', sep = '/'), row.names = NULL, sep ='\t', skip = 1) colnames(open) <- cnames close <- read.table(paste(wdir, 'Motifs-Close_DARs.sorted.bed', 'knownResults.txt', sep = '/'), row.names = NULL, sep ='\t', skip = 1) colnames(close) <- cnames ## add family to df df_motif <- list('open' = open, 'close' = close) ## add motif family name df <- list() for (n in names(df_motif)) { df[[n]] <- parse_homer_known(df_motif[[n]]) } #Extract TF name and family open <- df$open close <- df$close # comparison function for Open vs Close DARs compare_open_close_dars <- function(open_file, close_file, pval_threshold = 0.05) { open <- open_file open$dataset <- 'Open_DARs' close <- close_file close$dataset <- 'Close_DARs' open_sig <- open %>% dplyr::filter(P_value < pval_threshold) close_sig <- close %>% dplyr::filter(P_value < pval_threshold) # Get motif open_motifs <- open_sig$Motif_Name close_motifs <- close_sig$Motif_Name # overlaps shared_motifs <- intersect(open_motifs, close_motifs) open_unique <- setdiff(open_motifs, close_motifs) close_unique <- setdiff(close_motifs, open_motifs) summary_stats <- list( open_significant = length(open_motifs), close_significant = length(close_motifs), shared = length(shared_motifs), open_unique = length(open_unique), close_unique = length(close_unique) ) if (length(shared_motifs) > 0) { shared_comparison <- open_sig %>% filter(Motif_Name %in% shared_motifs) %>% dplyr::select(Motif_Name, TF_Name, TF_Family, open_pval = P_value, open_logpval = Log_P_value, open_percent = Percent_Target) %>% left_join( close_sig %>% filter(Motif_Name %in% shared_motifs) %>% dplyr::select(Motif_Name, close_pval = P_value, close_logpval = Log_P_value, close_percent = Percent_Target), by = "Motif_Name" ) %>% mutate( logpval_diff = close_logpval - open_logpval, percent_diff = close_percent - open_percent, enrichment_bias = case_when( abs(close_logpval) > abs(open_logpval) & abs(logpval_diff) > 0.5 ~ "Close-enriched", abs(open_logpval) > abs(close_logpval) & abs(logpval_diff) > 0.5 ~ "Open-enriched", TRUE ~ "Similar" ) ) } else { shared_comparison <- tibble() } all_sig_combined <- bind_rows(open_sig, close_sig) # results list( summary = summary_stats, open_significant = open_sig, close_significant = close_sig, all_significant = all_sig_combined, shared_detailed = shared_comparison, shared_motifs = shared_motifs, open_unique_motifs = open_unique, close_unique_motifs = close_unique ) } rex <- compare_open_close_dars(open, close, pval_threshold = 0.05) motif_data <- rex$shared_detailed md <- motif_data %>% dplyr::select(TF_Name, open_logpval, close_logpval, enrichment_bias, percent_diff) %>% pivot_longer(cols = c(open_logpval, close_logpval), names_to = "condition", values_to = "logpval") %>% mutate(condition = ifelse(condition == "open_logpval", "Open DARs", "Close DARs"), logpval = abs(logpval)) md <- data.frame(md) md$enrichment_bias <- factor(md$enrichment_bias, levels =c("Open-enriched", "Similar","Close-enriched" )) md <- md[order(md$enrichment_bias),] md$TF_Name <- factor(md$TF_Name, levels= unique(md$TF_Name)) mx <- md %>% group_by(enrichment_bias) %>% arrange(enrichment_bias,logpval) l <- data.frame() for (n in unique(md$TF_Name)) { val <- md[md$TF_Name == n,] val <- val[val$logpval == max(val$logpval), ] if (nrow(l) == 0) { l <- val } else { l <- rbind(l, val) } } l <- l %>% group_by(enrichment_bias) %>% arrange(enrichment_bias,logpval) o <- subset(l, enrichment_bias == 'Open-enriched') final <- rbind( o[order(o$logpval, decreasing = TRUE),], subset(l, enrichment_bias != 'Open-enriched')) md$TF_Name <- factor(md$TF_Name, levels = final$TF_Name ) md$condition <- ifelse(md$condition == 'Open DARs', 'Open in High', 'Open in Low') md$TF_Name <- reverse(md$TF_Name) plot1 <- ggplot(md, aes(x =TF_Name, y = logpval, fill = condition)) + geom_col(position = 'dodge', width = 0.7, color='black') + scale_fill_manual(values = c("Open in High" = "#0000ffff", "Open in Low" = "#e74c3c")) + coord_flip() + geom_hline(yintercept = -log(0.05), linetype="dashed", color='red')+ labs(title = "",#CORE Transcription Factor Binding sites enriched from Motif analysis", subtitle = "HIGH vs Low Dose HSC Transplant", x = "", y = "-Log P Value Enrichment", fill = "") + theme_minimal() + theme( # text = element_text(size = 20, family = "Arial"), plot.title = element_text(size = 20, face = "bold", hjust = 0.5, margin = margin(b = 20)), axis.title.y = element_text(size = 20, face = "bold", family = "Arial"), axis.text.x = element_text(size = 20, color = 'black'), axis.text.y = element_text(size = 35, color = 'black', family = "Arial", angle= -45, hjust = 1, vjust = 1), legend.position = 'right', legend.title = element_text(size = 18, face = "bold"), legend.text = element_text(size = 16), legend.key.size = unit(1, "cm"), legend.key.width = unit(1.5, "cm"), legend.margin = margin(b = 15), legend.box.spacing = unit(0.5, "cm"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black", size = 0.5, linetype = "solid"), axis.ticks = element_line(colour = "black", size = 0.5), axis.ticks.length = unit(0.25, "cm"), strip.placement = "outside" )+ scale_y_continuous(expand = c(0, 0), position = "right") #ggsave("motif_coreenrichment_region_open_vs_close_sorted_v2.svg", plot = plot1, width = 10, height = 14, units = "in") ############################# dx <- rex$all_significant dx_subset <- dx[dx$`Motif_Name` %in% unique(c(rex$open_unique_motifs, rex$close_unique_motifs)),] dx_subset$TF_Family <- factor(dx_subset$TF_Family, levels =rev(unique(dx_subset$TF_Family) )) dx_subset$TF_Name <- ifelse(duplicated(dx_subset$TF_Name), paste0(dx_subset$TF_Name, '-',dx_subset$TF_Family) , dx_subset$TF_Name) top20 <- dx_subset %>% group_by(dataset) %>% slice_head(n = 20) %>% mutate(rank = row_number()) ############################# ## load data with annotation ############################# library(readxl) top_results_close <- data.frame(read_excel('/beegfs/scratch/ric.dimicco/ric.dimicco/DiMiccoR_EL_90-1049491329_ATACseq/results/05-filter_qual_bos_qual25/multibamc/Top_20_Motif_analysis_annotated.xls', sheet = 'Open_in_Low')) colnames(top_results_close)[14] <- 'Annotation' top_results_open <- data.frame(read_excel('results/05-filter_qual_bos_qual25/multibamc/Top_20_Motif_analysis_annotated.xls', sheet = 'Open_in_High')) colnames(top_results_open)[14] <- 'Annotation' top20_annotated <- left_join(top20, top_results_close[, c('TF_Name','Annotation')], by = 'TF_Name') top20_annotated_all <- left_join(top20_annotated, top_results_open[, c('TF_Name','Annotation')], by = 'TF_Name') top <- data.frame(top20_annotated_all) top$Annotation <- ifelse(is.na(top$Annotation.x), top$Annotation.y, top$Annotation.x) my_colors <- c( "HSC stemness" = "#a6cee3", "HSC self-renewal" = "#33a02c", "HSC homeostasis" = "#ffd92f", "HSC senescence" = "#bdbdbd", "Oxidative stress response" = "#b2df8a", "Interferon signaling" = "#e31a1c", "Myeloid committment" = "#1f78b4", "Myegakaryo/erythroid committment" = "#ff7f00", "Myeloid/erythroid committment" = "#6a3d9a", "Lymphoid committment" = "#fb9a99" ) top$Dataset <- ifelse(top$dataset == 'Close_DARs', 'Open in Low', 'Open in High') p <- ggplot(top, aes( x=reorder(TF_Name,Log_P_value), y =-Log_P_value))+ geom_bar(aes(fill = Annotation), stat = "identity", position = "dodge",alpha = 0.7, size = 2)+ facet_wrap(~Dataset , scales = 'free_x',nrow =2)+ #scales = 'free', theme_minimal()+ scale_fill_manual(values = my_colors) + theme( legend.position = 'right', plot.title = element_text(hjust = 0.5), text = element_text(size = 14), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.title.x = element_blank(), plot.margin = margin(t = 20, r = 10, b = 10, l = 10, unit = 'pt'), axis.line = element_line(colour = "black", size = 0.3, linetype = "solid"), axis.text.x = element_text(size=12, color='black', angle = 45, hjust = 1), axis.text.y = element_text(size=12, color='black'), strip.placement = "outside", ) + geom_hline(yintercept = -log(0.05), linetype = "dashed", color = "red") + labs(title = "", subtitle = "", y = "-Log P Value Enrichment", fill = "") + guides(fill = guide_legend(ncol = 1)) ggsave("motif_enrichment_region_open_vs_close_unique_analysis_annotation_v2.svg", plot = p, width = 9, height = 5, units = "in")