Sys.setenv(RETICULATE_PYTHON = "/home/philipp/miniconda3/envs/r-reticulate/bin/python3")
suppressPackageStartupMessages({
library(reticulate)
library(rtracklayer)
library(GenomicAlignments)
library(BRGenomics)
library(BSgenome.Mmusculus.UCSC.mm10)
library(tidyverse)
library(furrr)
library(motifmatchr)
library(JASPAR2020)
library(TFBSTools)
})
# Setup Multiprocessing
options(future.globals.maxSize=5000*1024^2)
future::plan(future::multisession, workers = 4)01 Prepare Input
1 Libraries
2 Configs
2.1 Output Directory
output_dir <- "/home/philipp/BPNet/input/"
if (!dir.exists(output_dir)){dir.create(output_dir)}
TFs <- c("Sox2", "Oct4", "Klf4", "Nanog")
purrr::walk(c(TFs, "patchcap", "figures"), ~ if (!dir.exists(paste0(output_dir, .x))){dir.create(paste0(output_dir, .x))})2.2 Peak Width
As in the BPNet paper, we are using a peak width of 1000 bp, meaning we consider 500 bp up- and downstream of the ChIP-seq peaks.
peak_width <- 10002.3 Chromosome Train/Tune/Test split
all_chroms <- paste0("chr", c(1:19, "X", "Y"))
# we use the same train/tune/test split as in the BPnet paper
chrom_list <- list("tune" = c("chr2", "chr3", "chr4"), # tune set (hyperparameter tuning): chromosomes 2, 3, 4
"test" = c("chr1", "chr8", "chr9")) # test set (performance evaluation): chromosome 1, 8, 9
chrom_list$train <- setdiff(all_chroms, c(chrom_list$tune, chrom_list$test)) # train set: all other chromosomes
chrom_list$tune
[1] "chr2" "chr3" "chr4"
$test
[1] "chr1" "chr8" "chr9"
$train
[1] "chr5" "chr6" "chr7" "chr10" "chr11" "chr12" "chr13" "chr14" "chr15"
[10] "chr16" "chr17" "chr18" "chr19" "chrX" "chrY"
2.4 Colors
colors = c("Klf4" = "#92C592",
"Nanog" = "#FFE03F",
"Oct4" = "#CD5C5C",
"Sox2" = "#849EEB",
"patchcap" = "#827F81")3 Function for One-Hot Encoding
Traditionally DNA sequences are encoded using: A = [1, 0, 0, 0], C = [0, 1, 0, 0], G = [0, 0, 1, 0], T = [0, 0, 0, 1], N = [0, 0, 0, 0].
one_hot <- function(sequence) {
len = nchar(sequence)
mtx <- matrix(data=0, nrow=4, ncol=len)
for (i in 1:len) {
if (substr(sequence, i, i) == "A") mtx[1, i] = 1
else if (substr(sequence, i, i) == "C") mtx[2, i] = 1
else if (substr(sequence, i, i) == "G") mtx[3, i] = 1
else if (substr(sequence, i, i) == "T") mtx[4, i] = 1
}
return(mtx)
}
compiler::enableJIT(0)[1] 3
one_hot_c <- compiler::cmpfun(one_hot) # compiling to bytecode not machine code like numba4 Read Stats
Look at the reading stats per TF to check how comparable the different ChIP-Nexus experiments are.
total_reads <- TFs %>%
purrr::set_names() %>%
purrr::map_dbl(function(tf) {
Rsamtools::countBam(paste0("../data/chip-nexus/", tf, "/pool_filt.bam"))$records
})ggplot() +
geom_bar(aes(y=names(total_reads), x=total_reads/1e6), stat="identity", fill="blue", alpha=0.4, width=0.4) +
labs(x="N", y="TF") +
ggdark::dark_theme_linedraw()Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().

5 Load the Peak Data
For each transcription factor (TF) we read in the peak information provided by the authors of the BPNet paper. The peaks were called using MACS2 (v.2.1.1.20160309). The following non-default settings were used: shift=-75, extsize=150, meaning the 5’ ends of each read were extended 75 bp in both directions.
In particular shift=-75 means the 5’ ends of the reads are moved -75 bp and extsize=150 means the reads are extended to a fixed length of 150 bp in 5’ -> 3’ direction.
Click to view sketch
Check the macs3 callpeak documentation here
peak_infos <- purrr::map(TFs, function(tf) {
narrowPeaks <-
rtracklayer::import(paste0("/home/philipp/AML_Final_Project/data/chip-nexus/", tf, "/idr-optimal-set.narrowPeak"),
format="narrowPeak")
# read in the peak summits and extend symmetrically to get 1000 bp width
summits <-
rtracklayer::import(paste0("/home/philipp/AML_Final_Project/data/chip-nexus/", tf, "/idr-optimal-set.summit.bed"),
format="bed") %>%
GenomicRanges::resize(width=peak_width, fix="center") %>%
plyranges::mutate(TF = tf) %>%
plyranges::mutate(set = dplyr::case_when(
as.character(seqnames) %in% chrom_list$tune ~ "tune",
as.character(seqnames) %in% chrom_list$test ~ "test",
as.character(seqnames) %in% chrom_list$train ~ "train"))
GenomicRanges::elementMetadata(summits) <- cbind(
GenomicRanges::elementMetadata(summits),
GenomicRanges::elementMetadata(narrowPeaks)
)
summits
}) %>%
do.call(what=c, args=.)
# save peak info as tsv
write_delim(data.frame(peak_infos) %>% dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)),
file=paste0(output_dir, "region_info.tsv"), delim = "\t")
peak_infos %>% headGRanges object with 6 ranges and 8 metadata columns:
seqnames ranges strand | TF set name
<Rle> <IRanges> <Rle> | <character> <character> <character>
[1] chrX 143482559-143483558 * | Sox2 train <NA>
[2] chrY 4149724-4150723 * | Sox2 train <NA>
[3] chr9 3001634-3002633 * | Sox2 test <NA>
[4] chr3 122145078-122146077 * | Sox2 tune <NA>
[5] chr5 28209565-28210564 * | Sox2 train <NA>
[6] chr8 86393561-86394560 * | Sox2 test <NA>
score signalValue pValue qValue peak
<numeric> <numeric> <numeric> <numeric> <integer>
[1] 1000 65.04316 37093.23 37086.70 145
[2] 1000 59.28276 2920.20 2914.44 142
[3] 1000 7.05411 2326.30 2320.59 128
[4] 1000 25.36805 1518.00 1512.39 407
[5] 1000 29.15046 1502.75 1497.15 292
[6] 1000 27.50292 1455.06 1449.46 307
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
5.1 Distribution of the MACS2 Scores
p <- peak_infos %>%
as.data.frame() %>%
ggplot() +
geom_density(aes(x=log2(qValue), color=TF), alpha=0.4, fill=NA) +
labs(x="Log2 qValue", y="Density", color="TF") +
scale_color_manual(values=colors) +
theme_bw()
ggsave(filename = "/home/philipp/BPNet/out/figures/macs2_scores_per_tf.pdf",
plot = p, width = 4, height = 4)
p +
ggdark::dark_theme_linedraw()
5.2 Number of Peaks per TF
p <- peak_infos %>%
as.data.frame() %>%
ggplot() +
geom_bar(aes(y=TF, fill=TF), alpha=0.4, width=0.4, color="black") +
labs(x="Number of Peaks", y="TF") +
theme_bw() +
scale_fill_manual(values=colors)
ggsave(filename = "/home/philipp/BPNet/out/figures/n_peaks_per_tf.pdf",
plot = p, width = 4, height = 4)
p +
ggdark::dark_theme_linedraw()
6 Translate Peak Sequences to One-Hot Encoding
seq_names <- purrr::imap(chrom_list, function(set_chroms, set_name) {
peak_info_subset <- peak_infos[seqnames(peak_infos) %in% set_chroms]
rnames <- paste0(as.character(peak_info_subset@seqnames), ":",
peak_info_subset@ranges@start, "-",
peak_info_subset@ranges@start + peak_info_subset@ranges@width - 1)
write_lines(rnames, file = paste0(output_dir, set_name, "_seq_names.txt"))
rnames
})str(seq_names)List of 3
$ tune : chr [1:29277] "chr3:122145078-122146077" "chr3:108433158-108434157" "chr2:52071742-52072741" "chr4:141565124-141566123" ...
$ test : chr [1:27727] "chr9:3001634-3002633" "chr8:86393561-86394560" "chr9:102205227-102206226" "chr1:134534776-134535775" ...
$ train: chr [1:93904] "chrX:143482559-143483558" "chrY:4149724-4150723" "chr5:28209565-28210564" "chr12:74803793-74804792" ...
one_hot_seqs <- furrr::future_imap(chrom_list, function(set_chroms, set_name) {
peak_info_subset <- peak_infos[seqnames(peak_infos) %in% set_chroms]
peak_seqs <- BSgenome::getSeq(
BSgenome.Mmusculus.UCSC.mm10,
names=as.character(peak_info_subset@seqnames),
start=(peak_info_subset@ranges@start),
end=(peak_info_subset@ranges@start + peak_width - 1),
strand="+"
)
peak_seqs@ranges@NAMES <- seq_names[[set_name]]
# check for correctness here: https://genome.ucsc.edu/cgi-bin/hgTracks?db=mm10
Biostrings::writeXStringSet(x=peak_seqs,
filepath = paste0(output_dir, set_name, "_seqs.fa"))
# one hot encode the sequence
one_hot_mtx <- matrix(data=0, nrow=length(peak_seqs)*4, ncol=peak_width)
for (peak_index in 1:length(peak_seqs)) {
one_hot_mtx[(((peak_index-1)*4)+1):((peak_index)*4), ] <-
one_hot_c(as.character(peak_seqs[[peak_index]]))
}
one_hot_mtx
})Check the output:
str(one_hot_seqs)List of 3
$ tune : num [1:117108, 1:1000] 1 0 0 0 0 1 0 0 0 0 ...
$ test : num [1:110908, 1:1000] 0 0 0 1 0 1 0 0 0 0 ...
$ train: num [1:375616, 1:1000] 0 0 0 1 1 0 0 0 0 1 ...
Write the matrices in binary format to disk using reticulate.
import numpy as np
for key, val in r.one_hot_seqs.items():
old_shape = val.shape
val = val.reshape(int(val.shape[0]/4), 4, 1000).astype(np.float32)
print(f"{key}: {old_shape} -> {val.shape}")
save_path = f"{r.output_dir}{key}_one_hot_seqs.npy"
print(f"-> {save_path}")
np.save(file=save_path, arr=val)tune: (117108, 1000) -> (29277, 4, 1000)
-> /home/philipp/BPNet/input/tune_one_hot_seqs.npy
test: (110908, 1000) -> (27727, 4, 1000)
-> /home/philipp/BPNet/input/test_one_hot_seqs.npy
train: (375616, 1000) -> (93904, 4, 1000)
-> /home/philipp/BPNet/input/train_one_hot_seqs.npy
7 Extract the TF Counts
First we have to merge overlapping peaks from the peak info file. Otherwise we read reads aligning to these overlapping peaks several times.
peak_infos_reduced <- GenomicRanges::reduce(resize(peak_infos, GenomicRanges::width(peak_infos + 1), "start"))
peak_infos_reducedGRanges object with 85250 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chrX 5704156-5705557 *
[2] chrX 6278899-6280002 *
[3] chrX 6411759-6412760 *
[4] chrX 6415698-6416699 *
[5] chrX 6445022-6446023 *
... ... ... ...
[85246] chr10 130523572-130524574 *
[85247] chr10 130527565-130528566 *
[85248] chr10 130535527-130536528 *
[85249] chr10 130541495-130543475 *
[85250] chr10 130594405-130595406 *
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
### Parallel loop over all tfs
tf_counts <- TFs %>%
purrr::set_names() %>%
furrr::future_map(function(tf) {
# read only from the alignment file in the given peak regions
# note: make sure that the BAM file is sorted (check for presence of ".bam.bai")
alignments <- readGAlignments(paste0("../data/chip-nexus/", tf, "/pool_filt.bam"),
param = ScanBamParam(which=peak_infos_reduced))
# split the alignment into pos and neg strand
align_pos <- alignments[GenomicRanges::strand(alignments)=="+"]
align_neg <- alignments[GenomicRanges::strand(alignments)=="-"]
# only retain first base pair of each read
align_pos@cigar <- rep("1M", length(align_pos))
align_neg@start <- GenomicAlignments::end(align_neg)
align_neg@cigar <- rep("1M", length(align_neg))
# compute the coverage per base pair
cov_list = list("pos" = GenomicAlignments::coverage(align_pos, weight = 1L),
"neg" = GenomicAlignments::coverage(align_neg, weight = 1L))
chrom_list %>%
purrr::imap(function(set_chroms, set_name) {
peak_info_subset <- peak_infos[seqnames(peak_infos) %in% set_chroms]
c("pos", "neg") %>%
purrr::set_names() %>%
purrr::map(function(strand) {
mtx <- matrix(data=0, ncol=peak_width, nrow=length(peak_info_subset))
for (i in 1:length(peak_info_subset)) {
chr <- as.character(peak_info_subset[i]@seqnames)
position_index <- peak_info_subset[i]@ranges@start:(peak_info_subset[i]@ranges@start + peak_width - 1)
mtx[i, ] = as.numeric(cov_list[[strand]][[chr]][position_index])
}
mtx
})
})
})str(tf_counts)List of 4
$ Sox2 :List of 3
..$ tune :List of 2
.. ..$ pos: num [1:29277, 1:1000] 0 0 0 0 0 0 0 0 1 0 ...
.. ..$ neg: num [1:29277, 1:1000] 0 0 1 0 0 0 0 0 1 0 ...
..$ test :List of 2
.. ..$ pos: num [1:27727, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:27727, 1:1000] 0 0 0 0 0 0 0 1 0 0 ...
..$ train:List of 2
.. ..$ pos: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
$ Oct4 :List of 3
..$ tune :List of 2
.. ..$ pos: num [1:29277, 1:1000] 1 1 1 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:29277, 1:1000] 1 0 1 0 0 0 0 0 0 0 ...
..$ test :List of 2
.. ..$ pos: num [1:27727, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:27727, 1:1000] 0 0 0 0 0 2 0 0 0 0 ...
..$ train:List of 2
.. ..$ pos: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 2 0 ...
.. ..$ neg: num [1:93904, 1:1000] 0 0 1 0 0 0 0 0 0 0 ...
$ Klf4 :List of 3
..$ tune :List of 2
.. ..$ pos: num [1:29277, 1:1000] 0 0 1 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:29277, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
..$ test :List of 2
.. ..$ pos: num [1:27727, 1:1000] 0 1 0 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:27727, 1:1000] 0 0 0 0 0 3 0 0 0 0 ...
..$ train:List of 2
.. ..$ pos: num [1:93904, 1:1000] 0 0 0 0 0 1 0 0 2 0 ...
.. ..$ neg: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
$ Nanog:List of 3
..$ tune :List of 2
.. ..$ pos: num [1:29277, 1:1000] 1 0 1 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:29277, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
..$ test :List of 2
.. ..$ pos: num [1:27727, 1:1000] 0 1 0 1 0 0 0 0 0 0 ...
.. ..$ neg: num [1:27727, 1:1000] 0 1 0 0 0 1 0 1 0 0 ...
..$ train:List of 2
.. ..$ pos: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
.. ..$ neg: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
Write the matrices in binary format to disk using reticulate.
import numpy as np
for tf_name, tf_entry in r.tf_counts.items():
print(tf_name)
for set_name, set_entry in tf_entry.items():
mtx = np.stack([set_entry["pos"], set_entry["neg"]], axis=1).astype(np.float32)
print(f"\t{set_name} shape: {mtx.shape}")
save_path = f"{r.output_dir}{tf_name}/{set_name}_counts.npy"
print(f"\t-> {save_path}")
np.save(file=save_path, arr=mtx)Sox2
tune shape: (29277, 2, 1000)
-> /home/philipp/BPNet/input/Sox2/tune_counts.npy
test shape: (27727, 2, 1000)
-> /home/philipp/BPNet/input/Sox2/test_counts.npy
train shape: (93904, 2, 1000)
-> /home/philipp/BPNet/input/Sox2/train_counts.npy
Oct4
tune shape: (29277, 2, 1000)
-> /home/philipp/BPNet/input/Oct4/tune_counts.npy
test shape: (27727, 2, 1000)
-> /home/philipp/BPNet/input/Oct4/test_counts.npy
train shape: (93904, 2, 1000)
-> /home/philipp/BPNet/input/Oct4/train_counts.npy
Klf4
tune shape: (29277, 2, 1000)
-> /home/philipp/BPNet/input/Klf4/tune_counts.npy
test shape: (27727, 2, 1000)
-> /home/philipp/BPNet/input/Klf4/test_counts.npy
train shape: (93904, 2, 1000)
-> /home/philipp/BPNet/input/Klf4/train_counts.npy
Nanog
tune shape: (29277, 2, 1000)
-> /home/philipp/BPNet/input/Nanog/tune_counts.npy
test shape: (27727, 2, 1000)
-> /home/philipp/BPNet/input/Nanog/test_counts.npy
train shape: (93904, 2, 1000)
-> /home/philipp/BPNet/input/Nanog/train_counts.npy
7.1 Examples: Highest Scoring Peaks
Click to expand
test_df <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="train") %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::group_by(TF) %>%
slice_max(order_by=qValue, n=5) %>%
select(Region, TF, qValue)
purrr::walk(1:nrow(test_df), function(i) {
test_instance <- unlist(test_df[i, ])
purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -tf_counts[[tf]]$train$neg[match(test_instance["Region"], seq_names$train), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
pivot_longer(cols=c(minus, plus)) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=name)) +
facet_wrap(~TF, scales="free_y") +
ggdark::dark_theme_linedraw() +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=paste0("Peak ", test_instance["Region"], " | ", test_instance["TF"],
" | ", test_instance["qValue"])) -> p
print(p)
})



















7.2 Examples: Random Peaks
Click to expand
set.seed(42)
test_df <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="train") %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::group_by(TF) %>%
slice_sample(n=5) %>%
select(Region, TF, qValue)
purrr::walk(1:nrow(test_df), function(i) {
test_instance <- unlist(test_df[i, ])
purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -tf_counts[[tf]]$train$neg[match(test_instance["Region"], seq_names$train), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
pivot_longer(cols=c(minus, plus)) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=name)) +
facet_wrap(~TF, scales="free_y") +
ggdark::dark_theme_linedraw() +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=paste0("Peak ", test_instance["Region"], " | ", test_instance["TF"],
" | ", test_instance["qValue"])) -> p
print(p)
})



















8 Extract the Control (Patchcap) Counts
patchcap_input <-
c("pos", "neg") %>%
purrr::set_names() %>%
purrr::map(function(strand){
rtracklayer::import.bw(paste0("../data/chip-nexus/patchcap/counts.", strand, ".bw")) %>%
GenomicRanges::coverage(., weight = "score")
})
ctrl_counts <- furrr::future_imap(chrom_list, function(set_chroms, set_name) {
peak_info_subset <- peak_infos[seqnames(peak_infos) %in% set_chroms]
c("pos", "neg") %>%
purrr::set_names() %>%
purrr::map(function(strand) {
mtx <- matrix(data=0, ncol=peak_width, nrow=length(peak_info_subset))
for (seq_index in 1:nrow(mtx)) {
chrom_index <- as.character(peak_info_subset[seq_index]@seqnames)
position_index <- peak_info_subset[seq_index]@ranges@start:(peak_info_subset[seq_index]@ranges@start+peak_width-1)
mtx[seq_index, ] = as.numeric(patchcap_input[[strand]][[chrom_index]][position_index])
}
mtx
})
})str(ctrl_counts)List of 3
$ tune :List of 2
..$ pos: num [1:29277, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
..$ neg: num [1:29277, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
$ test :List of 2
..$ pos: num [1:27727, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
..$ neg: num [1:27727, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
$ train:List of 2
..$ pos: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 1 0 ...
..$ neg: num [1:93904, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
Write the matrices in binary format to disk using reticulate
import numpy as np
for set_name, set_entry in r.ctrl_counts.items():
mtx = np.stack([set_entry["pos"], set_entry["neg"]], axis=1).astype(np.float32)
print(f"\t{set_name} shape: {mtx.shape}")
save_path = f"{r.output_dir}patchcap/{set_name}_counts.npy"
print(f"\t-> {save_path}")
np.save(file=save_path, arr=mtx) tune shape: (29277, 2, 1000)
-> /home/philipp/BPNet/input/patchcap/tune_counts.npy
test shape: (27727, 2, 1000)
-> /home/philipp/BPNet/input/patchcap/test_counts.npy
train shape: (93904, 2, 1000)
-> /home/philipp/BPNet/input/patchcap/train_counts.npy
8.1 Examples: Highest Scoring Peaks with Bias
test_instance <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="test") %>%
dplyr::filter(seqnames=="chr1", start >= 180924752-1000, end <= 180925152+1000) %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::select(Region, TF, qValue) %>%
.[1, ]
print(test_instance) Region TF qValue
1 chr1:180924435-180925434 Sox2 436.0653
p <- purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$test$pos[match(test_instance["Region"], seq_names$test), ],
minus = -tf_counts[[tf]]$test$neg[match(test_instance["Region"], seq_names$test), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
rbind(
tibble::tibble(position=-499:500,
plus = ctrl_counts$test$pos[match(test_instance["Region"], seq_names$test), ],
minus = -ctrl_counts$test$neg[match(test_instance["Region"], seq_names$test), ],
TF = "patchcap", p_name = test_instance["Region"])
) %>%
pivot_longer(cols=c(minus, plus)) %>%
dplyr::mutate(TF = factor(TF, levels=names(colors))) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=TF, alpha=name), size=0.2) +
facet_wrap(~TF, ncol=1, scales="free_y") +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=test_instance["Region"]) +
#scale_color_manual(values=c("minus"="darkred", "plus"="forestgreen")) +
scale_color_manual(values=colors) +
scale_alpha_manual(values=c("plus" = 1, "minus" = 1)) +
theme_bw() +
theme(plot.title = element_text(size=10, hjust=0.5), strip.background = element_rect(fill=NA))
ggsave(filename = "/home/philipp/BPNet/out/figures/example_high_q.pdf",
plot = p, width = 4, height = 8)
p +
ggdark::dark_theme_linedraw()
Click to view more plots
test_df <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="train") %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::group_by(TF) %>%
slice_max(order_by=qValue, n=5) %>%
select(Region, TF, qValue)
purrr::walk(1:nrow(test_df), function(i) {
test_instance <- unlist(test_df[i, ])
purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -tf_counts[[tf]]$train$neg[match(test_instance["Region"], seq_names$train), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
rbind(
tibble::tibble(position=-499:500,
plus = ctrl_counts$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -ctrl_counts$train$neg[match(test_instance["Region"], seq_names$train), ],
TF = "Bias", p_name = test_instance["Region"])
) %>%
pivot_longer(cols=c(minus, plus)) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=name)) +
facet_wrap(~TF, scales="free_y") +
ggdark::dark_theme_linedraw() +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=paste0("Peak ", test_instance["Region"], " | ", test_instance["TF"],
" | ", test_instance["qValue"])) -> p
print(p)
})



















8.2 Examples: Random Peaks with Bias
test_instance <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="test") %>%
dplyr::filter(seqnames=="chr1", start >= 4000, end <= 100000000) %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::select(Region, TF, qValue) %>%
.[1000, ]
print(test_instance) Region TF qValue
1000 chr1:58392603-58393602 Oct4 21.10429
p <- purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$test$pos[match(test_instance["Region"], seq_names$test), ],
minus = -tf_counts[[tf]]$test$neg[match(test_instance["Region"], seq_names$test), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
rbind(
tibble::tibble(position=-499:500,
plus = ctrl_counts$test$pos[match(test_instance["Region"], seq_names$test), ],
minus = -ctrl_counts$test$neg[match(test_instance["Region"], seq_names$test), ],
TF = "patchcap", p_name = test_instance["Region"])
) %>%
pivot_longer(cols=c(minus, plus)) %>%
dplyr::mutate(TF = factor(TF, levels=names(colors))) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=TF, alpha=name), size=0.2) +
facet_wrap(~TF, ncol=1, scales="free_y") +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=test_instance["Region"]) +
#scale_color_manual(values=c("minus"="darkred", "plus"="forestgreen")) +
scale_color_manual(values=colors) +
scale_alpha_manual(values=c("plus" = 1, "minus" = 1)) +
theme_bw() +
theme(plot.title = element_text(size=10, hjust=0.5), strip.background = element_rect(fill=NA))
ggsave(filename = "/home/philipp/BPNet/out/figures/example_low_q.pdf",
plot = p, width = 4, height = 8)
p +
ggdark::dark_theme_linedraw()
Click to view more plots
set.seed(42)
test_df <- peak_infos %>%
as.data.frame() %>%
dplyr::filter(set=="train") %>%
dplyr::mutate(Region = paste0(seqnames, ":", start, "-", end)) %>%
dplyr::group_by(TF) %>%
slice_sample(n=5) %>%
select(Region, TF, qValue)
purrr::walk(1:nrow(test_df), function(i) {
test_instance <- unlist(test_df[i, ])
purrr::map_dfr(TFs, function(tf){
tibble::tibble(position=-499:500,
plus = tf_counts[[tf]]$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -tf_counts[[tf]]$train$neg[match(test_instance["Region"], seq_names$train), ]) %>%
dplyr::mutate(TF = tf, p_name = test_instance["Region"])
}) %>%
rbind(
tibble::tibble(position=-499:500,
plus = ctrl_counts$train$pos[match(test_instance["Region"], seq_names$train), ],
minus = -ctrl_counts$train$neg[match(test_instance["Region"], seq_names$train), ],
TF = "Bias", p_name = test_instance["Region"])
) %>%
pivot_longer(cols=c(minus, plus)) %>%
ggplot() +
geom_line(aes(x=position, y=value, color=name)) +
facet_wrap(~TF, scales="free_y") +
ggdark::dark_theme_linedraw() +
labs(x="Relative Position [bp]", y="Counts", color="Strand",
title=paste0("Peak ", test_instance["Region"], " | ", test_instance["TF"],
" | ", test_instance["qValue"])) -> p
print(p)
})



















9 Figure 1e
9.1 From Raw Data
roi <- list("seqname"="chr1", "start"=180924752, "end"=180925152)
TFs <- c("Oct4", "Sox2", "Nanog", "Klf4")
df <-
purrr::map_dfr(TFs, function(tf) {
alignments <- readGAlignments(paste0("../data/chip-nexus/", tf, "/pool_filt.bam"),
param = ScanBamParam(which=GRanges(paste0(roi$seqname, ":", roi$start, "-", roi$end))))
align_pos <- alignments[strand(alignments)=="+"]
align_neg <- alignments[strand(alignments)=="-"]
align_pos@cigar <- rep("1M", length(align_pos))
align_neg@start <- GenomicAlignments::end(align_neg)
align_neg@cigar <- rep("1M", length(align_neg))
tibble::tibble(pos=as.numeric(coverage(align_pos)$chr1[roi$start:roi$end]),
neg=-as.numeric(coverage(align_neg)$chr1[roi$start:roi$end]),
position=roi$start:roi$end,
TF=tf)
}) %>%
pivot_longer(cols=c("pos", "neg"))
df# A tibble: 3,208 x 4
position TF name value
<int> <chr> <chr> <dbl>
1 180924752 Oct4 pos 0
2 180924752 Oct4 neg 0
3 180924753 Oct4 pos 2
4 180924753 Oct4 neg 0
5 180924754 Oct4 pos 2
6 180924754 Oct4 neg 0
7 180924755 Oct4 pos 0
8 180924755 Oct4 neg 0
9 180924756 Oct4 pos 0
10 180924756 Oct4 neg 0
# ... with 3,198 more rows
# i Use `print(n = ...)` to see more rows
df %>%
mutate(TF = factor(TF, levels=c(TFs))) %>%
ggplot() +
geom_line(aes(x=position, y=value, col=name)) +
facet_wrap(~TF, ncol=1, scales="free_y") +
labs(x="Position", y="Counts", col="Strand") +
ggdark::dark_theme_linedraw()
9.2 From Processed Data
roi <- list("seqname"="chr1", "start"=180924752, "end"=180925152)
roi_adjusted <- roi
roi_adjusted$start = roi_adjusted$start-1000
roi_adjusted$end = roi_adjusted$end+1000
TFs <- c("Oct4", "Sox2", "Nanog", "Klf4")
df2 <-
purrr::map_dfr(TFs, function(tf) {
cov_list <- list("pos" = tf_counts[[tf]]$test$pos,
"neg" = tf_counts[[tf]]$test$neg)
rnames <- seq_names$test
gr_rnames <- GRanges(rnames)
bool_vec <-
(as.character(gr_rnames@seqnames) == roi_adjusted$seqname &
start(gr_rnames@ranges) >= roi_adjusted$start &
end(gr_rnames@ranges) <= roi_adjusted$end)
peak_index <- which(bool_vec)[1]
peak_info <- gr_rnames[peak_index]
diff <- 180924752 - start(peak_info@ranges) + 1
w <- 400
tibble::tibble(pos=cov_list$pos[peak_index, diff:(diff+w)],
neg=-cov_list$neg[peak_index, diff:(diff+w)],
position=0:400,
TF=tf)
}) %>%
pivot_longer(cols=c("pos", "neg"))
df2# A tibble: 3,208 x 4
position TF name value
<int> <chr> <chr> <dbl>
1 0 Oct4 pos 0
2 0 Oct4 neg 0
3 1 Oct4 pos 2
4 1 Oct4 neg 0
5 2 Oct4 pos 2
6 2 Oct4 neg 0
7 3 Oct4 pos 0
8 3 Oct4 neg 0
9 4 Oct4 pos 0
10 4 Oct4 neg 0
# ... with 3,198 more rows
# i Use `print(n = ...)` to see more rows
df2 %>%
mutate(TF = factor(TF, levels=c(TFs))) %>%
ggplot() +
geom_line(aes(x=position, y=value, col=name)) +
facet_wrap(~TF, ncol=1, scales="free_y") +
labs(x="Position", y="Counts", col="Strand") +
ggdark::dark_theme_linedraw()
Compare.
all(df2$value == df$value)[1] TRUE
all(df2$TF == df$TF)[1] TRUE
all(df2$name == df$name)[1] TRUE
10 Oct4-Sox2 Motif
motif <- TFBSTools::getMatrixByID(JASPAR2020, ID = "MA0142.1")
seqLogo::seqLogo(motif@profileMatrix / colSums(motif@profileMatrix))
w = 400
motif_matches <- motifmatchr::matchMotifs(motif, peak_infos, genome = "mm10", out="position")[[1]] %>%
GenomicRanges::resize(width=w, fix="center") %>%
plyranges::filter(score > 20.8) %>%
plyranges::arrange(score)
saveRDS(motif_matches, "/home/philipp/BPNet/input/oct4_sox2_matches.rds")
motif_matchesGRanges object with 408 ranges and 1 metadata column:
seqnames ranges strand | score
<Rle> <IRanges> <Rle> | <numeric>
[1] chr4 84528359-84528758 + | 20.8166
[2] chr4 84528359-84528758 + | 20.8166
[3] chr12 86188063-86188462 - | 20.8166
[4] chr16 23504381-23504780 + | 20.8166
[5] chr9 14619412-14619811 + | 20.8166
... ... ... ... . ...
[404] chr15 75270453-75270852 + | 23.4055
[405] chr15 80618982-80619381 - | 23.4055
[406] chr7 35617437-35617836 - | 23.4055
[407] chr8 76863085-76863484 + | 23.4055
[408] chr1 75997151-75997550 + | 23.4055
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths
motif_matches_reduced <- GenomicRanges::reduce(resize(motif_matches, GenomicRanges::width(motif_matches + 1), "start"))
### Parallel loop over all tfs
motif_counts <- c("Oct4", "Sox2") %>%
purrr::set_names() %>%
furrr::future_map(function(tf) {
# read only from the alignment file in the given peak regions
# note: make sure that the BAM file is sorted (check for presence of ".bam.bai")
alignments <- readGAlignments(paste0("../data/chip-nexus/", tf, "/pool_filt.bam"),
param = ScanBamParam(which=motif_matches_reduced))
# split the alignment into pos and neg strand
align_pos <- alignments[GenomicRanges::strand(alignments)=="+"]
align_neg <- alignments[GenomicRanges::strand(alignments)=="-"]
# only retain first base pair of each read
align_pos@cigar <- rep("1M", length(align_pos))
align_neg@start <- GenomicAlignments::end(align_neg)
align_neg@cigar <- rep("1M", length(align_neg))
# compute the coverage per base pair
cov_list = list("pos" = GenomicAlignments::coverage(align_pos, weight = 1L),
"neg" = GenomicAlignments::coverage(align_neg, weight = 1L))
c("pos", "neg") %>%
purrr::set_names() %>%
purrr::map(function(strand) {
mtx <- matrix(data=0, ncol=w, nrow=length(motif_matches))
for (i in 1:length(motif_matches)) {
chr <- as.character(motif_matches[i]@seqnames)
position_index <- motif_matches[i]@ranges@start:(motif_matches[i]@ranges@start + w - 1)
mtx[i, ] = as.numeric(cov_list[[strand]][[chr]][position_index])
}
mtx
})
})Plotting the means per motif as in Figure 1c.
rbind(
tibble(pos = 1:ncol(motif_counts$Oct4$pos),
counts_pos = colMeans(motif_counts$Oct4$pos),
counts_neg = -colMeans(motif_counts$Oct4$neg),
TF = "Oct4"),
tibble(pos = 1:ncol(motif_counts$Sox2$pos),
counts_pos = colMeans(motif_counts$Sox2$pos),
counts_neg = -colMeans(motif_counts$Sox2$neg),
TF = "Sox2")
) %>%
ggplot() +
geom_line(aes(x=pos, y=counts_pos, color=TF), size=0.5) +
geom_line(aes(x=pos, y=counts_neg, color=TF), size=0.5) +
scale_color_manual(values=c("Oct4" = "darkred", "Sox2" = "blue")) +
labs(x="Position [bp]", y="Coverage", color="TF") +
ggdark::dark_theme_linedraw()
Plot the coverage per motif and per sequence as in Figure 1b
df <-
purrr::imap_dfr(motif_counts, function(cov, tf) {
purrr::imap_dfr(cov, function(mtx, strand) {
purrr::map_dfr(1:nrow(mtx), function(i) {
tibble::tibble(seq = i,
idx = (-w/2):(w/2 -1),
counts = mtx[i, ],
s = strand,
TF = tf)
})
})
})Plot via log2 FC at each position
pcount = 1
df %>%
pivot_wider(names_from=s, values_from=counts) %>%
dplyr::filter(TF=="Oct4") %>%
dplyr::mutate(diff = log2(pos+pcount) - log2(neg+pcount)) %>%
ggplot(aes(y=seq, x=idx)) +
geom_raster(aes(fill=diff)) +
scale_fill_gradient2(low = "blue", mid="black", high="red") +
theme_classic() +
labs(x = "Relative Postion [bp]", y = "Top Oct4-Sox2 Motifs", fill = "Log2 FC",
title="Oct4") +
ggdark::dark_mode()
df %>%
pivot_wider(names_from=s, values_from=counts) %>%
dplyr::filter(TF=="Sox2") %>%
dplyr::mutate(diff = log2(pos+pcount) - log2(neg+pcount)) %>%
ggplot(aes(y=seq, x=idx)) +
geom_raster(aes(fill=diff)) +
scale_fill_gradient2(low = "blue", mid="black", high="red") +
theme_classic() +
labs(x = "Relative Postion [bp]", y = "Top Oct4-Sox2 Motifs", fill = "Log2 FC",
title="Sox2") +
ggdark::dark_mode()
Plot via facetting
tf = "Oct4"
tmp <- df %>%
dplyr::filter(TF==tf) %>%
dplyr::mutate(counts = ifelse(s=="pos", counts, -counts)) %>%
pull(counts)
df %>%
dplyr::filter(TF==tf) %>%
dplyr::mutate(counts = ifelse(s=="pos", counts, -counts)) %>%
ggplot(aes(y=seq, x=idx)) +
geom_raster(aes(fill=counts)) +
scale_fill_gradient2(low = "blue", mid="black", high="red") +
scale_fill_gradientn(colours = c("red", "black", "blue"),
values = scales::rescale(c(max(tmp), 5, 0, -5, min(tmp)))) +
theme_classic() +
labs(x = "Relative Postion [bp]", y = "Top Oct4-Sox2 Motifs", fill = "Log2 FC",
title=tf) +
ggdark::dark_mode() +
facet_wrap(~s, ncol=1)Scale for 'fill' is already present. Adding another scale for 'fill', which
will replace the existing scale.

tf = "Sox2"
tmp <- df %>%
dplyr::filter(TF==tf) %>%
dplyr::mutate(counts = ifelse(s=="pos", counts, -counts)) %>%
pull(counts)
df %>%
dplyr::filter(TF==tf) %>%
dplyr::mutate(counts = ifelse(s=="pos", counts, -counts)) %>%
ggplot(aes(y=seq, x=idx)) +
geom_raster(aes(fill=counts)) +
scale_fill_gradient2(low = "blue", mid="black", high="red") +
scale_fill_gradientn(colours = c("red", "black", "blue"),
values = scales::rescale(c(max(tmp), 5, 0, -5, min(tmp)))) +
theme_classic() +
labs(x = "Relative Postion [bp]", y = "Top Oct4-Sox2 Motifs", fill = "Log2 FC",
title=tf) +
ggdark::dark_mode() +
facet_wrap(~s, ncol=1)Scale for 'fill' is already present. Adding another scale for 'fill', which
will replace the existing scale.

11 Appendix
11.1 Save Image
save.image(paste0(output_dir, "tmp.RData"))11.2 Session Info
sessionInfo()R version 4.2.1 (2022-06-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale:
[1] C
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] TFBSTools_1.34.0 JASPAR2020_0.99.10
[3] motifmatchr_1.18.0 furrr_0.3.0
[5] future_1.27.0 forcats_0.5.1
[7] stringr_1.4.0 dplyr_1.0.9
[9] purrr_0.3.4 readr_2.1.2
[11] tidyr_1.2.0 tibble_3.1.8
[13] ggplot2_3.3.6 tidyverse_1.3.2
[15] BSgenome.Mmusculus.UCSC.mm10_1.4.3 BSgenome_1.64.0
[17] BRGenomics_1.8.0 GenomicAlignments_1.32.1
[19] Rsamtools_2.12.0 Biostrings_2.64.0
[21] XVector_0.36.0 SummarizedExperiment_1.26.1
[23] Biobase_2.56.0 MatrixGenerics_1.8.1
[25] matrixStats_0.62.0 rtracklayer_1.56.1
[27] GenomicRanges_1.48.0 GenomeInfoDb_1.32.3
[29] IRanges_2.30.0 S4Vectors_0.34.0
[31] BiocGenerics_0.42.0 reticulate_1.26
loaded via a namespace (and not attached):
[1] readxl_1.4.0 backports_1.4.1
[3] plyr_1.8.7 splines_4.2.1
[5] BiocParallel_1.30.3 listenv_0.8.0
[7] digest_0.6.29 htmltools_0.5.3
[9] GO.db_3.15.0 fansi_1.0.3
[11] magrittr_2.0.3 memoise_2.0.1
[13] googlesheets4_1.0.0 tzdb_0.3.0
[15] globals_0.16.0 annotate_1.74.0
[17] modelr_0.1.8 R.utils_2.12.0
[19] colorspace_2.0-3 blob_1.2.3
[21] rvest_1.0.2 haven_2.5.0
[23] xfun_0.32 crayon_1.5.1
[25] RCurl_1.98-1.8 jsonlite_1.8.0
[27] genefilter_1.78.0 TFMPvalue_0.0.8
[29] survival_3.4-0 glue_1.6.2
[31] gtable_0.3.0 gargle_1.2.0
[33] zlibbioc_1.42.0 DelayedArray_0.22.0
[35] ggdark_0.2.1 plyranges_1.16.0
[37] scales_1.2.0 DBI_1.1.3
[39] Rcpp_1.0.9 xtable_1.8-4
[41] bit_4.0.4 httr_1.4.3
[43] RColorBrewer_1.1-3 ellipsis_0.3.2
[45] pkgconfig_2.0.3 XML_3.99-0.10
[47] R.methodsS3_1.8.2 farver_2.1.1
[49] dbplyr_2.2.1 locfit_1.5-9.6
[51] utf8_1.2.2 tidyselect_1.1.2
[53] labeling_0.4.2 rlang_1.0.4
[55] reshape2_1.4.4 AnnotationDbi_1.58.0
[57] munsell_0.5.0 cellranger_1.1.0
[59] tools_4.2.1 cachem_1.0.6
[61] cli_3.3.0 DirichletMultinomial_1.38.0
[63] generics_0.1.3 RSQLite_2.2.16
[65] broom_1.0.0 evaluate_0.16
[67] fastmap_1.1.0 yaml_2.3.5
[69] knitr_1.39 bit64_4.0.5
[71] fs_1.5.2 caTools_1.18.2
[73] KEGGREST_1.36.3 R.oo_1.25.0
[75] poweRlaw_0.70.6 pracma_2.3.8
[77] xml2_1.3.3 compiler_4.2.1
[79] png_0.1-7 reprex_2.0.1
[81] geneplotter_1.74.0 stringi_1.7.8
[83] lattice_0.20-45 CNEr_1.32.0
[85] Matrix_1.5-1 vctrs_0.4.1
[87] pillar_1.8.0 lifecycle_1.0.1
[89] bitops_1.0-7 R6_2.5.1
[91] BiocIO_1.6.0 parallelly_1.32.1
[93] codetools_0.2-18 gtools_3.9.3
[95] assertthat_0.2.1 seqLogo_1.62.0
[97] DESeq2_1.36.0 rjson_0.2.21
[99] withr_2.5.0 GenomeInfoDbData_1.2.8
[101] parallel_4.2.1 hms_1.1.1
[103] grid_4.2.1 rmarkdown_2.14
[105] googledrive_2.0.0 lubridate_1.8.0
[107] restfulr_0.0.15