Title: | Dynamic Programming Algorithm for Peak Detection in ChIP-Seq Data |
---|---|
Description: | A quadratic time dynamic programming algorithm can be used to compute an approximate solution to the problem of finding the most likely changepoints with respect to the Poisson likelihood, subject to a constraint on the number of segments, and the changes which must alternate: up, down, up, down, etc. For more info read <http://proceedings.mlr.press/v37/hocking15.html> "PeakSeg: constrained optimal segmentation and supervised penalty learning for peak detection in count data" by TD Hocking et al, proceedings of ICML2015. |
Authors: | Toby Dylan Hocking, Guillem Rigaill |
Maintainer: | Toby Dylan Hocking <[email protected]> |
License: | GPL-3 |
Version: | 2024.1.24 |
Built: | 2024-11-20 05:35:24 UTC |
Source: | https://github.com/tdhock/peaksegdp |
List of calc.grad functions: x, features, limits -> gradient.
"calc.grad.list"
"calc.grad.list"
if we have already calculated the linear predictor using fit$predict, this function can be useful.
"calc.loss.from.lp.list"
"calc.loss.from.lp.list"
List of interval regression loss functions: x, feat, lim => numeric.
"calc.loss.list"
"calc.loss.list"
A constrained dynamic programming algorithm (cDPA) can be used to compute the best segmentation with respect to the Poisson likelihood, subject to a constraint on the number of segments, and the changes which must alternate: up, down, up, down, ...
cDPA(count, weight = rep(1, length(count)), maxSegments)
cDPA(count, weight = rep(1, length(count)), maxSegments)
count |
Integer vector of |
weight |
Data weights (normally this is the number of base pairs). |
maxSegments |
Maximum number of segments to consider. |
Toby Dylan Hocking, Guillem Rigaill
fit <- cDPA(c(0, 10, 11, 1), maxSegments=3) stopifnot(fit$ends[3,4] == 3) stopifnot(fit$ends[2,3] == 1)
fit <- cDPA(c(0, 10, 11, 1), maxSegments=3) stopifnot(fit$ends[3,4] == 3) stopifnot(fit$ends[2,3] == 1)
A ChIP-seq experiment was performed to locate the genomic positions of a histone (H3K4me3) in 2 B cell samples (McGill0091, McGill0322) and 2 T cell samples (McGill0002, McGill0004). The short sequence reads (about 100 base pairs each) were aligned to the hg19 reference genome, and the "coverage" in this data set contains the total count of aligned reads at each base pair. It also contains annotated regions determined by an expert who examined scatterplots of the coverage profiles.
data("chr11ChIPseq")
data("chr11ChIPseq")
A named list of 2 data.frames: regions contains annotations about which regions contain or do not contain peaks, and coverage contains the noisy signal.
H3K4me3_TDH_immune chunk 5 in http://cbio.ensmp.fr/~thocking/chip-seq-chunk-db/ which in turn comes from http://epigenomesportal.ca/
data(chr11ChIPseq) library(ggplot2) ann.colors <- c(noPeaks="#f6f4bf", peakStart="#ffafaf", peakEnd="#ff4c4c", peaks="#a445ee") if(interactive() && require(ggplot2)){ ggplot()+ scale_fill_manual("annotation", values=ann.colors, breaks=names(ann.colors))+ penaltyLearning::geom_tallrect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3, fill=annotation), data=chr11ChIPseq$regions, alpha=1/2)+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(sample.id ~ ., scales="free")+ geom_step(aes(chromStart/1e3, count), data=chr11ChIPseq$coverage)+ xlab("position on chr11 (kilo base pairs)") }
data(chr11ChIPseq) library(ggplot2) ann.colors <- c(noPeaks="#f6f4bf", peakStart="#ffafaf", peakEnd="#ff4c4c", peaks="#a445ee") if(interactive() && require(ggplot2)){ ggplot()+ scale_fill_manual("annotation", values=ann.colors, breaks=names(ann.colors))+ penaltyLearning::geom_tallrect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3, fill=annotation), data=chr11ChIPseq$regions, alpha=1/2)+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(sample.id ~ ., scales="free")+ geom_step(aes(chromStart/1e3, count), data=chr11ChIPseq$coverage)+ xlab("position on chr11 (kilo base pairs)") }
For 4 samples on chr11 (hg19), this data set counts the first base pair of aligned reads at each genomic position. In contrast, chr11ChIPseq counts every base pair in each read (and each read is about 100bp, so that means there is some auto-correlation in chr11ChIPseq, but not in chr11first).
data("chr11first")
data("chr11first")
A data frame with 23252 observations on the following 4 variables.
sample.id
a factor with levels for each of 4 samples
chromStart
integer vector: base before, on chr11
chromEnd
integer vector: last base on chr11
count
integer: aligned first base read counts
H3K4me3_TDH_immune chunk 5 in http://cbio.ensmp.fr/~thocking/chip-seq-chunk-db/ which in turn comes from http://epigenomesportal.ca/
data(chr11ChIPseq) data(chr11first) library(ggplot2) ann.colors <- c(noPeaks="#f6f4bf", peakStart="#ffafaf", peakEnd="#ff4c4c", peaks="#a445ee") both <- list(coverage=chr11ChIPseq$coverage, first=chr11first) representations <- NULL one.sample <- "McGill0322" for(data.type in names(both)){ one <- subset(both[[data.type]], sample.id==one.sample) representations <- rbind(representations, data.frame(data.type, one)) } one.sample.regions <- subset( chr11ChIPseq$regions, sample.id==one.sample) if(interactive() && require(ggplot2)){ ggplot()+ scale_fill_manual("annotation", values=ann.colors, breaks=names(ann.colors))+ penaltyLearning::geom_tallrect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3, fill=annotation), data=one.sample.regions, alpha=1/2)+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(data.type ~ ., scales="free")+ geom_step(aes(chromStart/1e3, count), data=representations)+ xlab("position on chr11 (kilo base pairs)") }
data(chr11ChIPseq) data(chr11first) library(ggplot2) ann.colors <- c(noPeaks="#f6f4bf", peakStart="#ffafaf", peakEnd="#ff4c4c", peaks="#a445ee") both <- list(coverage=chr11ChIPseq$coverage, first=chr11first) representations <- NULL one.sample <- "McGill0322" for(data.type in names(both)){ one <- subset(both[[data.type]], sample.id==one.sample) representations <- rbind(representations, data.frame(data.type, one)) } one.sample.regions <- subset( chr11ChIPseq$regions, sample.id==one.sample) if(interactive() && require(ggplot2)){ ggplot()+ scale_fill_manual("annotation", values=ann.colors, breaks=names(ann.colors))+ penaltyLearning::geom_tallrect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3, fill=annotation), data=one.sample.regions, alpha=1/2)+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(data.type ~ ., scales="free")+ geom_step(aes(chromStart/1e3, count), data=representations)+ xlab("position on chr11 (kilo base pairs)") }
These data are used to test the PeakSegDP algorithm, to make sure it gives sensible results, even when there are few data.
data("H3K36me3.AM.immune.19")
data("H3K36me3.AM.immune.19")
Named list of 21 data.frames, each with columns chromStart, chromEnd, count.
http://cbio.ensmp.fr/~thocking/chip-seq-chunk-db/ data set H3K36me3_AM_immune, chunk id 19
these data caused a bug in multiSampleSegHeuristic.
data("H3K36me3.TDH.other.chunk3.cluster4")
data("H3K36me3.TDH.other.chunk3.cluster4")
A data frame with 36914 observations on the following 4 variables.
sample.id
a factor with 8 levels
chromStart
integer vector
chromEnd
integer vector
count
integer vector
http://cbio.ensmp.fr/~thocking/chip-seq-chunk-db/ data set H3K36me3_TDH_other chunk 3.
26 samples, each with the same overlapping peak(s).
data("H3K4me3.TDH.immune.chunk12.cluster4")
data("H3K4me3.TDH.immune.chunk12.cluster4")
A data frame.
http://cbio.ensmp.fr/~thocking/chip-seq-chunk-db H3K4me3_TDH_immune data set, chunk.id=12.
Compute the PeakSeg model on a data.frame of compressed
sequence
reads.
PeakSegDP(compressed, maxPeaks)
PeakSegDP(compressed, maxPeaks)
compressed |
data.frame with columns chromStart, chromEnd, count. |
maxPeaks |
maximum number of peaks to consider. |
Toby Dylan Hocking, Guillem Rigaill
library(PeakSegDP) data(chr11ChIPseq, envir=environment()) one <- subset(chr11ChIPseq$coverage, sample.id=="McGill0002")[10000:12000,] fit <- PeakSegDP(one, 3L) if(interactive() && require(ggplot2)){ ggplot()+ geom_step(aes(chromStart/1e3, count), data=one)+ geom_segment(aes(chromStart/1e3, mean, xend=chromEnd/1e3, yend=mean), data=fit$segments, color="green")+ geom_segment(aes(chromStart/1e3, 0, xend=chromEnd/1e3, yend=0), data=subset(fit$segments, status=="peak"), size=3, color="deepskyblue")+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(peaks ~ ., scales="free", labeller=function(df){ s <- ifelse(df$peaks==1, "", "s") df$peaks <- paste0(df$peaks, " peak", s) df }) }
library(PeakSegDP) data(chr11ChIPseq, envir=environment()) one <- subset(chr11ChIPseq$coverage, sample.id=="McGill0002")[10000:12000,] fit <- PeakSegDP(one, 3L) if(interactive() && require(ggplot2)){ ggplot()+ geom_step(aes(chromStart/1e3, count), data=one)+ geom_segment(aes(chromStart/1e3, mean, xend=chromEnd/1e3, yend=mean), data=fit$segments, color="green")+ geom_segment(aes(chromStart/1e3, 0, xend=chromEnd/1e3, yend=0), data=subset(fit$segments, status=="peak"), size=3, color="deepskyblue")+ theme_bw()+ theme(panel.margin=grid::unit(0, "cm"))+ facet_grid(peaks ~ ., scales="free", labeller=function(df){ s <- ifelse(df$peaks==1, "", "s") df$peaks <- paste0(df$peaks, " peak", s) df }) }
Compute the weighted Poisson loss function, which is seg.mean
-
count
* log(seg.mean
). The edge case is when the mean is zero, in
which case the probability mass function takes a value of 1 when
the data is 0 (and 0 otherwise). Thus the log-likelihood of a
maximum likelihood segment with mean zero must be zero.
PoissonLoss(count, seg.mean, weight = 1)
PoissonLoss(count, seg.mean, weight = 1)
count |
count |
seg.mean |
seg.mean |
weight |
weight |
Toby Dylan Hocking, Guillem Rigaill
PoissonLoss(1, 1) PoissonLoss(0, 0) PoissonLoss(1, 0) PoissonLoss(0, 1)
PoissonLoss(1, 1) PoissonLoss(0, 0) PoissonLoss(1, 0) PoissonLoss(0, 1)
List of regression functions: features, limits -> list.
"regression.funs"
"regression.funs"