Worst_case.knit

The time complexity of the algorithm depends on the number of intervals (candidate changepoints stored). Here we compute the mean number of intervals for real Mono27ac data, and synthetic count data which are always increasing.

data(Mono27ac, package="PeakSegDisk", envir=environment())
library(data.table)
loss.list <- list()
N.data.vec <- 10^seq(1, 3)
for(penalty in c(0, 1e2, 1e4, 1e6)){
  for(N.data in N.data.vec){
    some.cov <- Mono27ac$coverage[1:N.data]
    some.inc <- data.table(some.cov)
    some.inc[, count := 1:.N]
    data.list <- list(
      real=some.cov,
      synthetic=some.inc)
    for(short.type in names(data.list)){
      df <- data.list[[short.type]]
      fit <- PeakSegDisk::PeakSegFPOP_df(df, penalty)
      loss.list[[paste(penalty, N.data, short.type)]] <- data.table(
        N.data,
        short.type,
        fit$loss)
    }
  }
}
(loss <- do.call(rbind, loss.list))[, .(
  penalty, short.type, N.data, 
  mean.intervals, max.intervals,
  megabytes, seconds)][order(penalty, short.type, N.data)]
#>     penalty short.type N.data mean.intervals max.intervals    megabytes seconds
#>       <int>     <char>  <num>          <num>         <int>        <num>   <num>
#>  1:       0       real     10         0.9500             1 0.0008850098   0.001
#>  2:       0       real    100         1.8250             4 0.0122909546   0.003
#>  3:       0       real   1000         2.8325             5 0.1614456177   0.023
#>  4:       0  synthetic     10         2.3000             3 0.0013999939   0.001
#>  5:       0  synthetic    100         2.8200             5 0.0160865784   0.003
#>  6:       0  synthetic   1000         2.9920             5 0.1675300598   0.024
#>  7:     100       real     10         1.0500             3 0.0009231567   0.001
#>  8:     100       real    100         3.1350             8 0.0172882080   0.003
#>  9:     100       real   1000        10.9590            30 0.4714469910   0.025
#> 10:     100  synthetic     10         3.3000             7 0.0017814636   0.001
#> 11:     100  synthetic    100         8.5050            17 0.0377731323   0.003
#> 12:     100  synthetic   1000        54.5855           108 2.1356658936   0.048
#> 13:   10000       real     10         3.1000             5 0.0017051697   0.001
#> 14:   10000       real    100         7.5750            12 0.0342254639   0.003
#> 15:   10000       real   1000        11.9290            31 0.5084495544   0.024
#> 16:   10000  synthetic     10         3.8000             6 0.0019721985   0.000
#> 17:   10000  synthetic    100        20.7450            39 0.0844650269   0.003
#> 18:   10000  synthetic   1000       151.7575           315 5.8424835205   0.096
#> 19: 1000000       real     10         1.8500             4 0.0012283325   0.001
#> 20: 1000000       real    100         4.5000            11 0.0224952698   0.003
#> 21: 1000000       real   1000        13.2370            19 0.5583457947   0.025
#> 22: 1000000  synthetic     10         2.0000             4 0.0012855530   0.001
#> 23: 1000000  synthetic    100        21.4600            29 0.0871925354   0.004
#> 24: 1000000  synthetic   1000       192.2360           289 7.3866157532   0.117
#>     penalty short.type N.data mean.intervals max.intervals    megabytes seconds

Theoretically the most intervals that could be stored is O(i) for each data point i ∈ {1, ..., N}. Therefore the largest total number of intervals is sum(1:N), which can also be computed by N*(N+1)/2. The largest mean is mean(1:N), which can be computed via sum(1:N)/N = (N+1)/2.

(worst.dt <- data.table(
  N.data=N.data.vec,
  mean.intervals=(N.data.vec+1)/2,
  short.type="theoretical"))
#>    N.data mean.intervals  short.type
#>     <num>          <num>      <char>
#> 1:     10            5.5 theoretical
#> 2:    100           50.5 theoretical
#> 3:   1000          500.5 theoretical

The plot below shows that the algorithm achieves the theoretical worst case time complexity for the synthetic increasing data, when the penalty is large. But the number of intervals is always much smaller for real Mono27ac ChIP-seq data.

one <- function(short.type, data.type, color){
  data.table(short.type, data.type, color)
}
type.dt <- rbind(
  one("theoretical", "Theoretical\nworst case", "grey"),
  one("synthetic", "Synthetic\nIncreasing", "red"),
  one("real", "Real ChIP-seq", "black"))
loss.types <- type.dt[loss, on=list(short.type)]
worst.types <- type.dt[worst.dt, on=list(short.type)]
(type.colors <- type.dt[, structure(color, names=data.type)])
#> Theoretical\nworst case   Synthetic\nIncreasing           Real ChIP-seq 
#>                  "grey"                   "red"                 "black"
if(require(ggplot2)){
ggplot()+
  guides(
    color=guide_legend(keyheight=3)
  )+
  geom_blank(aes(
    N.data, 1),
    data=data.table(N.data=c(5, 2000)))+
  theme_bw()+
  theme(panel.spacing=grid::unit(0, "lines"))+
  facet_grid(. ~ penalty, labeller=label_both)+
  geom_line(aes(
    N.data, mean.intervals, color=data.type),
    data=worst.types)+
  scale_color_manual(
    values=type.colors,
    breaks=names(type.colors))+
  geom_line(aes(
    bedGraph.lines, mean.intervals, color=data.type),
    data=loss.types)+
  scale_x_log10("N data")+
  scale_y_log10("Mean intervals (candidate changepoints)")
}