(subject.size.vec <- unique(as.integer(10^seq(0,3.5,l=100))))
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#> [16] 17 18 20 22 23 25 28 30 33 35 38 42 45 49 53
#> [31] 58 63 68 74 81 87 95 103 112 121 132 143 155 168 183
#> [46] 198 215 233 253 275 298 323 351 380 413 448 486 527 572 620
#> [61] 673 730 792 859 932 1011 1097 1190 1291 1401 1519 1648 1788 1940 2104
#> [76] 2283 2477 2687 2915 3162
(backtrackers <- c(
if(requireNamespace("stringi"))atime::atime_grid(
ICU=stringi::stri_match(subject, regex=pattern)),
atime::atime_grid(
PCRE=regexpr(pattern, subject, perl=TRUE),
TRE=regexpr(pattern, subject, perl=FALSE))))
#> Loading required namespace: stringi
#> $ICU
#> stringi::stri_match(subject, regex = pattern)
#>
#> $PCRE
#> regexpr(pattern, subject, perl = TRUE)
#>
#> $TRE
#> regexpr(pattern, subject, perl = FALSE)
backtrackers.result <- atime::atime(
N=subject.size.vec,
setup={
subject <- paste(rep("a", N), collapse="")
pattern <- paste(rep(c("(a)?", "\\1"), each=N), collapse="")
},
expr.list=backtrackers)
backtrackers.best <- atime::references_best(backtrackers.result)
plot(backtrackers.best)
#> Warning in ggplot2::scale_y_log10(""): log-10 transformation introduced
#> infinite values.
The plot above shows that ICU/PCRE/TRE are all exponential in N (subject/pattern size) when the pattern contains backreferences.
all.exprs <- c(
if(requireNamespace("re2"))atime::atime_grid(
RE2=re2::re2_match(subject, pattern)),
backtrackers)
#> Loading required namespace: re2
all.result <- atime::atime(
N=subject.size.vec,
setup={
subject <- paste(rep("a", N), collapse="")
pattern <- paste(rep(c("a?", "a"), each=N), collapse="")
},
expr.list=all.exprs)
all.best <- atime::references_best(all.result)
plot(all.best)
#> Warning in ggplot2::scale_y_log10(""): log-10 transformation introduced
#> infinite values.
The plot above shows that ICU/PCRE are exponential time whereas
RE2/TRE are polynomial time. Exercise for the reader: modify the above
code to use the seconds.limit
argument so that you can see
what happens to ICU/PCRE for larger N (hint: you should see a difference
at larger sizes).
(all.pred <- predict(all.best))
#> atime_prediction object
#> unit expr.name unit.value N
#> <char> <char> <num> <num>
#> 1: seconds RE2 0.01 495.44403
#> 2: seconds ICU 0.01 17.64505
#> 3: seconds PCRE 0.01 17.76005
#> 4: seconds TRE 0.01 167.62036
summary(all.pred)
#> Length Class Mode
#> seconds.limit 1 -none- numeric
#> references 16 data.table list
#> plot.references 16 data.table list
#> measurements 23 data.table list
#> by.vec 1 -none- character
#> prediction 4 data.table list
The predict
method above returns a list with a new
element named prediction
, which shows the data sizes that
can be computed with a given time budget. The plot
method
is used below,
atime_grid
to compare different enginesIn the nc
package there is an engine
argument which controls which C regex library is used:
nc.exprs <- atime::atime_grid(
list(ENGINE=c(
if(requireNamespace("re2"))"RE2",
"PCRE",
if(requireNamespace("stringi"))"ICU")),
nc=nc::capture_first_vec(subject, pattern, engine=ENGINE))
nc.result <- atime::atime(
N=subject.size.vec,
setup={
rep.collapse <- function(chr)paste(rep(chr, N), collapse="")
subject <- rep.collapse("a")
pattern <- list(maybe=rep.collapse("a?"), rep.collapse("a"))
},
expr.list=nc.exprs)
nc.best <- atime::references_best(nc.result)
plot(nc.best)
The result/plot above is consistent with the previous result.