This vignette provides comparisons with other packages that provide similar functionality. This is a work in progress – for a more detailed / complete / coherent comparison with other packages which provide wide-to-tall data reshaping, see my paper.
Sometimes you want to melt a “wide” data table which has several distinct pieces of information encoded in each column name. One example is the familiar iris data, which have flower part and measurement dimension encoded in each of four column names:
library(data.table)
data.table(iris)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <num> <num> <num> <num> <fctr>
#> 1: 5.1 3.5 1.4 0.2 setosa
#> 2: 4.9 3.0 1.4 0.2 setosa
#> 3: 4.7 3.2 1.3 0.2 setosa
#> 4: 4.6 3.1 1.5 0.2 setosa
#> 5: 5.0 3.6 1.4 0.2 setosa
#> ---
#> 146: 6.7 3.0 5.2 2.3 virginica
#> 147: 6.3 2.5 5.0 1.9 virginica
#> 148: 6.5 3.0 5.2 2.0 virginica
#> 149: 6.2 3.4 5.4 2.3 virginica
#> 150: 5.9 3.0 5.1 1.8 virginica
The goal in this section will be to convert these data into a format
with a column for each flower part (Sepal
and
Petal
) so we can easily make a facetted scatterplot to
visually examine whether or not sepals or larger than petals. The
easiest way to perform this conversion is with packages which provide a
function for melting into multiple output columns:
iris.parts <- list(
nc=nc::capture_melt_multiple(
iris,
column=".*?",
"[.]",
dim=".*"),
tidyr=if(requireNamespace("tidyr"))tidyr::pivot_longer(
iris,
cols=1:4,
names_to=c(".value", "dim"),
names_sep="[.]"),
stats=stats::reshape(
iris,
direction="long",
timevar="dim",
varying=1:4,
sep="."),
"data.table::melt"=melt(
data.table(iris),
measure.vars=patterns(
Sepal="^Sepal",
Petal="^Petal")
)[data.table(
variable=factor(1:2), dim=c("Length", "Width")
), on=.(variable)],
if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
iris,
controlTable=data.frame(
dim=c("Length", "Width"),
Petal=c("Petal.Length", "Petal.Width"),
Sepal=c("Sepal.Length", "Sepal.Width"),
stringsAsFactors=FALSE),
columnsToCopy="Species"))
#> Loading required namespace: cdata
iris.parts$nc
#> Species dim Petal Sepal
#> <fctr> <char> <num> <num>
#> 1: setosa Length 1.4 5.1
#> 2: setosa Length 1.4 4.9
#> 3: setosa Length 1.3 4.7
#> 4: setosa Length 1.5 4.6
#> 5: setosa Length 1.4 5.0
#> ---
#> 296: virginica Width 2.3 3.0
#> 297: virginica Width 1.9 2.5
#> 298: virginica Width 2.0 3.0
#> 299: virginica Width 2.3 3.4
#> 300: virginica Width 1.8 3.0
It is clear from the code above that each package is capable of the conversions. However the syntax and level of explicitness varies:
nc::capture_melt_multiple
requires a regular
expression: (most implicit, least repetition)
column
group are used for the
output column names.tidyr::pivot_longer
and stats::reshape
require specification of the input columns to melt along with a
separator.
stats::reshape
assumes the output columns names occur
in the part of the input column name before the separator.tidyr::pivot_longer
assumes the output columns occur in
the part which corresponds to the .value
element of the
names_to
argument.data.table::melt
requires a join to recover the
dim
output column.cdata::rowrecs_to_blocks
requires explicit
specification of a control table. (most explicit, most repetition)Any of the results can be visualized via:
if(require(ggplot2)){
ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(dim ~ Species)+
coord_equal()+
geom_abline(slope=1, intercept=0, color="grey")+
geom_point(aes(
Petal, Sepal),
data=iris.parts$nc)
}
It is clear from the plot above that sepals are larger than petals, for every measured flower.
What if we wanted to compare dimensions rather than parts?
iris.dims <- list(
nc=nc::capture_melt_multiple(
iris,
part=".*?",
"[.]",
column=".*"),
stats=stats::reshape(
structure(iris, names=sub("(.*?)[.](.*)", "\\2.\\1", names(iris))),
direction="long",
timevar="part",
varying=1:4,
sep="."))
iris.dims$nc
#> Species part Length Width
#> <fctr> <char> <num> <num>
#> 1: setosa Petal 1.4 0.2
#> 2: setosa Petal 1.4 0.2
#> 3: setosa Petal 1.3 0.2
#> 4: setosa Petal 1.5 0.2
#> 5: setosa Petal 1.4 0.2
#> ---
#> 296: virginica Sepal 6.7 3.0
#> 297: virginica Sepal 6.3 2.5
#> 298: virginica Sepal 6.5 3.0
#> 299: virginica Sepal 6.2 3.4
#> 300: virginica Sepal 5.9 3.0
The code above shows that the syntax is mostly the same for this
example. The biggest difference is for stats::reshape
which
assumes that each input column name is composed of (1) the output column
name, (2) a delimiter, and (3) some additional information to be stored
in the output column given by timevar
. Therefore we need to
pre-process column names using sub
for it to work.
if(require(ggplot2)){
ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(part ~ Species)+
coord_equal()+
geom_abline(slope=1, intercept=0, color="grey")+
geom_point(aes(
Length, Width),
data=iris.dims$nc)
}
It is clear from the plot above that Length
is larger
than Width
for every measured flower part.
Consider the following wide data set:
TC <- data.table::data.table(
age.treatment=c(1, 5),
sex.control=c("M", "M"),
sex.treatment=c("F", "F"),
age.control=c(10, 50))
It is clear from the column names how the data should be grouped when they are converted to tall format. However the columns do not appear in regular order (age is before sex for treatment, but age is after sex for control), which causes a problem for stats and data.table:
input.list <- list(
"nc"=nc::capture_melt_multiple(
TC,
column=".*?",
"[.]",
group=".*"),
"cdata"=if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
TC,
controlTable=data.frame(
group=c("treatment", "control"),
age=c("age.treatment", "age.control"),
sex=c("sex.treatment", "sex.control"),
stringsAsFactors=FALSE)),
"data.table"=data.table::melt(TC, measure.vars=patterns(
age="age",
sex="sex")),
"stats"=stats::reshape(
TC,
varying=1:4,
direction="long"),
"tidyr"=if(requireNamespace("tidyr"))tidyr::pivot_longer(
TC, 1:4,
names_to=c(".value", "group"),
names_sep="[.]"))
output.list <- list()
for(pkg in names(input.list)){
df.or.null <- input.list[[pkg]]
if(is.data.frame(df.or.null)){
output.list[[pkg]] <- data.table::data.table(df.or.null)[order(age)]
}
}
output.list
#> $nc
#> group age sex
#> <char> <num> <char>
#> 1: treatment 1 F
#> 2: treatment 5 F
#> 3: control 10 M
#> 4: control 50 M
#>
#> $cdata
#> group age sex
#> <char> <num> <char>
#> 1: treatment 1 F
#> 2: treatment 5 F
#> 3: control 10 M
#> 4: control 50 M
#>
#> $data.table
#> variable age sex
#> <fctr> <num> <char>
#> 1: 1 1 M
#> 2: 1 5 M
#> 3: 2 10 F
#> 4: 2 50 F
#>
#> $stats
#> time age sex id
#> <char> <num> <char> <int>
#> 1: treatment 1 M 1
#> 2: treatment 5 M 2
#> 3: control 10 F 1
#> 4: control 50 F 2
#>
#> $tidyr
#> group age sex
#> <char> <num> <char>
#> 1: treatment 1 F
#> 2: treatment 5 F
#> 3: control 10 M
#> 4: control 50 M
sapply(output.list, function(DT)identical(DT$sex, c("F", "F", "M", "M")))
#> nc cdata data.table stats tidyr
#> TRUE TRUE FALSE FALSE TRUE
In conclusion, when the input column names to melt do not appear in
the same order across groups or output columns, then the correct tall
data can be computed using one of
nc::capture_melt_multiple
,
tidyr::pivot_longer
,
cdata::rowrecs_to_blocks
.
Another data set where it is useful to do column name pattern matching followed by melting is the World Health Organization data:
if(requireNamespace("tidyr")){
data(who, package="tidyr")
}else{
who <- data.frame(id=1, new_sp_m5564=2, newrel_f65=3)
}
names(who)
#> [1] "country" "iso2" "iso3" "year" "new_sp_m014"
#> [6] "new_sp_m1524" "new_sp_m2534" "new_sp_m3544" "new_sp_m4554" "new_sp_m5564"
#> [11] "new_sp_m65" "new_sp_f014" "new_sp_f1524" "new_sp_f2534" "new_sp_f3544"
#> [16] "new_sp_f4554" "new_sp_f5564" "new_sp_f65" "new_sn_m014" "new_sn_m1524"
#> [21] "new_sn_m2534" "new_sn_m3544" "new_sn_m4554" "new_sn_m5564" "new_sn_m65"
#> [26] "new_sn_f014" "new_sn_f1524" "new_sn_f2534" "new_sn_f3544" "new_sn_f4554"
#> [31] "new_sn_f5564" "new_sn_f65" "new_ep_m014" "new_ep_m1524" "new_ep_m2534"
#> [36] "new_ep_m3544" "new_ep_m4554" "new_ep_m5564" "new_ep_m65" "new_ep_f014"
#> [41] "new_ep_f1524" "new_ep_f2534" "new_ep_f3544" "new_ep_f4554" "new_ep_f5564"
#> [46] "new_ep_f65" "newrel_m014" "newrel_m1524" "newrel_m2534" "newrel_m3544"
#> [51] "newrel_m4554" "newrel_m5564" "newrel_m65" "newrel_f014" "newrel_f1524"
#> [56] "newrel_f2534" "newrel_f3544" "newrel_f4554" "newrel_f5564" "newrel_f65"
Each column which starts with new
has three distinct
pieces of information encoded in its name: diagnosis type (e.g. sp or
rel), gender (m or f), and age range (e.g. 5564 or 1524). We would like
to use a regex to match these column names, then using the matching
columns as measure.vars in a melt, then join the two results. The most
convenient way to do that is via:
who.chr.list <- list(
nc=nc::capture_melt_single(
who,
"new_?",
diagnosis=".*",
"_",
gender=".",
ages=".*"),
tidyr=if(requireNamespace("tidyr"))tidyr::pivot_longer(
who,
new_sp_m014:newrel_f65,
names_to=c("diagnosis", "gender", "ages"),
names_pattern="new_?(.*)_(.)(.*)"))
Note the result includes additional column value
which
contains the melted data. There is also a column for each capture group
in the specified pattern. The following example shows how to rename the
value
column, remove missing values, and use numeric type
conversion functions:
who.pattern <- "new_?(.*)_(.)((0|[0-9]{2})([0-9]{0,2}))"
as.numeric.Inf <- function(y)ifelse(y=="", Inf, as.numeric(y))
who.typed.list <- list(
nc=nc::capture_melt_single(
who,
"new_?",
diagnosis=".*",
"_",
gender=".",
ages=list(
ymin.num="0|[0-9]{2}", as.numeric,
ymax.num="[0-9]{0,2}", as.numeric.Inf),
value.name="count",
na.rm=TRUE),
tidyr=if(requireNamespace("tidyr"))try(tidyr::pivot_longer(
who,
cols=grep(who.pattern, names(who)),
names_transform=list(
ymin.num=as.numeric,
ymax.num=as.numeric.Inf),
names_to=c("diagnosis", "gender", "ages", "ymin.num", "ymax.num"),
names_pattern=who.pattern,
values_drop_na=TRUE,
values_to="count")))
str(who.typed.list)
#> List of 2
#> $ nc :Classes 'data.table' and 'data.frame': 76046 obs. of 10 variables:
#> ..$ country : chr [1:76046] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#> ..$ iso2 : chr [1:76046] "AF" "AF" "AF" "AF" ...
#> ..$ iso3 : chr [1:76046] "AFG" "AFG" "AFG" "AFG" ...
#> ..$ year : num [1:76046] 1997 1998 1999 2000 2001 ...
#> ..$ diagnosis: chr [1:76046] "sp" "sp" "sp" "sp" ...
#> ..$ gender : chr [1:76046] "m" "m" "m" "m" ...
#> ..$ ages : chr [1:76046] "014" "014" "014" "014" ...
#> ..$ ymin.num : num [1:76046] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ ymax.num : num [1:76046] 14 14 14 14 14 14 14 14 14 14 ...
#> ..$ count : num [1:76046] 0 30 8 52 129 90 127 139 151 193 ...
#> ..- attr(*, ".internal.selfref")=<externalptr>
#> $ tidyr: tibble [76,046 × 10] (S3: tbl_df/tbl/data.frame)
#> ..$ country : chr [1:76046] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#> ..$ iso2 : chr [1:76046] "AF" "AF" "AF" "AF" ...
#> ..$ iso3 : chr [1:76046] "AFG" "AFG" "AFG" "AFG" ...
#> ..$ year : num [1:76046] 1997 1997 1997 1997 1997 ...
#> ..$ diagnosis: chr [1:76046] "sp" "sp" "sp" "sp" ...
#> ..$ gender : chr [1:76046] "m" "m" "m" "m" ...
#> ..$ ages : chr [1:76046] "014" "1524" "2534" "3544" ...
#> ..$ ymin.num : num [1:76046] 0 15 25 35 45 55 65 0 15 25 ...
#> ..$ ymax.num : num [1:76046] 14 24 34 44 54 ...
#> ..$ count : num [1:76046] 0 10 6 3 5 2 0 5 38 36 ...
The result above shows that nc::capture_melt_single
(1)
makes it easier to define complex patterns (2) supports type conversion
without a post-processing step, and (3) reduces repetition in user code.
There are several sources of repetition in tidyr
code:
ymin.num
appears only once for
nc
but twice for tidyr
.ymax.chr
appears only once for
nc
but three times for tidyr
.who
appears only once for
nc
but twice for tidyr
.nc
but twice for tidyr
.Other packages for doing this include:
if(requireNamespace("tidyr")){
gather.result <- tidyr::gather(
who,
"variable",
"count",
grep(who.pattern, names(who)),
na.rm=TRUE)
extract.result <- tidyr::extract(
gather.result,
"variable",
c("diagnosis", "gender", "ages", "ymin.int", "ymax.int"),
who.pattern,
convert=TRUE)
transform.result <- base::transform(
extract.result,
ymin.num=as.numeric(ymin.int),
ymax.num=ifelse(is.na(ymax.int), Inf, as.numeric(ymax.int)))
str(transform.result)
}
#> 'data.frame': 76046 obs. of 12 variables:
#> $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#> $ iso2 : chr "AF" "AF" "AF" "AF" ...
#> $ iso3 : chr "AFG" "AFG" "AFG" "AFG" ...
#> $ year : num 1997 1998 1999 2000 2001 ...
#> $ diagnosis: chr "sp" "sp" "sp" "sp" ...
#> $ gender : chr "m" "m" "m" "m" ...
#> $ ages : int 14 14 14 14 14 14 14 14 14 14 ...
#> $ ymin.int : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ ymax.int : int 14 14 14 14 14 14 14 14 14 14 ...
#> $ count : num 0 30 8 52 129 90 127 139 151 193 ...
#> $ ymin.num : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ ymax.num : num 14 14 14 14 14 14 14 14 14 14 ...
Note that tidyr::gather
requires two post-processing
steps, which cause the same two types of repetition as
tidyr::pivot_longer
:
base::transform
is used for
converting age range variables to numeric, since default types are int
with convert=TRUE
.tidyr::extract
is used to convert the melted column into
several output columns; this results in repetition in the code because
the regex is also used to define the columns to melt/gather.The reshape2
package suffers from the same two
issues:
reshape2.result <- if(requireNamespace("reshape2")){
reshape2:::melt.data.frame(
who,
measure.vars=grep(who.pattern, names(who)),
na.rm=TRUE,
value.name="count")
}
#> Loading required namespace: reshape2
Interestingly, data.table::patterns
can be used to avoid
repeating the data set name, who
. However it supports
neither type conversion nor regex capture groups.
dt.result <- data.table::melt.data.table(
data.table(who),
measure.vars=patterns(who.pattern),
na.rm=TRUE,
value.name="count")
Neither cdata nor stats provide an na.rm option:
## Example 1: melting a wider iris data back to original.
library(data.table)
iris.dt <- data.table(
i=1:nrow(iris),
iris[,1:4],
Species=paste(iris$Species))
print(iris.dt)
#> i Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <int> <num> <num> <num> <num> <char>
#> 1: 1 5.1 3.5 1.4 0.2 setosa
#> 2: 2 4.9 3.0 1.4 0.2 setosa
#> 3: 3 4.7 3.2 1.3 0.2 setosa
#> 4: 4 4.6 3.1 1.5 0.2 setosa
#> 5: 5 5.0 3.6 1.4 0.2 setosa
#> ---
#> 146: 146 6.7 3.0 5.2 2.3 virginica
#> 147: 147 6.3 2.5 5.0 1.9 virginica
#> 148: 148 6.5 3.0 5.2 2.0 virginica
#> 149: 149 6.2 3.4 5.4 2.3 virginica
#> 150: 150 5.9 3.0 5.1 1.8 virginica
## what if we had two observations on each row?
set.seed(1)
iris.rand <- iris.dt[sample(.N)]
iris.wide <- cbind(treatment=iris.rand[1:75], control=iris.rand[76:150])
print(iris.wide, topn=2, nrows=10)
#> treatment.i treatment.Sepal.Length treatment.Sepal.Width
#> <int> <num> <num>
#> 1: 68 5.8 2.7
#> 2: 129 6.4 2.8
#> ---
#> 74: 91 5.5 2.6
#> 75: 64 6.1 2.9
#> treatment.Petal.Length treatment.Petal.Width treatment.Species control.i
#> <num> <num> <char> <int>
#> 1: 4.1 1.0 versicolor 60
#> 2: 5.6 2.1 virginica 113
#> ---
#> 74: 4.4 1.2 versicolor 57
#> 75: 4.7 1.4 versicolor 72
#> control.Sepal.Length control.Sepal.Width control.Petal.Length
#> <num> <num> <num>
#> 1: 5.2 2.7 3.9
#> 2: 6.8 3.0 5.5
#> ---
#> 74: 6.3 3.3 4.7
#> 75: 6.1 2.8 4.0
#> control.Petal.Width control.Species
#> <num> <char>
#> 1: 1.4 versicolor
#> 2: 2.1 virginica
#> ---
#> 74: 1.6 versicolor
#> 75: 1.3 versicolor
## This is the usual data.table syntax for getting the original iris back.
iris.melted <- melt(iris.wide, value.factor=TRUE, measure.vars = patterns(
i="i$",
Sepal.Length="Sepal.Length$",
Sepal.Width="Sepal.Width$",
Petal.Length="Petal.Length$",
Petal.Width="Petal.Width$",
Species="Species$"))
identical(iris.melted[order(i), names(iris.dt), with=FALSE], iris.dt)
#> [1] TRUE
## nc can do the same thing -- you must define an R argument named
## column, and another named argument which identifies each group.
(nc.melted <- nc::capture_melt_multiple(
iris.wide,
group="[^.]+",
"[.]",
column=".*"))
#> group Petal.Length Petal.Width Sepal.Length Sepal.Width Species
#> <char> <num> <num> <num> <num> <char>
#> 1: control 3.9 1.4 5.2 2.7 versicolor
#> 2: control 5.5 2.1 6.8 3.0 virginica
#> 3: control 5.6 1.4 6.1 2.6 virginica
#> 4: control 1.5 0.1 4.9 3.1 setosa
#> 5: control 1.4 0.2 5.1 3.5 setosa
#> ---
#> 146: treatment 1.6 0.2 4.8 3.1 setosa
#> 147: treatment 1.3 0.4 5.4 3.9 setosa
#> 148: treatment 5.4 2.1 6.9 3.1 virginica
#> 149: treatment 4.4 1.2 5.5 2.6 versicolor
#> 150: treatment 4.7 1.4 6.1 2.9 versicolor
#> i
#> <int>
#> 1: 60
#> 2: 113
#> 3: 135
#> 4: 10
#> 5: 1
#> ---
#> 146: 31
#> 147: 17
#> 148: 140
#> 149: 91
#> 150: 64
identical(nc.melted[order(i), names(iris.dt), with=FALSE], iris.dt)
#> [1] TRUE
## This is how we do it using stats::reshape.
iris.wide.df <- data.frame(iris.wide)
names(iris.wide.df) <- sub("(.*?)[.](.*)", "\\2_\\1", names(iris.wide))
iris.reshaped <- stats::reshape(
iris.wide.df,
direction="long",
timevar="group",
varying=names(iris.wide.df),
sep="_")
identical(data.table(iris.reshaped[, names(iris.dt)])[order(i)], iris.dt)
#> [1] TRUE
## get the parts columns and groups -- is there any difference
## between groups? of course not!
parts.wide <- nc::capture_melt_multiple(
iris.wide,
group=".*?",
"[.]",
column=".*?",
"[.]",
dim=".*")
if(require("ggplot2")){
ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(dim ~ group)+
coord_equal()+
geom_abline(slope=1, intercept=0, color="grey")+
geom_point(aes(
Petal, Sepal),
data=parts.wide)
}
## Example 2. Lots of column types, from example(melt.data.table).
DT <- data.table(
i_1 = c(1:5, NA),
i_2 = c(NA,6:10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
d_2 = as.Date(6:1, origin="2012-01-01"))
## add a couple of list cols
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
## original DT syntax is quite repetitive.
melt(DT, measure=patterns(
i="^i",
f="^f",
d="^d",
l="^l"
))
#> c_1 variable i f d l
#> <char> <fctr> <int> <char> <Date> <list>
#> 1: a 1 1 a 2013-09-02 1,1,1,1
#> 2: a 1 2 a 2013-09-03 2,2
#> 3: <NA> 1 3 <NA> 2013-09-04 3
#> 4: b 1 4 a <NA> 4,4,4
#> 5: b 1 5 c 2013-09-05 5
#> 6: <NA> 1 NA c 2013-09-06 NA
#> 7: a 2 NA z 2012-01-07 a,a,a,a
#> 8: a 2 6 a 2012-01-06 a,a,a,a,a
#> 9: <NA> 2 7 x 2012-01-05 NA
#> 10: b 2 8 c 2012-01-04 b,b,b
#> 11: b 2 9 x 2012-01-03 b,b,b,b
#> 12: <NA> 2 10 x 2012-01-02 NA,NA,NA,NA,NA
## nc syntax uses a single regex rather than four.
nc::capture_melt_multiple(
DT,
column="^[^c]",
"_",
number="[12]")
#> c_1 number d f i l
#> <char> <char> <Date> <char> <int> <list>
#> 1: a 1 2013-09-02 a 1 1,1,1,1
#> 2: a 1 2013-09-03 a 2 2,2
#> 3: <NA> 1 2013-09-04 <NA> 3 3
#> 4: b 1 <NA> a 4 4,4,4
#> 5: b 1 2013-09-05 c 5 5
#> 6: <NA> 1 2013-09-06 c NA NA
#> 7: a 2 2012-01-07 z NA a,a,a,a
#> 8: a 2 2012-01-06 a 6 a,a,a,a,a
#> 9: <NA> 2 2012-01-05 x 7 NA
#> 10: b 2 2012-01-04 c 8 b,b,b
#> 11: b 2 2012-01-03 x 9 b,b,b,b
#> 12: <NA> 2 2012-01-02 x 10 NA,NA,NA,NA,NA
## id.vars can be specified using original DT syntax.
melt(DT, id=1:2, measure=patterns(
f="^f",
l="^l"
))
#> i_1 i_2 variable f l
#> <int> <int> <fctr> <char> <list>
#> 1: 1 NA 1 a 1,1,1,1
#> 2: 2 6 1 a 2,2
#> 3: 3 7 1 <NA> 3
#> 4: 4 8 1 a 4,4,4
#> 5: 5 9 1 c 5
#> 6: NA 10 1 c NA
#> 7: 1 NA 2 z a,a,a,a
#> 8: 2 6 2 a a,a,a,a,a
#> 9: 3 7 2 x NA
#> 10: 4 8 2 c b,b,b
#> 11: 5 9 2 x b,b,b,b
#> 12: NA 10 2 x NA,NA,NA,NA,NA
nc::capture_melt_multiple(
DT,
column="^[fl]",
"_",
number="[12]")
#> i_1 i_2 c_1 d_1 d_2 number f l
#> <int> <int> <char> <Date> <Date> <char> <char> <list>
#> 1: 1 NA a 2013-09-02 2012-01-07 1 a 1,1,1,1
#> 2: 2 6 a 2013-09-03 2012-01-06 1 a 2,2
#> 3: 3 7 <NA> 2013-09-04 2012-01-05 1 <NA> 3
#> 4: 4 8 b <NA> 2012-01-04 1 a 4,4,4
#> 5: 5 9 b 2013-09-05 2012-01-03 1 c 5
#> 6: NA 10 <NA> 2013-09-06 2012-01-02 1 c NA
#> 7: 1 NA a 2013-09-02 2012-01-07 2 z a,a,a,a
#> 8: 2 6 a 2013-09-03 2012-01-06 2 a a,a,a,a,a
#> 9: 3 7 <NA> 2013-09-04 2012-01-05 2 x NA
#> 10: 4 8 b <NA> 2012-01-04 2 c b,b,b
#> 11: 5 9 b 2013-09-05 2012-01-03 2 x b,b,b,b
#> 12: NA 10 <NA> 2013-09-06 2012-01-02 2 x NA,NA,NA,NA,NA
## reshape does not support list columns.
reshape(
DT,
varying=grep("^[fid]", names(DT)),
sep="_",
direction="long",
timevar="number")
#> c_1 l_1 l_2 number i f d id
#> <char> <list> <list> <num> <int> <ord> <Date> <int>
#> 1: a 1,1,1,1 a,a,a,a 1 1 a 2013-09-02 1
#> 2: a 2,2 a,a,a,a,a 1 2 a 2013-09-03 2
#> 3: <NA> 3 NA 1 3 <NA> 2013-09-04 3
#> 4: b 4,4,4 b,b,b 1 4 a <NA> 4
#> 5: b 5 b,b,b,b 1 5 c 2013-09-05 5
#> 6: <NA> NA NA,NA,NA,NA,NA 1 NA c 2013-09-06 6
#> 7: a 1,1,1,1 a,a,a,a 2 NA z 2012-01-07 1
#> 8: a 2,2 a,a,a,a,a 2 6 a 2012-01-06 2
#> 9: <NA> 3 NA 2 7 x 2012-01-05 3
#> 10: b 4,4,4 b,b,b 2 8 c 2012-01-04 4
#> 11: b 5 b,b,b,b 2 9 x 2012-01-03 5
#> 12: <NA> NA NA,NA,NA,NA,NA 2 10 x 2012-01-02 6
## tidyr does, but errors for combining ordered and un-ordered factors.
if(requireNamespace("tidyr")){
tidyr::pivot_longer(
DT, grep("[cf]", names(DT), invert=TRUE),
names_pattern="(.)_(.)",
names_to=c(".value", "number"))
}
#> # A tibble: 12 × 7
#> f_1 f_2 c_1 number i d l
#> <fct> <ord> <chr> <chr> <int> <date> <list>
#> 1 a z a 1 1 2013-09-02 <int [4]>
#> 2 a z a 2 NA 2012-01-07 <chr [4]>
#> 3 a a a 1 2 2013-09-03 <int [2]>
#> 4 a a a 2 6 2012-01-06 <chr [5]>
#> 5 <NA> x <NA> 1 3 2013-09-04 <int [1]>
#> 6 <NA> x <NA> 2 7 2012-01-05 <chr [1]>
#> 7 a c b 1 4 NA <int [3]>
#> 8 a c b 2 8 2012-01-04 <chr [3]>
#> 9 c x b 1 5 2013-09-05 <int [1]>
#> 10 c x b 2 9 2012-01-03 <chr [4]>
#> 11 c x <NA> 1 NA 2013-09-06 <int [1]>
#> 12 c x <NA> 2 10 2012-01-02 <chr [5]>
## Example 3, three children, one family per row, from data.table
## vignette.
family.dt <- fread(text="
family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
1 30 1998-11-26 2000-01-29 NA 1 2 NA
2 27 1996-06-22 NA NA 2 NA NA
3 26 2002-07-11 2004-04-05 2007-09-02 2 2 1
4 32 2004-10-10 2009-08-27 2012-07-21 1 1 1
5 29 2000-12-05 2005-02-28 NA 2 1 NA")
(children.melt <- melt(family.dt, measure = patterns(
dob="^dob", gender="^gender"
), na.rm=TRUE, variable.factor=FALSE))
#> family_id age_mother variable dob gender
#> <int> <int> <char> <IDat> <int>
#> 1: 1 30 1 1998-11-26 1
#> 2: 2 27 1 1996-06-22 2
#> 3: 3 26 1 2002-07-11 2
#> 4: 4 32 1 2004-10-10 1
#> 5: 5 29 1 2000-12-05 2
#> 6: 1 30 2 2000-01-29 2
#> 7: 3 26 2 2004-04-05 2
#> 8: 4 32 2 2009-08-27 1
#> 9: 5 29 2 2005-02-28 1
#> 10: 3 26 3 2007-09-02 1
#> 11: 4 32 3 2012-07-21 1
## nc::field can be used to define group name and pattern at the
## same time, to avoid repetitive code.
(children.nc <- nc::capture_melt_multiple(
family.dt,
column="[^_]+",
"_",
nc::field("child", "", "[1-3]"),
na.rm=TRUE))
#> family_id age_mother child dob gender
#> <int> <int> <char> <IDat> <int>
#> 1: 1 30 1 1998-11-26 1
#> 2: 2 27 1 1996-06-22 2
#> 3: 3 26 1 2002-07-11 2
#> 4: 4 32 1 2004-10-10 1
#> 5: 5 29 1 2000-12-05 2
#> 6: 1 30 2 2000-01-29 2
#> 7: 3 26 2 2004-04-05 2
#> 8: 4 32 2 2009-08-27 1
#> 9: 5 29 2 2005-02-28 1
#> 10: 3 26 3 2007-09-02 1
#> 11: 4 32 3 2012-07-21 1
## reshape works too.
stats::reshape(
family.dt,
varying=grep("child", names(family.dt)),
direction="long",
sep="_",
timevar="child.str")
#> family_id age_mother child.str dob gender id
#> <int> <int> <char> <IDat> <int> <int>
#> 1: 1 30 child1 1998-11-26 1 1
#> 2: 2 27 child1 1996-06-22 2 2
#> 3: 3 26 child1 2002-07-11 2 3
#> 4: 4 32 child1 2004-10-10 1 4
#> 5: 5 29 child1 2000-12-05 2 5
#> 6: 1 30 child2 2000-01-29 2 1
#> 7: 2 27 child2 <NA> NA 2
#> 8: 3 26 child2 2004-04-05 2 3
#> 9: 4 32 child2 2009-08-27 1 4
#> 10: 5 29 child2 2005-02-28 1 5
#> 11: 1 30 child3 <NA> NA 1
#> 12: 2 27 child3 <NA> NA 2
#> 13: 3 26 child3 2007-09-02 1 3
#> 14: 4 32 child3 2012-07-21 1 4
#> 15: 5 29 child3 <NA> NA 5
## Comparison with base R. 1. mfrow means parts on rows, mfcol means
## parts on columns. 2. same number of lines of code. 3. nc/ggplot2
## code has more names and fewer numbers.
imat <- as.matrix(iris[, 1:4])
ylim <- range(table(imat))
xlim <- range(imat)
old.par <- par(mfcol=c(2,2), mar=c(2,2,1,1))
for(col.i in 1:ncol(imat)){
hist(
imat[, col.i],
breaks=seq(xlim[1], xlim[2], by=0.1),
ylim=ylim,
main=colnames(imat)[col.i])
}
All four packages below can convert the input reshape column name to a numeric output capture column.
pen.peaks.wide <- data.table::data.table(
data.set=c("foo", "bar"),
"10.1"=c(5L, 10L),
"0.3"=c(26L, 39L))
pen.peaks.gather <- if(requireNamespace("tidyr"))tidyr::gather(
pen.peaks.wide,
"penalty",
"peaks",
-1,
convert=TRUE)
str(pen.peaks.gather)
#> 'data.frame': 4 obs. of 3 variables:
#> $ data.set: chr "foo" "bar" "foo" "bar"
#> $ penalty : num 10.1 10.1 0.3 0.3
#> $ peaks : int 5 10 26 39
pen.peaks.nc <- nc::capture_melt_single(
pen.peaks.wide,
penalty="^[0-9.]+", as.numeric,
value.name="peaks")
str(pen.peaks.nc)
#> Classes 'data.table' and 'data.frame': 4 obs. of 3 variables:
#> $ data.set: chr "foo" "bar" "foo" "bar"
#> $ penalty : num 10.1 10.1 0.3 0.3
#> $ peaks : int 5 10 26 39
#> - attr(*, ".internal.selfref")=<externalptr>
pen.peaks.pivot <- if(requireNamespace("tidyr"))try(tidyr::pivot_longer(
pen.peaks.wide,
-1,
names_to="penalty",
names_transform=list(penalty=as.numeric),
values_to="peaks"))
str(pen.peaks.pivot)
#> tibble [4 × 3] (S3: tbl_df/tbl/data.frame)
#> $ data.set: chr [1:4] "foo" "foo" "bar" "bar"
#> $ penalty : num [1:4] 10.1 0.3 10.1 0.3
#> $ peaks : int [1:4] 5 26 10 39
varying <- 2:3
pen.peaks.reshape.times <- stats::reshape(
pen.peaks.wide,
direction="long",
varying=varying,
times=as.numeric(names(pen.peaks.wide)[varying]),
v.names="peaks",
timevar="penalty")
str(pen.peaks.reshape.times)
#> Classes 'data.table' and 'data.frame': 4 obs. of 4 variables:
#> $ data.set: chr "foo" "bar" "foo" "bar"
#> $ penalty : num 10.1 10.1 0.3 0.3
#> $ peaks : int 5 10 26 39
#> $ id : int 1 2 1 2
#> - attr(*, ".internal.selfref")=<externalptr>
#> - attr(*, "reshapeLong")=List of 4
#> ..$ varying:List of 1
#> .. ..$ peaks: chr [1:2] "10.1" "0.3"
#> .. ..- attr(*, "v.names")= chr "peaks"
#> .. ..- attr(*, "times")= num [1:2] 10.1 0.3
#> ..$ v.names: chr "peaks"
#> ..$ idvar : chr "id"
#> ..$ timevar: chr "penalty"
pen.peaks.renamed <- pen.peaks.wide
names(pen.peaks.renamed) <- paste0(ifelse(
grepl("^[0-9]", names(pen.peaks.wide)),
"peaks_", ""),
names(pen.peaks.wide))
pen.peaks.reshape.sep <- stats::reshape(
pen.peaks.renamed,
direction="long",
varying=varying,
sep="_",
timevar="penalty")
str(pen.peaks.reshape.sep)
#> Classes 'data.table' and 'data.frame': 4 obs. of 4 variables:
#> $ data.set: chr "foo" "bar" "foo" "bar"
#> $ penalty : num 10.1 10.1 0.3 0.3
#> $ peaks : int 5 10 26 39
#> $ id : int 1 2 1 2
#> - attr(*, ".internal.selfref")=<externalptr>
#> - attr(*, "reshapeLong")=List of 4
#> ..$ varying:List of 1
#> .. ..$ peaks: chr [1:2] "peaks_10.1" "peaks_0.3"
#> .. ..- attr(*, "v.names")= chr "peaks"
#> .. ..- attr(*, "times")= num [1:2] 10.1 0.3
#> ..$ v.names: chr "peaks"
#> ..$ idvar : chr "id"
#> ..$ timevar: chr "penalty"
peaks.csv <- system.file(
"extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
package="nc", mustWork=TRUE)
peaks.wide <- data.table::fread(peaks.csv)
tidyr.long <- tidyr::pivot_longer(
peaks.wide,
grep(" [0-9]", names(peaks.wide)),
names_pattern = "(.*) ([0-9]+)",
names_to = c(".value", "peak"),
names_transform = list(peak=as.integer))
peaks.tall <- nc::capture_melt_multiple(
peaks.wide,
column=".*",
" ",
peak="[0-9]+", as.integer)
#> Warning in melt.data.table(L[["data"]], measure.vars = L[["measure.vars"]], :
#> 'measure.vars' [Allele 1, Allele 2, Allele 3, Allele 4, ...] are not all of the
#> same type. By order of hierarchy, the molten data value column will be of type
#> 'character'. All measure variables not of type 'character' will be coerced too.
#> Check DETAILS in ?melt.data.table for more on coercion.
#> Warning in melt.data.table(L[["data"]], measure.vars = L[["measure.vars"]], :
#> 'measure.vars' [Height 1, Height 2, Height 3, Height 4, ...] are not all of the
#> same type. By order of hierarchy, the molten data value column will be of type
#> 'integer'. All measure variables not of type 'integer' will be coerced too.
#> Check DETAILS in ?melt.data.table for more on coercion.
#> Warning in melt.data.table(L[["data"]], measure.vars = L[["measure.vars"]], :
#> 'measure.vars' [Size 1, Size 2, Size 3, Size 4, ...] are not all of the same
#> type. By order of hierarchy, the molten data value column will be of type
#> 'double'. All measure variables not of type 'double' will be coerced too. Check
#> DETAILS in ?melt.data.table for more on coercion.
options(old.par)