Skip to content

Commit 0d26a8c

Browse files
Merge branch 'master' into roll-nearest
2 parents 785e53e + 8052346 commit 0d26a8c

File tree

9 files changed

+198
-64
lines changed

9 files changed

+198
-64
lines changed

.ci/atime/tests.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,5 +277,14 @@ test.list <- atime::atime_test_list(
277277
Slow = "73d79edf8ff8c55163e90631072192301056e336", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
278278
Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
279279

280+
"isoweek improved in #7144" = atime::atime_test(
281+
setup = {
282+
set.seed(349)
283+
x = sample(Sys.Date() - 0:5000, N, replace=TRUE)
284+
},
285+
expr = data.table::isoweek(x),
286+
Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
287+
Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
288+
280289
tests=extra.test.list)
281290
# nolint end: undesirable_operator_linter.

NEWS.md

Lines changed: 55 additions & 34 deletions
Large diffs are not rendered by default.

R/IDateTime.R

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,19 +342,20 @@ yday = function(x) convertDate(as.IDate(x), "yday")
342342
wday = function(x) convertDate(as.IDate(x), "wday")
343343
mday = function(x) convertDate(as.IDate(x), "mday")
344344
week = function(x) convertDate(as.IDate(x), "week")
345-
isoweek = function(x) {
345+
# TODO(#3279): Investigate if improved as.IDate() makes our below implementation faster than this
346+
isoweek = function(x) as.integer(format(as.IDate(x), "%V"))
346347
# ISO 8601-conformant week, as described at
347348
# https://en.wikipedia.org/wiki/ISO_week_date
348349
# Approach:
349350
# * Find nearest Thursday to each element of x
350351
# * Find the number of weeks having passed between
351352
# January 1st of the year of the nearest Thursdays and x
352353

353-
x = as.IDate(x) # number of days since 1 Jan 1970 (a Thurs)
354-
nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
355-
year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
356-
1L + (nearest_thurs - year_start) %/% 7L
357-
}
354+
# x = as.IDate(x) # number of days since 1 Jan 1970 (a Thurs)
355+
# nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
356+
# year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
357+
# 1L + (nearest_thurs - year_start) %/% 7L
358+
358359

359360
month = function(x) convertDate(as.IDate(x), "month")
360361
quarter = function(x) convertDate(as.IDate(x), "quarter")

R/as.data.table.R

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
4848
if (!identical(keep.rownames, FALSE)) {
4949
# can specify col name to keep.rownames, #575
5050
ans = data.table(rn=rownames(x), x, keep.rownames=FALSE)
51+
# auto-inferred name 'x' is not back-compatible & inconsistent, #7145
52+
if (ncol(x) == 1L && is.null(colnames(x)))
53+
setnames(ans, 'x', 'V1')
5154
if (is.character(keep.rownames))
5255
setnames(ans, 'rn', keep.rownames[1L])
5356
return(ans)
@@ -133,9 +136,26 @@ as.data.table.list = function(x,
133136
missing.check.names = missing(check.names)
134137
origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854
135138
empty_atomic = FALSE
139+
140+
# Handle keep.rownames for vectors (mimicking data.frame behavior)
141+
rownames_ = NULL
142+
check_rownames = !isFALSE(keep.rownames)
143+
136144
for (i in seq_len(n)) {
137145
xi = x[[i]]
138146
if (is.null(xi)) next # eachncol already initialized to 0 by integer() above
147+
if (check_rownames && is.null(rownames_)) {
148+
if (is.null(dim(xi))) {
149+
if (!is.null(nm <- names(xi))) {
150+
rownames_ = nm
151+
x[[i]] = unname(xi)
152+
}
153+
} else {
154+
if (!is.null(nm <- rownames(xi))) {
155+
rownames_ = nm
156+
}
157+
}
158+
}
139159
if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE
140160
if ("POSIXlt" %chin% class(xi)) {
141161
warningf("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
@@ -200,6 +220,18 @@ as.data.table.list = function(x,
200220
}
201221
if (any(vnames==".SD")) stopf("A column may not be called .SD. That has special meaning.")
202222
if (check.names) vnames = make.names(vnames, unique=TRUE)
223+
224+
# Add rownames column when vector names were found
225+
if (!is.null(rownames_)) {
226+
rn_name = if (is.character(keep.rownames)) keep.rownames[1L] else "rn"
227+
if (!is.na(idx <- chmatch(rn_name, vnames)[1L])) {
228+
ans = c(list(ans[[idx]]), ans[-idx])
229+
vnames = c(vnames[idx], vnames[-idx])
230+
} else {
231+
ans = c(list(recycle(rownames_, nrow)), ans)
232+
vnames = c(rn_name, vnames)
233+
}
234+
}
203235
setattr(ans, "names", vnames)
204236
setDT(ans, key=key) # copy ensured above; also, setDT handles naming
205237
if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames) # PR 3854 and tests 2058.15-17

R/cedta.R

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdow
3939
}
4040
# nocov end
4141

42+
.any_sd_queries_in_stack = function(calls) {
43+
for (ii in length(calls):1) { # nolint: seq_linter. As above.
44+
if (!calls[[ii]] %iscall% "[") next
45+
the_lhs = calls[[ii]][[2L]]
46+
if (!is.name(the_lhs) || the_lhs != ".SD") next
47+
return(TRUE)
48+
}
49+
FALSE
50+
}
51+
4252
# cedta = Calling Environment Data.Table-Aware
4353
cedta = function(n=2L) {
4454
# Calling Environment Data Table Aware
@@ -52,12 +62,15 @@ cedta = function(n=2L) {
5262
return(TRUE)
5363
}
5464
nsname = getNamespaceName(ns)
65+
sc = sys.calls()
5566
ans = nsname=="data.table" ||
5667
"data.table" %chin% names(getNamespaceImports(ns)) || # most common and recommended cases first for speed
5768
(nsname=="utils" &&
5869
(exists("debugger.look", parent.frame(n+1L)) ||
59-
(length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
60-
(nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply
70+
(length(sc)>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
71+
(nsname=="base" && # lapply
72+
(all(c("FUN", "X") %chin% ls(parent.frame(n))) ||
73+
.any_sd_queries_in_stack(sc))) ||
6174
(nsname %chin% cedta.pkgEvalsUserCode && .any_eval_calls_in_stack()) ||
6275
nsname %chin% cedta.override ||
6376
isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist

inst/tests/tests.Rraw

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21281,13 +21281,46 @@ if (test_R.utils) local({
2128121281
})
2128221282

2128321283
# Creating a data.table when one vector is transposed didn't respect the name defined by the user, #4124
21284-
test(2321.1, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
21285-
test(2321.2, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
21286-
test(2321.3, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
21287-
test(2321.4, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
21284+
test(2321.01, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
21285+
test(2321.02, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
21286+
test(2321.03, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
21287+
test(2321.04, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
2128821288
## but respect named column vectors
21289-
test(2321.5, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
21290-
test(2321.6, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
21289+
test(2321.05, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
21290+
test(2321.06, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))
21291+
## also respect old naming pattern when invoked indirectly, #7145
21292+
M = cbind(1:3)
21293+
test(2321.07, as.data.table(M), data.table(V1=1:3))
21294+
rownames(M) = c('a', 'b', 'c')
21295+
test(2321.08, as.data.table(M), data.table(V1=1:3))
21296+
test(2321.09, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3))
21297+
colnames(M) = 'zz'
21298+
test(2321.10, as.data.table(M), data.table(zz=1:3))
21299+
test(2321.11, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), zz=1:3))
21300+
colnames(M) = 'x'
21301+
test(2321.12, as.data.table(M), data.table(x=1:3))
21302+
test(2321.13, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3))
21303+
M = cbind(M, y=4:6)
21304+
test(2321.14, as.data.table(M), data.table(x=1:3, y=4:6))
21305+
test(2321.15, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, y=4:6))
21306+
colnames(M) = c('A', 'B')
21307+
test(2321.16, as.data.table(M), data.table(A=1:3, B=4:6))
21308+
test(2321.17, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, B=4:6))
21309+
colnames(M) = NULL
21310+
test(2321.18, as.data.table(M), data.table(V1=1:3, V2=4:6))
21311+
test(2321.19, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
21312+
colnames(M) = c('x', '')
21313+
test(2321.20, as.data.table(M), data.table(x=1:3, V2=4:6))
21314+
test(2321.21, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), x=1:3, V2=4:6))
21315+
colnames(M) = c('', 'x')
21316+
test(2321.22, as.data.table(M), data.table(V1=1:3, x=4:6))
21317+
test(2321.23, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, x=4:6))
21318+
colnames(M) = c('', '')
21319+
test(2321.24, as.data.table(M), data.table(V1=1:3, V2=4:6))
21320+
test(2321.25, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), V1=1:3, V2=4:6))
21321+
colnames(M) = c('A', '')
21322+
test(2321.26, as.data.table(M), data.table(A=1:3, V2=4:6))
21323+
test(2321.27, as.data.table(M, keep.rownames='id'), data.table(id=c('a', 'b', 'c'), A=1:3, V2=4:6))
2129121324

2129221325
# New fctr() helper: like factor() but retaining order by default #4837
2129321326
test(2322.01, levels(fctr(c("b","a","c"))), c("b","a","c"))
@@ -21378,3 +21411,26 @@ dt = data.table(x = 123456, y = "wide_string")
2137821411
test(2329.2, print(dt, col.names = "none"), output = "1: 123456 wide_string\n")
2137921412
dt = data.table(a = NA_integer_, b = NaN)
2138021413
test(2329.3, print(dt, col.names = "none"), output = "1: NA NaN\n")
21414+
21415+
# Row name extraction from multiple vectors, #7136
21416+
x <- 1:3
21417+
y <- setNames(4:6, c("A", "B", "C"))
21418+
test(2330.1, as.data.table(list(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=1:3, V2=4:6))
21419+
test(2330.2, as.data.table(list(x, y), keep.rownames="custom"), data.table(custom=c("A", "B", "C"), V1=1:3, V2=4:6))
21420+
test(2330.3, as.data.table(list(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), V1=4:6, V2=1:3))
21421+
21422+
# Behavior under data.frame()
21423+
test(2330.4, as.data.table(data.frame(x, y), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), x=1:3, y=4:6))
21424+
test(2330.5, as.data.table(data.frame(y, x), keep.rownames=TRUE), data.table(rn=c("A", "B", "C"), y=4:6, x=1:3))
21425+
21426+
DF <- data.frame(row.names = letters[1:6], V = 1:6) # Test data.frame with explicit rownames
21427+
test(2330.6, as.data.table(list(a = 6:1, DF), keep.rownames=TRUE), data.table(rn=letters[1:6], a=6:1, V=1:6))
21428+
21429+
z <- setNames(1:3, rep("", 3)) # vector with all-empty names # behaviour with all-empty row names
21430+
test(2330.7, as.data.table(list(z), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3))
21431+
21432+
M <- matrix(1:6, nrow=3, dimnames=list(rep("", 3), c("V1", "V2"))) # test of list(M) for empty-rowname'd matrix input
21433+
test(2330.8, as.data.table(list(M), keep.rownames=TRUE), data.table(rn=rep("", 3), V1=1:3, V2=4:6))
21434+
21435+
# .SD reference in '...' passed to lapply(FUN=) is recognized as data.table
21436+
test(2331, lapply(list(data.table(a=1:2)), `[`, j=.SD[1L]), list(data.table(a=1L)))

man/as.data.table.Rd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ is.data.table(x)
3131
}
3232
\arguments{
3333
\item{x}{An R object.}
34-
\item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.}
34+
\item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead. For lists and when calling \code{data.table()}, names from the first named vector are extracted and used as row names, similar to \code{data.frame()} behavior.}
3535
\item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. }
3636
\item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. }
3737
\item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.}

man/data.table.Rd

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,11 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
117117
118118
\item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use \code{keyby=} routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.}
119119
120-
\item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to \code{cols} variable parent scope and not from your dataset.
120+
\item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. When a variable name exists both as a column of \code{x} and in the parent scope, you can use the double-dot prefix \code{..cols} to refer explicitly to the \code{cols} variable in the parent scope rather than to the column of \code{x}.
121121
122-
When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.}
122+
When \code{j} is a character vector of column names, a numeric vector of column positions to select, or of the form \code{startcol:endcol}, the value returned is always a \code{data.table}.
123+
124+
New code should rarely use this argument, which was originally needed for similarity to data.frame. For example, to select columns from a character vector \code{cols}, in data.frame we do \code{x[, cols]}, which has several equivalents in data.table: \code{x[, .SD, .SDcols=cols]}, \code{x[, ..cols]}, \code{x[, cols, env = list(cols = I(cols))]}, or \code{x[, cols, with=FALSE]}.}
123125
124126
\item{nomatch}{ When a row in \code{i} has no match to \code{x}, \code{nomatch=NA} (default) means \code{NA} is returned. \code{NULL} (or \code{0} for backward compatibility) means no rows will be returned for that row of \code{i}. }
125127

vignettes/datatable-reshape.Rmd

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -142,31 +142,31 @@ So far we've seen features of `melt` and `dcast` that are implemented efficientl
142142
However, there are situations we might run into where the desired operation is not expressed in a straightforward manner. For example, consider the `data.table` shown below:
143143

144144
```{r}
145-
s2 <- "family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
146-
1 30 1998-11-26 2000-01-29 NA 1 2 NA
147-
2 27 1996-06-22 NA NA 2 NA NA
148-
3 26 2002-07-11 2004-04-05 2007-09-02 2 2 1
149-
4 32 2004-10-10 2009-08-27 2012-07-21 1 1 1
150-
5 29 2000-12-05 2005-02-28 NA 2 1 NA"
145+
s2 <- "family_id age_mother name_child1 name_child2 name_child3 gender_child1 gender_child2 gender_child3
146+
1 30 Ben Anna NA 1 2 NA
147+
2 27 Tom NA NA 2 NA NA
148+
3 26 Lia Sam Amy 2 2 1
149+
4 32 Max Zoe Joe 1 1 1
150+
5 29 Dan Eva NA 2 1 NA"
151151
DT <- fread(s2)
152152
DT
153153
## 1 = female, 2 = male
154154
```
155155

156-
And you'd like to combine (`melt`) all the `dob` columns together, and `gender` columns together. Using the old functionality, we could do something like this:
156+
Suppose you'd like to combine (`melt`) all the `name` columns together, and all the `gender` columns together. Using the old functionality, we could do something like this:
157157

158158
```{r}
159159
DT.m1 = melt(DT, id.vars = c("family_id", "age_mother"))
160160
DT.m1[, c("variable", "child") := tstrsplit(variable, "_", fixed = TRUE)]
161161
DT.c1 = dcast(DT.m1, family_id + age_mother + child ~ variable, value.var = "value")
162162
DT.c1
163163
164-
str(DT.c1) ## gender column is class IDate now!
164+
str(DT.c1) ## gender column is character type now!
165165
```
166166

167167
#### Issues
168168

169-
1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead, we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient).
169+
1. What we wanted to do was to combine all the `name` and `gender` type columns together respectively. Instead, we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient).
170170

171171
As an analogy, imagine you have a closet with four shelves of clothes, and you'd like to put the clothes from shelves 1 and 2 together (in 1), and those from shelves 3 and 4 together (in 3). What we are doing is more or less to combine all the clothes together, and then split them back onto shelves 1 and 3!
172172

@@ -189,9 +189,9 @@ Since we'd like for `data.table`s to perform this operation straightforward and
189189
The idea is quite simple. We pass a list of columns to `measure.vars`, where each element of the list contains the columns that should be combined together.
190190

191191
```{r}
192-
colA = paste0("dob_child", 1:3)
192+
colA = paste0("name_child", 1:3)
193193
colB = paste0("gender_child", 1:3)
194-
DT.m2 = melt(DT, measure.vars = list(colA, colB), value.name = c("dob", "gender"))
194+
DT.m2 = melt(DT, measure.vars = list(colA, colB), value.name = c("name", "gender"))
195195
DT.m2
196196
197197
str(DT.m2) ## col type is preserved
@@ -206,7 +206,7 @@ str(DT.m2) ## col type is preserved
206206
Usually in these problems, the columns we'd like to melt can be distinguished by a common pattern. We can use the function `patterns()`, implemented for convenience, to provide regular expressions for the columns to be combined together. The above operation can be rewritten as:
207207

208208
```{r}
209-
DT.m2 = melt(DT, measure.vars = patterns("^dob", "^gender"), value.name = c("dob", "gender"))
209+
DT.m2 = melt(DT, measure.vars = patterns("^name", "^gender"), value.name = c("name", "gender"))
210210
DT.m2
211211
```
212212

@@ -305,7 +305,7 @@ We can now provide **multiple `value.var` columns** to `dcast` for `data.table`s
305305

306306
```{r}
307307
## new 'cast' functionality - multiple value.vars
308-
DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "gender"))
308+
DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("name", "gender"))
309309
DT.c2
310310
```
311311

0 commit comments

Comments
 (0)