Skip to content

Commit 7d6aea9

Browse files
committed
Merge branch 'master' into frev
2 parents 832324c + 3eefbca commit 7d6aea9

File tree

6 files changed

+122
-65
lines changed

6 files changed

+122
-65
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoid type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR.
2424

25+
6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for the PR.
26+
2527
## BUG FIXES
2628

2729
1. `unique()` returns a copy in the case when `nrow(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.

R/data.table.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,8 +1122,8 @@ replace_dot_alias = function(e) {
11221122
if (is.name(lhs)) {
11231123
lhs = as.character(lhs)
11241124
} else {
1125-
# e.g. (MyVar):= or get("MyVar"):=
1126-
lhs = eval(lhs, parent.frame(), parent.frame())
1125+
# lhs is e.g. (MyVar), get("MyVar"), names(.SD), or setdiff(names(.SD), cols)
1126+
lhs = eval(lhs, list(.SD = setNames(logical(length(sdvars)), sdvars)), parent.frame())
11271127
}
11281128
} else {
11291129
# `:=`(c2=1L,c3=2L,...)

inst/tests/tests.Rraw

Lines changed: 74 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18352,53 +18352,92 @@ if (test_bit64) {
1835218352
test(2248, dcast(apple, id ~ time, value.var = "y"), data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id'))
1835318353
}
1835418354

18355+
# Unit tests for DT[, .SD] retaining secondary indices, #1709
18356+
DT = data.table(x=1:5, y=6:10)
18357+
setindex(DT, x)
18358+
test(2249.1, indices(DT), 'x')
18359+
test(2249.2, indices(DT[, .SD]), 'x')
18360+
setindex(DT, y)
18361+
test(2249.3, indices(DT), c('x', 'y'))
18362+
test(2249.4, indices(DT[, .SD]), c('x', 'y'))
18363+
18364+
# make names(.SD) work - issue #795
18365+
dt = data.table(a = 1:4, b = 5:8)
18366+
test(2250.01, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8))
18367+
test(2250.02, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2))
18368+
test(2250.03, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2)))
18369+
test(2250.04, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2))))
18370+
test(2250.05, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2))))
18371+
test(2250.06, dt[, c(names(.SD)) := NULL], null.data.table())
18372+
18373+
dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))
18374+
test(2250.07, dt[, names(.SD) := lapply(.SD, max), by = grp], data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c')))
18375+
18376+
dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))
18377+
keep = c('a', 'b')
18378+
test(2250.08, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8))
18379+
18380+
dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'))
18381+
test(2250.09, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'), a_max = c(2L, 2L, 3L, 4L), b_max = c(6L, 6L, 7L, 8L)))
18382+
18383+
dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'))
18384+
test(2250.10, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_)))
18385+
test(2250.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find function ".SD"')
18386+
18387+
dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'))
18388+
test(2250.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.table(a = 1:3 + 5:7, b = 5:7, grp = c('a', 'a', 'b')))
18389+
18390+
18391+
dt = data.table(a = 1L, b = 2L, c = 3L, d = 4L, e = 5L, f = 6L)
18392+
test(2250.13, dt[, names(.SD)[1:5] := sum(.SD)], data.table(a = 21L, b = 21L, c = 21L, d = 21L, e = 21L, f = 6L))
18393+
1835518394
# 5885 implement frev
1835618395
d = c(NA, NaN, Inf, -Inf)
18357-
test(2249.00, frev(c(FALSE, NA)), c(NA, FALSE))
18358-
test(2249.01, frev(c(0L, NA)), c(NA, 0L))
18359-
test(2249.02, frev(d), c(-Inf, Inf, NaN, NA))
18360-
test(2249.03, frev(c(NA, 1, 0+2i)), c(0+2i, 1, NA))
18361-
test(2249.04, frev(as.raw(0:1)), as.raw(1:0))
18362-
test(2249.05, frev(NULL), NULL)
18363-
test(2249.06, frev(character(5)), character(5))
18364-
test(2249.07, frev(integer(0)), integer(0))
18365-
test(2249.08, frev(list(1, "a")), list("a", 1))
18366-
test(2249.09, setrev(c(0L, NA)), c(NA, 0L))
18367-
test(2249.10, setrev(d), c(-Inf, Inf, NaN, NA))
18368-
test(2249.11, setrev(c(NA, 1, 0+2i)), c(0+2i, 1, NA))
18369-
test(2249.12, setrev(as.raw(0:1)), as.raw(1:0))
18370-
test(2249.13, setrev(NULL), NULL)
18371-
test(2249.14, setrev(character(5)), character(5))
18372-
test(2249.15, setrev(integer(0)), integer(0))
18373-
test(2249.16, setrev(list(1, "a")), list("a", 1))
18374-
test(2249.17, frev(1:1e2), rev(1:1e2))
18396+
test(2251.00, frev(c(FALSE, NA)), c(NA, FALSE))
18397+
test(2251.01, frev(c(0L, NA)), c(NA, 0L))
18398+
test(2251.02, frev(d), c(-Inf, Inf, NaN, NA))
18399+
test(2251.03, frev(c(NA, 1, 0+2i)), c(0+2i, 1, NA))
18400+
test(2251.04, frev(as.raw(0:1)), as.raw(1:0))
18401+
test(2251.05, frev(NULL), NULL)
18402+
test(2251.06, frev(character(5)), character(5))
18403+
test(2251.07, frev(integer(0)), integer(0))
18404+
test(2251.08, frev(list(1, "a")), list("a", 1))
18405+
test(2251.09, setrev(c(0L, NA)), c(NA, 0L))
18406+
test(2251.10, setrev(d), c(-Inf, Inf, NaN, NA))
18407+
test(2251.11, setrev(c(NA, 1, 0+2i)), c(0+2i, 1, NA))
18408+
test(2251.12, setrev(as.raw(0:1)), as.raw(1:0))
18409+
test(2251.13, setrev(NULL), NULL)
18410+
test(2251.14, setrev(character(5)), character(5))
18411+
test(2251.15, setrev(integer(0)), integer(0))
18412+
test(2251.16, setrev(list(1, "a")), list("a", 1))
18413+
test(2251.17, frev(1:1e2), rev(1:1e2))
1837518414
# copy arguments
1837618415
x = 1:3
18377-
test(2249.21, {frev(x); x}, 1:3)
18378-
test(2249.22, {setrev(x); x}, 3:1)
18379-
test(2249.23, address(x) == address(setrev(x)))
18380-
test(2249.24, address(x) != address(frev(x)))
18416+
test(2251.21, {frev(x); x}, 1:3)
18417+
test(2251.22, {setrev(x); x}, 3:1)
18418+
test(2251.23, address(x) == address(setrev(x)))
18419+
test(2251.24, address(x) != address(frev(x)))
1838118420
# do not alter on subsets
18382-
test(2249.25, {setrev(x[1:2]); x}, 1:3)
18421+
test(2251.25, {setrev(x[1:2]); x}, 1:3)
1838318422
# levels
1838418423
f = as.factor(letters)
18385-
test(2249.31, frev(f), rev(f))
18386-
test(2249.32, frev(as.IDate(1:10)), as.IDate(10:1))
18387-
test(2249.33, frev(as.IDate(1:10)), as.IDate(10:1))
18424+
test(2251.31, frev(f), rev(f))
18425+
test(2251.32, frev(as.IDate(1:10)), as.IDate(10:1))
18426+
test(2251.33, frev(as.IDate(1:10)), as.IDate(10:1))
1838818427
# names
1838918428
x = c(a=1L, b=2L, c=3L)
18390-
test(2249.41, frev(x), rev(x))
18391-
test(2249.42, setrev(x), x)
18429+
test(2251.41, frev(x), rev(x))
18430+
test(2251.42, setrev(x), x)
1839218431
# attributes
1839318432
x = structure(1:10, class = c("IDate", "Date"), att = 1L)
18394-
test(2249.51, attr(frev(x), "att"), 1L)
18395-
test(2249.52, attr(setrev(x), "att"), 1L)
18433+
test(2251.51, attr(frev(x), "att"), 1L)
18434+
test(2251.52, attr(setrev(x), "att"), 1L)
1839618435
# errors
18397-
test(2249.61, frev(data.table()), error="should not be data.frame or data.table")
18398-
test(2249.62, frev(expression(1)), error="is not supported by frev")
18399-
test(2249.63, frev(matrix(1)), error="should not be matrix or array")
18436+
test(2251.61, frev(data.table()), error="should not be data.frame or data.table")
18437+
test(2251.62, frev(expression(1)), error="is not supported by frev")
18438+
test(2251.63, frev(matrix(1)), error="should not be matrix or array")
1840018439
if (test_bit64) {
1840118440
x = as.integer64(c(1, NA, 3))
18402-
test(2249.71, frev(x), rev(x))
18403-
test(2249.72, setrev(x), x)
18441+
test(2251.71, frev(x), rev(x))
18442+
test(2251.72, setrev(x), x)
1840418443
}

man/assign.Rd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
# LHS2 = RHS2,
2727
# ...), by = ...]
2828

29+
# 3. Multiple columns in place
30+
# DT[i, names(.SD) := lapply(.SD, fx), by = ..., .SDcols = ...]
31+
2932
set(x, i = NULL, j, value)
3033
}
3134
\arguments{

vignettes/datatable-reference-semantics.Rmd

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,23 @@ flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL]
258258
head(flights)
259259
```
260260

261+
#### -- How can we update multiple existing columns in place using `.SD`?
262+
263+
```{r}
264+
flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = is.character]
265+
```
266+
Let's clean up again and convert our newly-made factor columns back into character columns. This time we will make use of `.SDcols` accepting a function to decide which columns to include. In this case, `is.factor()` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html).
267+
268+
Sometimes, it is also nice to keep track of columns that we transform. That way, even after we convert our columns we would be able to call the specific columns we were updating.
269+
```{r}
270+
factor_cols <- sapply(flights, is.factor)
271+
flights[, names(.SD) := lapply(.SD, as.character), .SDcols = factor_cols]
272+
str(flights[, ..factor_cols])
273+
```
274+
#### {.bs-callout .bs-callout-info}
275+
276+
* We also could have used `(factor_cols)` on the `LHS` instead of `names(.SD)`.
277+
261278
## 3. `:=` and `copy()`
262279

263280
`:=` modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment.

vignettes/datatable-sd-usage.Rmd

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,15 @@ The first way to impact what `.SD` is is to limit the _columns_ contained in `.S
7777
Pitching[ , .SD, .SDcols = c('W', 'L', 'G')]
7878
```
7979

80-
This is just for illustration and was pretty boring. But even this simply usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations:
80+
This is just for illustration and was pretty boring. In addition to accepting a character vector, `.SDcols` also accepts:
81+
82+
1. any function such as `is.character` to filter _columns_
83+
2. the function^{*} `patterns()` to filter _column names_ by regular expression
84+
3. integer and logical vectors
85+
86+
*see `?patterns` for more details
87+
88+
This simple usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations:
8189

8290
## Column Type Conversion
8391

@@ -91,52 +99,40 @@ We notice that the following columns are stored as `character` in the `Teams` da
9199
# teamIDretro: Team ID used by Retrosheet
92100
fkt = c('teamIDBR', 'teamIDlahman45', 'teamIDretro')
93101
# confirm that they're stored as `character`
94-
Teams[ , sapply(.SD, is.character), .SDcols = fkt]
102+
str(Teams[ , ..fkt])
95103
```
96104

97-
If you're confused by the use of `sapply` here, note that it's quite similar for base R `data.frames`:
98-
99-
```{r identify_factors_as_df}
100-
setDF(Teams) # convert to data.frame for illustration
101-
sapply(Teams[ , fkt], is.character)
102-
setDT(Teams) # convert back to data.table
103-
```
104-
105-
The key to understanding this syntax is to recall that a `data.table` (as well as a `data.frame`) can be considered as a `list` where each element is a column -- thus, `sapply`/`lapply` applies the `FUN` argument (in this case, `is.character`) to each _column_ and returns the result as `sapply`/`lapply` usually would.
106-
107-
The syntax to now convert these columns to `factor` is very similar -- simply add the `:=` assignment operator:
105+
The syntax to now convert these columns to `factor` is simple:
108106

109107
```{r assign_factors}
110-
Teams[ , (fkt) := lapply(.SD, factor), .SDcols = fkt]
108+
Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('teamID')]
111109
# print out the first column to demonstrate success
112110
head(unique(Teams[[fkt[1L]]]))
113111
```
114112

115-
Note that we must wrap `fkt` in parentheses `()` to force `data.table` to interpret this as column names, instead of trying to assign a column named `'fkt'`.
113+
Note:
116114

117-
Actually, the `.SDcols` argument is quite flexible; above, we supplied a `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. `.SDcols` even accepts regular expression-based pattern matching.
115+
1. The `:=` is an assignment operator to update the `data.table` in place without making a copy. See [reference semantics](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reference-semantics.html) for more.
116+
2. The LHS, `names(.SD)`, indicates which columns we are updating - in this case we update the entire `.SD`.
117+
3. The RHS, `lapply()`, loops through each column of the `.SD` and converts the column to a factor.
118+
4. We use `.SDcols` to select only the columns whose names match the pattern `teamID`.
119+
120+
Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. Finally, the use of a function to filter columns is very helpful.
118121

119122
For example, we could do the following to convert all `factor` columns to `character`:
120123

121124
```{r sd_as_logical}
122-
# while .SDcols accepts a logical vector,
123-
# := does not, so we need to convert to column
124-
# positions with which()
125-
fkt_idx = which(sapply(Teams, is.factor))
126-
Teams[ , (fkt_idx) := lapply(.SD, as.character), .SDcols = fkt_idx]
127-
head(unique(Teams[[fkt_idx[1L]]]))
125+
fct_idx = Teams[, which(sapply(.SD, is.factor))] # column numbers to show the class changing
126+
str(Teams[[fct_idx[1L]]])
127+
Teams[ , names(.SD) := lapply(.SD, as.character), .SDcols = is.factor]
128+
str(Teams[[fct_idx[1L]]])
128129
```
129130

130131
Lastly, we can do pattern-based matching of columns in `.SDcols` to select all columns whose names contain `team` and convert them back to `factor`:
131132

132133
```{r sd_patterns}
133134
Teams[ , .SD, .SDcols = patterns('team')]
134-
135-
# now convert these columns to factor;
136-
# value = TRUE in grep() is for the LHS of := to
137-
# get column names instead of positions
138-
team_idx = grep('team', names(Teams), value = TRUE)
139-
Teams[ , (team_idx) := lapply(.SD, factor), .SDcols = team_idx]
135+
Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('team')]
140136
```
141137

142138
** A proviso to the above: _explicitly_ using column numbers (like `DT[ , (1) := rnorm(.N)]`) is bad practice and can lead to silently corrupted code over time if column positions change. Even implicitly using numbers can be dangerous if we don't keep smart/strict control over the ordering of when we create the numbered index and when we use it.

0 commit comments

Comments
 (0)