Rdatatable · venom1204 · Aug 16, 2025 · Aug 16, 2025 · Aug 16, 2025 · Aug 16, 2025
@@ -107,6 +107,21 @@
 
 18. `fwrite` now allows `dec` to be the same as `sep` for edge cases where only one will be written, e.g. 0-row or 1-column tables. [#7227](https://github.com/Rdatatable/data.table/issues/7227). Thanks @MichaelChirico for the report and @venom1204 for the fix.
 
+19. Using `by=` or `keyby=` with a simple numeric or character vector in `j` (e.g. `DT[, 1:2, by=grp]`) used to silently ignore the grouping argument. This now issues a warning to alert the user that grouping is not applied in this syntax and guides them to use the `.SD` idiom instead. [#5397](https://github.com/Rdatatable/data.table/issues/5397). Thanks to @mcol for the report and @venom1204 for the fix.
+
+    ```r
+    DT = data.table(a=1:4, grp=c(1,1,2,2))
+    DT[, 1, by = grp]
+    #      a
+    #  <int>
+    # 1:     1
+    # 2:     2
+    # 3:     3
+    # 4:     4
+    # Warning message:
+    # `by` or `keyby` is ignored when `j` is a numeric vector...
+    ```
+
 ### NOTES
 
 1. The following in-progress deprecations have proceeded:

@@ -739,6 +739,13 @@ replace_dot_alias = function(e) {
       if (!length(j) && !notj) return( null.data.table() )
       if (is.factor(j)) j = as.character(j)  # fix for FR: #358
       if (is.character(j)) {
+        if (!missingby) {
+        warning(
+          "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ",
+          "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]",
+          call. = FALSE
+        )
+        }
         if (notj) {
           if (anyNA(idx <- chmatch(j, names_x)))
             warningf(ngettext(sum(is.na(idx)), "column not removed because not found: %s", "columns not removed because not found: %s"),
@@ -762,6 +769,12 @@ replace_dot_alias = function(e) {
         # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977)
         #   in both cases leave to the R-level subsetting of i and x together further below
       } else if (is.numeric(j)) {
+        if (!missingby) {
+        warning(
+          "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]",
+          call. = FALSE
+        )
+        }
         j = as.integer(j)
         if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x))
         j = j[j!=0L]

@@ -21620,3 +21620,12 @@ local({
   test(2338.9, {fwrite(dd, f, forceDecimal=FALSE); fread(f)}, di)
 })
 
+# 5397 - keyby/key ignored if numeric indices used in j
+DT = data.table(a=1:4, b=5:8, g=c(1,1,2,2))
+test(2339.1, DT[, 1:2, by=g], DT[, 1:2], warning="`by` or `keyby` is ignored")
+test(2339.2, DT[, 2:1, keyby=g], DT[, 2:1], warning="`by` or `keyby` is ignored")
+test(2339.3, DT[, c("b", "a"), by=g, with=FALSE], DT[, c("b", "a")], warning="`by` or `keyby` is ignored")
+expected_sd = data.table(g=c(1,1,2,2), a=1:4, b=5:8)
+test(2339.4, DT[, .SD[, 1:2], by=g], expected_sd)
+expected_single_int = data.table(g=c(1,2), V1=c(1,1))
+test(2339.5, DT[, 1, by=g], expected_single_int)
@@ -97,7 +97,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac
 
     See \href{../doc/datatable-intro.html}{\code{vignette("datatable-intro")}} and \code{example(data.table)}.}
 
-    \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts:
+    \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that \code{by} and \code{keyby} are ignored when \code{j} is a character or numeric vector used for selecting columns (i.e., when the internal \code{with=FALSE} is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts:    
 
     \itemize{
         \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]}