Merge branch 'master' into redundantParameterCleanup

badasahog · web-flow · commit b7aed3177fe0 · 2025-07-17T03:25:19.000-04:00
diff --git a/.ci/linters/r/eval_parse_linter.R b/.ci/linters/r/eval_parse_linter.R
@@ -0,0 +1,8 @@
+eval_parse_linter = make_linter_from_xpath(
+  "//SYMBOL_FUNCTION_CALL[text() = 'parse']
+     /ancestor::expr
+     /preceding-sibling::expr[SYMBOL_FUNCTION_CALL[text() = 'eval']]
+     /parent::expr
+  ",
+  "Avoid eval(parse()); build the language directly, possibly using substitute2()."
+)
diff --git a/NEWS.md b/NEWS.md
@@ -84,6 +84,8 @@
 
 13. Reference to `.SD` in `...` arguments to `lapply()`, e.g. ``lapply(list_of_tables, `[`, j=.SD[1L])`` is evaluated correctly, [#2982](https://github.com/Rdatatable/data.table/issues/2982). Thanks @franknarf1 for the report and @MichaelChirico for the fix.
 
+14. Filling columns of class Date with POSIXct (and vice versa) using `shift()` now yields a clear, informative error message specifying the class mismatch, [#5218](https://github.com/Rdatatable/data.table/issues/5218). Thanks @ashbaldry for the report and @ben-schwen for the fix.
+
 ### NOTES
 
 1. The following in-progress deprecations have proceeded:
diff --git a/R/data.table.R b/R/data.table.R
@@ -97,34 +97,32 @@ replace_dot_alias = function(e) {
 }
 
 .checkTypos = function(err, ref) {
+  err_str <- conditionMessage(err)
   # a slightly wonky workaround so that this still works in non-English sessions, #4989
   # generate this at run time (as opposed to e.g. onAttach) since session language is
   #   technically OK to update (though this should be rare), and since it's low-cost
   #   to do so here because we're about to error anyway.
-  missing_obj_fmt = gsub(
-    "'missing_datatable_variable____'",
+  missing_obj_regex = gsub(
+    "'____missing_datatable_variable____'",
     "'(?<obj_name>[^']+)'",
-    tryCatch(eval(parse(text="missing_datatable_variable____")), error=identity)$message
-    # eval(parse()) to avoid "no visible binding for global variable" note from R CMD check
-    # names starting with _ don't parse, so no leading _ in the name
+    # eval(quote()) to avoid "no visible binding for global variable" note from R CMD check
+    conditionMessage(tryCatch(eval(quote(`____missing_datatable_variable____`)), error=identity)),
+    fixed=TRUE
   )
-  idx = regexpr(missing_obj_fmt, err$message, perl=TRUE)
-  if (idx > 0L) {
-    start = attr(idx, "capture.start", exact=TRUE)[ , "obj_name"]
-    used = substr(
-      err$message,
-      start,
-      start + attr(idx, "capture.length", exact=TRUE)[ , "obj_name"] - 1L
-    )
-    found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE)
-    if (length(found)) {
-      stopf("Object '%s' not found. Perhaps you intended %s", used, brackify(found))
-    } else {
-      stopf("Object '%s' not found amongst %s", used, brackify(ref))
-    }
+  idx = regexpr(missing_obj_regex, err_str, perl=TRUE)
+  if (idx == -1L)
+    stopf("%s", err_str, domain=NA) # Don't pass err_str as the format itself, since it might contain '%', #6588
+  start = attr(idx, "capture.start", exact=TRUE)[ , "obj_name"]
+  used = substr(
+    err_str,
+    start,
+    start + attr(idx, "capture.length", exact=TRUE)[ , "obj_name"] - 1L
+  )
+  found = agrep(used, ref, value=TRUE, ignore.case=TRUE, fixed=TRUE)
+  if (length(found)) {
+    stopf("Object '%s' not found. Perhaps you intended %s", used, brackify(found))
   } else {
-    # Don't use stopf() directly, since err$message might have '%', #6588
-    stopf("%s", err$message, domain=NA)
+    stopf("Object '%s' not found amongst %s", used, brackify(ref))
   }
 }
 
diff --git a/R/onLoad.R b/R/onLoad.R
@@ -73,31 +73,29 @@
   # In fread and fwrite we have moved back to using getOption's default argument since it is unlikely fread and fread will be called in a loop many times, plus they
   # are relatively heavy functions where the overhead in getOption() would not be noticed.  It's only really [.data.table where getOption default bit.
   # Improvement to base::getOption() now submitted (100x; 5s down to 0.05s):  https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17394
-  opts = c(
-       "datatable.verbose"="FALSE",            # datatable.<argument name>
-       "datatable.optimize"="Inf",             # datatable.<argument name>
-       "datatable.print.nrows"="100L",         # datatable.<argument name>
-       "datatable.print.topn"="5L",            # datatable.<argument name>
-       "datatable.print.class"="TRUE",         # for print.data.table
-       "datatable.print.rownames"="TRUE",      # for print.data.table
-       "datatable.print.colnames"="'auto'",    # for print.data.table
-       "datatable.print.keys"="TRUE",          # for print.data.table
-       "datatable.print.trunc.cols"="FALSE",   # for print.data.table
-       "datatable.show.indices"="FALSE",       # for print.data.table
-       "datatable.allow.cartesian"="FALSE",    # datatable.<argument name>
-       "datatable.join.many"="TRUE",           # mergelist, [.data.table #4383 #914
-       "datatable.dfdispatchwarn"="TRUE",      # not a function argument
-       "datatable.warnredundantby"="TRUE",     # not a function argument
-       "datatable.alloccol"="1024L",           # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
-       "datatable.auto.index"="TRUE",          # DT[col=="val"] to auto add index so 2nd time faster
-       "datatable.use.index"="TRUE",           # global switch to address #1422
-       "datatable.prettyprint.char" = NULL,    # FR #1091
-       "datatable.old.matrix.autoname"="TRUE", # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
-       NULL
-       )
-  for (i in setdiff(names(opts),names(options()))) {
-    eval(parse(text=paste0("options(",i,"=",opts[i],")")))
-  }
+  opts = list(
+    datatable.verbose=FALSE,            # datatable.<argument name>
+    datatable.optimize=Inf,             # datatable.<argument name>
+    datatable.print.nrows=100L,         # datatable.<argument name>
+    datatable.print.topn=5L,            # datatable.<argument name>
+    datatable.print.class=TRUE,         # for print.data.table
+    datatable.print.rownames=TRUE,      # for print.data.table
+    datatable.print.colnames='auto',    # for print.data.table
+    datatable.print.keys=TRUE,          # for print.data.table
+    datatable.print.trunc.cols=FALSE,   # for print.data.table
+    datatable.show.indices=FALSE,       # for print.data.table
+    datatable.allow.cartesian=FALSE,    # datatable.<argument name>
+    datatable.join.many=TRUE,           # mergelist, [.data.table #4383 #914
+    datatable.dfdispatchwarn=TRUE,      # not a function argument
+    datatable.warnredundantby=TRUE,     # not a function argument
+    datatable.alloccol=1024L,           # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
+    datatable.auto.index=TRUE,          # DT[col=="val"] to auto add index so 2nd time faster
+    datatable.use.index=TRUE,           # global switch to address #1422
+    datatable.prettyprint.char=NULL,    # FR #1091
+    datatable.old.matrix.autoname=TRUE  # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+  )
+  opts = opts[!names(opts) %chin% names(options())]
+  options(opts)
 
   # Test R behaviour that changed in v3.1 and is now depended on
   x = 1L:3L
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -6851,6 +6851,13 @@ test(1463.78, shift(x,1:2,   type="cyclic"), list(as.raw(c(5, 1:4)), as.raw(c(4:
 test(1463.79, shift(x,-1L,   type="cyclic"), as.raw(c(2:5, 1)))
 test(1463.80, shift(x,-(1:2),type="cyclic"), list(as.raw(c(2:5, 1)), as.raw(c(3:5,1:2))))
 
+# shift incompatible types (e.g. Date and POSIXct)
+d = .Date(0:4)
+p = .POSIXct(1:5)
+test(1463.81, shift(d, fill=p[1L]), error="Filling Date with POSIXct .* unsupported.*")
+test(1463.82, shift(p, fill=d[1L]), error="Filling POSIXct with Date .* unsupported.*")
+test(1463.83, shift(d, fill=as.IDate(2000L)), .Date(c(2000L, 0:3)))
+
 # FR #686
 DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10)
 # Seemingly superfluous 'foo' is needed to test fix for #1942
diff --git a/src/shift.c b/src/shift.c
@@ -40,6 +40,12 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type)
     SEXP elem  = VECTOR_ELT(x, i);
     size_t size  = RTYPE_SIZEOF(elem);
     R_xlen_t xrows = xlength(elem);
+    if ((INHERITS(elem, char_Date) && INHERITS(fill, char_POSIXct)) || (INHERITS(elem, char_POSIXct) && INHERITS(fill, char_Date))) {
+      const char* elem_type = INHERITS(elem, char_Date) ? "Date" : "POSIXct";
+      const char* fill_type = INHERITS(fill, char_Date) ? "Date" : "POSIXct";
+      error(_("Filling %s with %s using shift() is unsupported. Please convert fill to %s first."),
+            elem_type, fill_type, elem_type);
+    }
     SEXP thisfill = PROTECT(coerceAs(fill, elem, ScalarLogical(0)));  // #4865 use coerceAs for type coercion
     switch (TYPEOF(elem)) {
     case INTSXP: case LGLSXP: {
diff --git a/vignettes/datatable-joins.Rmd b/vignettes/datatable-joins.Rmd
@@ -117,7 +117,7 @@ x[i, on, nomatch]
 \____ secondary data.table
 ```
 
-> Please keep in mind that the standard argument order in `data.table` is `dt[i, j, by]`. For join operations, it is recommended to pass the `on` and `nomatch` arguments by name to avoid using `j` and `by` when they are not needed.
+Note: Please keep in mind that the standard argument order in `data.table` is `dt[i, j, by]`. For join operations, it is recommended to pass the `on` and `nomatch` arguments by name to avoid using `j` and `by` when they are not needed.
 
 ## 3. Equi joins
 
@@ -439,7 +439,7 @@ ProductReceived[ProductSales,
                 allow.cartesian = TRUE]
 ```
 
-> `allow.cartesian` is defaulted to FALSE as this is seldom what the user wants, and such a cross join can lead to a very large number of rows in the result. For example, if Table A has 100 rows and Table B has 50 rows, their Cartesian product would result in 5000 rows (100 * 50). This can quickly become memory-intensive for large datasets.
+Note: `allow.cartesian` is defaulted to FALSE as this is seldom what the user wants, and such a cross join can lead to a very large number of rows in the result. For example, if Table A has 100 rows and Table B has 50 rows, their Cartesian product would result in 5000 rows (100 * 50). This can quickly become memory-intensive for large datasets.
 
 
 #### 3.6.1. Selecting one match
diff --git a/vignettes/fr/datatable-joins.Rmd b/vignettes/fr/datatable-joins.Rmd