dcast only computes default fill if necessary (#5549)

tdhock · shrektan · MichaelChirico · web-flow · commit f92aee69e653 · 2024-03-14T09:11:11.000-07:00
* delete old commented code * new test for no warning fails * only compute default fill if missing cells present * any_NA_int helper * bugfix #5512 * Update src/fcast.c Co-authored-by: Xianying Tan <shrektan@126.com> * Update src/fcast.c Co-authored-by: Xianying Tan <shrektan@126.com> * mention warning text * const int args * add back ithiscol * get pointer before for loop * add test case from Michael * test min(dbl) and no warning when fill specified * Revert "delete old commented code" This reverts commit 2886c4f. * use suggestions from Michael * rm inline any_NA_int since that causes install to fail * clarify comment * link 5390 * mymin test fails * compute some_fill using anyNA in R then pass to C * Update R/fcast.R Co-authored-by: Michael Chirico <michaelchirico4@gmail.com> * Update R/fcast.R Co-authored-by: Michael Chirico <michaelchirico4@gmail.com> * dat_for_default_fill is zero-row dt * !length instead of length==0 * new dcast tests with fill=character * dat_for_default_fill is dat again, not 0-row, because that causes some test failure --------- Co-authored-by: Xianying Tan <shrektan@126.com> Co-authored-by: Michael Chirico <michaelchirico4@gmail.com>
diff --git a/NEWS.md b/NEWS.md
@@ -28,6 +28,8 @@
 
 3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix.
 
+4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix.
+
 ## NOTES
 
 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.
diff --git a/R/fcast.R b/R/fcast.R
@@ -152,23 +152,22 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
     dat = .Call(CsubsetDT, dat, idx, seq_along(dat))
   }
   fun.call = m[["fun.aggregate"]]
-  fill.default = NULL
   if (is.null(fun.call)) {
     oo = forderv(dat, by=varnames, retGrp=TRUE)
     if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) {
       messagef("'fun.aggregate' is NULL, but found duplicate row/column combinations, so defaulting to length(). That is, the variables %s used in 'formula' do not uniquely identify rows in the input 'data'. In such cases, 'fun.aggregate' is used to derive a single representative value for each combination in the output data.table, for example by summing or averaging (fun.aggregate=sum or fun.aggregate=mean, respectively). Check the resulting table for values larger than 1 to see which combinations were not unique. See ?dcast.data.table for more details.", brackify(varnames))
       fun.call = quote(length)
     }
   }
-  if (!is.null(fun.call)) {
+  dat_for_default_fill = dat
+  run_agg_funs = !is.null(fun.call)
+  if (run_agg_funs) {
     fun.call = aggregate_funs(fun.call, lvals, sep, ...)
-    errmsg = gettext("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.")
-    if (is.null(fill)) {
-      fill.default = suppressWarnings(dat[0L][, eval(fun.call)])
-      # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stopf(errmsg))
-      if (nrow(fill.default) != 1L) stopf(errmsg)
+    maybe_err = function(list.of.columns) {
+      if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.")
+      list.of.columns
     }
-    dat = dat[, eval(fun.call), by=c(varnames)]
+    dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)]
   }
   order_ = function(x) {
     o = forderv(x, retGrp=TRUE, sort=TRUE)
@@ -211,7 +210,12 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
     }
     maplen = vapply_1i(mapunique, length)
     idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
-    ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call))
+    some_fill = anyNA(idx)
+    fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))]
+    if (run_agg_funs && is.null(fill) && some_fill) {
+      fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))]
+    }
+    ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill)
     allcols = do.call("paste", c(rhs, sep=sep))
     if (length(valnames) > 1L)
       allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -3729,6 +3729,21 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,
   DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L)
   test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")),
                 c("x", "1.a", "1.b", "2.a", "2.b"))
+
+  # test for #5512, only compute default fill if needed.
+  DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6))
+  mymin <- function(x){
+    if (!length(x)) stop("calling mymin on vector of length 0")
+    min(x)
+  }
+  test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed.
+  ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int")
+  test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA.
+  test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()).
+  test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion.
+  test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", "NAs introduced by coercion"))
+  test(1102.44, dcast(DT, int ~ ., value.var="dbl", fill="ignored"), data.table(int=1:3, .=c(4,5,6), key="int"))
+  
 }
 
 # test for freading commands
diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd
@@ -22,7 +22,7 @@
   \item{\dots}{Any other arguments that may be passed to the aggregating function.}
   \item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.}
   \item{subset}{Specified if casting should be done on a subset of the data. Ex: \code{subset = .(col1 <= 5)} or \code{subset = .(variable != "January")}.}
-  \item{fill}{Value with which to fill missing cells. If \code{fun.aggregate} is present, takes the value by applying the function on a 0-length vector.}
+  \item{fill}{Value with which to fill missing cells. If \code{fill=NULL} and missing cells are present, then \code{fun.aggregate} is used on a 0-length vector to obtain a fill value.}
   \item{drop}{\code{FALSE} will cast by including all missing combinations.
 
   \code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}; \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. See Examples.}
diff --git a/src/data.table.h b/src/data.table.h
@@ -289,7 +289,7 @@ SEXP setlistelt(SEXP, SEXP, SEXP);
 SEXP address(SEXP);
 SEXP expandAltRep(SEXP);
 SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 SEXP issorted(SEXP, SEXP);
 SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 SEXP gsum(SEXP, SEXP);
diff --git a/src/fcast.c b/src/fcast.c
@@ -4,7 +4,7 @@
 // raise(SIGINT);
 
 // TO DO: margins
-SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg) {
+SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg, SEXP some_fillArg) {
   int nrows=INTEGER(nrowArg)[0], ncols=INTEGER(ncolArg)[0];
   int nlhs=length(lhs), nval=length(val), *idx = INTEGER(idxArg);
   SEXP target;
@@ -15,24 +15,28 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil
     SET_VECTOR_ELT(ans, i, VECTOR_ELT(lhs, i));
   }
   // get val cols
+  bool some_fill = LOGICAL(some_fillArg)[0];
   for (int i=0; i<nval; ++i) {
     const SEXP thiscol = VECTOR_ELT(val, i);
     SEXP thisfill = fill;
     const SEXPTYPE thistype = TYPEOF(thiscol);
     int nprotect = 0;
-    if (isNull(fill)) {
-      if (LOGICAL(is_agg)[0]) {
-        thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++;
-      } else thisfill = VECTOR_ELT(fill_d, i);
-    }
-    if (isVectorAtomic(thiscol)) { // defer error handling to below, but also skip on list
-      thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++;
+    if(some_fill){
+      if (isNull(fill)) {
+	if (LOGICAL(is_agg)[0]) {
+	  thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++;
+	} else thisfill = VECTOR_ELT(fill_d, i);
+      }
+      if (isVectorAtomic(thiscol)) { // defer error handling to below, but also skip on list
+	thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++;
+      }
     }
     switch (thistype) {
     case INTSXP:
     case LGLSXP: {
       const int *ithiscol = INTEGER(thiscol);
-      const int *ithisfill = INTEGER(thisfill);
+      const int *ithisfill = 0;
+      if (some_fill) ithisfill = INTEGER(thisfill);
       for (int j=0; j<ncols; ++j) {
         SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
         int *itarget = INTEGER(target);
@@ -45,7 +49,8 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil
     } break;
     case REALSXP: {
       const double *dthiscol = REAL(thiscol);
-      const double *dthisfill = REAL(thisfill);
+      const double *dthisfill = 0;
+      if (some_fill) dthisfill = REAL(thisfill);
       for (int j=0; j<ncols; ++j) {
         SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
         double *dtarget = REAL(target);
@@ -58,7 +63,8 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil
     } break;
     case CPLXSXP: {
       const Rcomplex *zthiscol = COMPLEX(thiscol);
-      const Rcomplex *zthisfill = COMPLEX(thisfill);
+      const Rcomplex *zthisfill = 0;
+      if (some_fill) zthisfill = COMPLEX(thisfill);
       for (int j=0; j<ncols; ++j) {
         SET_VECTOR_ELT(ans, nlhs+j+i*ncols, target=allocVector(thistype, nrows) );
         Rcomplex *ztarget = COMPLEX(target);