Add nan parameter to fcoalesce for NaN/NA distinction control (#7189)

Mukulyadav2004 · MichaelChirico · web-flow · commit 66cb6d2393ce · 2025-07-21T12:19:02.000-07:00
* added nan parameter to fcoalesce * Style, link ?nafill * incorporate #7186 insights here too * duplicate loop for NA and NAN arg * tests * added tests for use of vector replacement also * added news entry --------- Co-authored-by: Michael Chirico <chiricom@google.com>
diff --git a/NEWS.md b/NEWS.md
@@ -56,6 +56,8 @@
 
 13. New `mergelist()` and `setmergelist()` similarly work _a la_ `Reduce()` to recursively merge a `list` of data.tables, [#599](https://github.com/Rdatatable/data.table/issues/599). Different join modes (_left_, _inner_, _full_, _right_, _semi_, _anti_, and _cross_) are supported through the `how` argument; duplicate handling goes through the `mult` argument. `setmergelist()` carefully avoids copies where one is not needed, e.g. in a 1:1 left join. Thanks Patrick Nicholson for the FR (in 2013!), @jangorecki for the PR, and @MichaelChirico for extensive reviews and fine-tuning.
 
+14. `fcoalesce()` and `setcoalesce()` gain a `nan` argument to control whether `NaN` values are treated as missing (`nan=NA`, the default) or as non-missing (`nan=NaN`), [#4567](https://github.com/Rdatatable/data.table/issues/4567). This provides full compatibility with `nafill()` behavior. Thanks to @ethanbsmith for the feature request and @Mukulyadav2004 for the implementation.
+
 ### BUG FIXES
 
 1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.
diff --git a/R/wrappers.R b/R/wrappers.R
@@ -2,8 +2,8 @@
 # Very small (e.g. one line) R functions that just call C.
 # One file wrappers.R to avoid creating lots of small .R files.
 
-fcoalesce   = function(...) .Call(Ccoalesce, list(...), FALSE)
-setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE)
+fcoalesce   = function(..., nan=NA) .Call(Ccoalesce, list(...), FALSE, nan_is_na(nan))
+setcoalesce = function(..., nan=NA) .Call(Ccoalesce, list(...), TRUE, nan_is_na(nan))
 
 fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na)
 fcase   = function(..., default=NA) {
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -15586,6 +15586,11 @@ test(2060.154, fcoalesce(list(x)), x)
 test(2060.155, setcoalesce(list(x)), x)
 test(2060.156, setcoalesce(list(x,y,z)), ans)
 test(2060.157, x, ans)  # setcoalesce updated the first item (x) by reference
+# nan parameter, #4567
+test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, 0))
+test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, NaN))
+test(2060.160, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NA), c(1, 2))
+test(2060.161, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NaN), c(1, NaN))
 # factor of different levels
 x = factor(c('a','b',NA,NA,'b'))
 y = factor(c('b','b','a',NA,'b'))
diff --git a/man/coalesce.Rd b/man/coalesce.Rd
@@ -7,10 +7,11 @@ Fill in missing values in a vector by successively pulling from candidate vector
 Written in C, and multithreaded for numeric and factor types.
 }
 \usage{
-  fcoalesce(\dots)
+  fcoalesce(\dots, nan=NA)
 }
 \arguments{
   \item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. }
+  \item{nan}{ Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement (double columns only). }
 }
 \details{
 Factor type is supported only when the factor levels of each item are equal.
@@ -22,7 +23,7 @@ Atomic vector of the same type and length as the first vector, having \code{NA}
 If the first item is \code{NULL}, the result is \code{NULL}.
 }
 \seealso{
-  \code{\link{fifelse}}
+  \code{\link{fifelse}}, \code{\link{nafill}}
 }
 \examples{
 x = c(11L, NA, 13L, NA, 15L, NA)
@@ -31,6 +32,9 @@ z = c(11L, NA, 1L, 14L, NA, NA)
 fcoalesce(x, y, z)
 fcoalesce(list(x,y,z))   # same
 fcoalesce(x, list(y,z))  # same
+x_num = c(NaN, NA_real_, 3.0)
+fcoalesce(x_num, 1)           # default: NaN treated as missing -> c(1, 1, 3)
+fcoalesce(x_num, 1, nan=NaN)  # preserve NaN -> c(NaN, 1, 3)
 }
 \keyword{ data }
 
diff --git a/src/coalesce.c b/src/coalesce.c
@@ -6,10 +6,12 @@
     - The replacement of NAs with non-NA values from subsequent vectors
     - The conditional checks within parallelized loops
 */
-SEXP coalesce(SEXP x, SEXP inplaceArg) {
+SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) {
   if (TYPEOF(x)!=VECSXP) internal_error(__func__, "input is list(...) at R level"); // # nocov
   if (!IS_TRUE_OR_FALSE(inplaceArg)) internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); // # nocov
+  if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) internal_error(__func__, "argument 'nan_is_na_arg' must be TRUE or FALSE"); // # nocov
   const bool inplace = LOGICAL(inplaceArg)[0];
+  const bool nan_is_na = LOGICAL(nan_is_na_arg)[0];
   const bool verbose = GetVerbose();
   int nprotect = 0;
   if (length(x)==0 || isNull(VECTOR_ELT(x,0))) return R_NilValue;  // coalesce(NULL, "foo") return NULL even though character type mismatches type NULL
@@ -102,23 +104,44 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
     } else {
       double *xP = REAL(first), finalVal=NA_REAL;
       int k=0;
-      for (int j=0; j<nval; ++j) {
-        SEXP item = VECTOR_ELT(x, j+off);
-        if (length(item)==1) {
-          double tt = REAL(item)[0];
-          if (ISNAN(tt)) continue;
-          finalVal = tt;
-          break;
+      if (nan_is_na) {
+        for (int j=0; j<nval; ++j) {
+          SEXP item = VECTOR_ELT(x, j+off);
+          if (length(item)==1) {
+            double tt = REAL(item)[0];
+            if (ISNAN(tt)) continue;
+            finalVal = tt;
+            break;
+          }
+          valP[k++] = REAL_RO(item);
+        }
+        const bool final = !ISNAN(finalVal);
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
+        for (int i=0; i<nrow; ++i) {
+          double val=xP[i];
+          if (!ISNAN(val)) continue;
+          int j=0; while (ISNAN(val) && j<k) val=((double *)valP[j++])[i];
+          if (!ISNAN(val)) xP[i]=val; else if (final) xP[i]=finalVal;
+        }
+      } else {
+        for (int j=0; j<nval; ++j) {
+          SEXP item = VECTOR_ELT(x, j+off);
+          if (length(item)==1) {
+            double tt = REAL(item)[0];
+            if (ISNA(tt)) continue;
+            finalVal = tt;
+            break;
+          }
+          valP[k++] = REAL_RO(item);
+        }
+        const bool final = !ISNA(finalVal);
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
+        for (int i=0; i<nrow; ++i) {
+          double val=xP[i];
+          if (!ISNA(val)) continue;
+          int j=0; while (ISNA(val) && j<k) val=((double *)valP[j++])[i];
+          if (!ISNA(val)) xP[i]=val; else if (final) xP[i]=finalVal;
         }
-        valP[k++] = REAL_RO(item);
-      }
-      const bool final = !ISNAN(finalVal);
-      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
-      for (int i=0; i<nrow; ++i) {
-        double val=xP[i];
-        if (!ISNAN(val)) continue;
-        int j=0; while (ISNAN(val) && j<k) val=((double *)valP[j++])[i];
-        if (!ISNAN(val)) xP[i]=val; else if (final) xP[i]=finalVal;
       }
     }
   } break;
diff --git a/src/data.table.h b/src/data.table.h
@@ -251,7 +251,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
 SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check);
 
 // coalesce.c
-SEXP coalesce(SEXP x, SEXP inplace);
+SEXP coalesce(SEXP x, SEXP inplace, SEXP nan_is_na_arg);
 
 // utils.c
 bool within_int32_repres(double x);

Original file line number	Diff line number	Diff line change
`@@ -7,10 +7,11 @@ Fill in missing values in a vector by successively pulling from candidate vector`
`7`	`7`	`Written in C, and multithreaded for numeric and factor types.`
`8`	`8`	`}`
`9`	`9`	`\usage{`
`10`		`- fcoalesce(\dots)`
	`10`	`+ fcoalesce(\dots, nan=NA)`
`11`	`11`	`}`
`12`	`12`	`\arguments{`
`13`	`13`	`\item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. }`
	`14`	`+ \item{nan}{ Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement (double columns only). }`
`14`	`15`	`}`
`15`	`16`	`\details{`
`16`	`17`	`Factor type is supported only when the factor levels of each item are equal.`
`@@ -22,7 +23,7 @@ Atomic vector of the same type and length as the first vector, having \code{NA}`
`22`	`23`	`If the first item is \code{NULL}, the result is \code{NULL}.`
`23`	`24`	`}`
`24`	`25`	`\seealso{`
`25`		`- \code{\link{fifelse}}`
	`26`	`+ \code{\link{fifelse}}, \code{\link{nafill}}`
`26`	`27`	`}`
`27`	`28`	`\examples{`
`28`	`29`	`x = c(11L, NA, 13L, NA, 15L, NA)`
`@@ -31,6 +32,9 @@ z = c(11L, NA, 1L, 14L, NA, NA)`
`31`	`32`	`fcoalesce(x, y, z)`
`32`	`33`	`fcoalesce(list(x,y,z)) # same`
`33`	`34`	`fcoalesce(x, list(y,z)) # same`
	`35`	`+x_num = c(NaN, NA_real_, 3.0)`
	`36`	`+fcoalesce(x_num, 1) # default: NaN treated as missing -> c(1, 1, 3)`
	`37`	`+fcoalesce(x_num, 1, nan=NaN) # preserve NaN -> c(NaN, 1, 3)`
`34`	`38`	`}`
`35`	`39`	`\keyword{ data }`
`36`	`40`