Merge branch 'master' into between-int64

MichaelChirico · web-flow · commit d2fa90f8b4b8 · 2025-07-22T10:18:25.000-07:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -153,7 +153,7 @@ if (getRversion() >= "3.6.0") {
 
 # IDateTime support:
 export(as.IDate,as.ITime,IDateTime)
-export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year,yearmon,yearqtr)
+export(second,minute,hour,yday,wday,mday,week,isoweek,isoyear,month,quarter,year,yearmon,yearqtr)
 
 S3method("[", ITime)
 S3method("+", IDate)
diff --git a/NEWS.md b/NEWS.md
@@ -10,7 +10,16 @@
 
 ### NEW FEATURES
 
-1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
+1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also matches `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
+
+    ```r
+    DT = data.table(a=c(1L, 2L, 1L), b=c(3L, 1L, 2L))
+    sort_by(DT, ~a + b)
+    #    a b
+    # 1: 1 2
+    # 2: 1 3
+    # 3: 2 1
+    ```
 
 2. `melt()` now supports using `patterns()` with `id.vars`, [#6867](https://github.com/Rdatatable/data.table/issues/6867). Thanks to Toby Dylan Hocking for the suggestion and PR.
 
@@ -56,6 +65,10 @@
 
 13. New `mergelist()` and `setmergelist()` similarly work _a la_ `Reduce()` to recursively merge a `list` of data.tables, [#599](https://github.com/Rdatatable/data.table/issues/599). Different join modes (_left_, _inner_, _full_, _right_, _semi_, _anti_, and _cross_) are supported through the `how` argument; duplicate handling goes through the `mult` argument. `setmergelist()` carefully avoids copies where one is not needed, e.g. in a 1:1 left join. Thanks Patrick Nicholson for the FR (in 2013!), @jangorecki for the PR, and @MichaelChirico for extensive reviews and fine-tuning.
 
+14. `fcoalesce()` and `setcoalesce()` gain a `nan` argument to control whether `NaN` values should be treated as missing (`nan=NA`, the default) or non-missing (`nan=NaN`), [#4567](https://github.com/Rdatatable/data.table/issues/4567). This provides full compatibility with `nafill()` behavior. Thanks to @ethanbsmith for the feature request and @Mukulyadav2004 for the implementation.
+
+15. New function `isoyear()` has been implemented as a complement to `isoweek()`, returning the ISO 8601 year corresponding to a given date, [#7154](https://github.com/Rdatatable/data.table/issues/7154). Thanks to @ben-schwen and @MichaelChirico for the suggestion and @venom1204 for the implementation.
+
 ### BUG FIXES
 
 1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.
@@ -86,6 +99,8 @@
 
 14. Filling columns of class Date with POSIXct (and vice versa) using `shift()` now yields a clear, informative error message specifying the class mismatch, [#5218](https://github.com/Rdatatable/data.table/issues/5218). Thanks @ashbaldry for the report and @ben-schwen for the fix.
 
+15. `split.data.table()` output list elements retain the S3 class of the generating data.table, e.g. in `l=split(x, ...)` if `x` has class `my_class`, so will `l[[1]]` and so on, [#7105](https://github.com/Rdatatable/data.table/issues/7105). Thanks @m-muecke for the bug report and @MichaelChirico for the fix.
+
 ### NOTES
 
 1. The following in-progress deprecations have proceeded:
diff --git a/R/IDateTime.R b/R/IDateTime.R
@@ -355,7 +355,7 @@ isoweek = function(x) as.integer(format(as.IDate(x), "%V"))
 #  nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
 #  year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
 #  1L + (nearest_thurs - year_start) %/% 7L
-
+isoyear = function(x) as.integer(format(as.IDate(x), "%G"))
 
 month   = function(x) convertDate(as.IDate(x), "month")
 quarter = function(x) convertDate(as.IDate(x), "quarter")
diff --git a/R/data.table.R b/R/data.table.R
@@ -2491,7 +2491,7 @@ Ops.data.table = function(e1, e2 = NULL)
 }
 
 split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) {
-  if (!is.data.table(x)) stopf("x argument must be a data.table")
+  if (!is.data.table(x)) internal_error("x argument to split.data.table must be a data.table") # nocov
   stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by),  is.logical(flatten))
   # split data.frame way, using `f` and not `by` argument
   if (!missing(f)) {
@@ -2566,8 +2566,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
   setattr(ll, "names", nm)
   # handle nested split
   if (flatten || length(by) == 1L) {
-    for (x in ll) .Call(C_unlock, x)
-    lapply(ll, setDT)
+    for (xi in ll) .Call(C_unlock, xi)
+    out = lapply(ll, setDT)
+    # TODO(#2000): just let setDT handle this
+    if (!identical(old_class <- class(x), c("data.table", "data.frame"))) for (xi in out) setattr(xi, "class", old_class)
+    out
     # alloc.col could handle DT in list as done in: c9c4ff80bdd4c600b0c4eff23b207d53677176bd
   } else if (length(by) > 1L) {
     lapply(ll, split.data.table, drop=drop, by=by[-1L], sorted=sorted, keep.by=keep.by, flatten=flatten)
diff --git a/R/wrappers.R b/R/wrappers.R
@@ -2,8 +2,8 @@
 # Very small (e.g. one line) R functions that just call C.
 # One file wrappers.R to avoid creating lots of small .R files.
 
-fcoalesce   = function(...) .Call(Ccoalesce, list(...), FALSE)
-setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE)
+fcoalesce   = function(..., nan=NA) .Call(Ccoalesce, list(...), FALSE, nan_is_na(nan))
+setcoalesce = function(..., nan=NA) .Call(Ccoalesce, list(...), TRUE, nan_is_na(nan))
 
 fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na)
 fcase   = function(..., default=NA) {
diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw
@@ -114,8 +114,9 @@ test(3.02, setnafill(list(copy(x)), "locf", fill=0L), list(x))
 test(3.03, setnafill(x, "locf"), error="in-place update is supported only for list")
 test(3.04, nafill(letters[1:5], fill=0), error="must be numeric type, or list/data.table")
 test(3.05, setnafill(list(letters[1:5]), fill=0), error="must be numeric type, or list/data.table")
-test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1")
-test(3.07, nafill(x, fill="asd"), x, warning=c("Coercing.*character.*integer","NAs introduced by coercion"))
+test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1.*fcoalesce")
+test(3.07, nafill(x, "locf", fill=1:2), error="fill must be a vector of length 1.*x\\.$")
+test(3.08, nafill(x, fill="asd"), x, warning=c("Coercing.*character.*integer","NAs introduced by coercion"))
 
 # colnamesInt helper
 dt = data.table(a=1, b=2, d=3)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -74,7 +74,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   setfrev = data.table:::setfrev
   shallow = data.table:::shallow # until exported
   .shallow = data.table:::.shallow
-  split.data.table = data.table:::split.data.table
   stopf = data.table:::stopf
   test = data.table:::test
   uniqlengths = data.table:::uniqlengths
@@ -9764,6 +9763,14 @@ test(1639.141, all(sapply(dtL, truelength) > 1000))
 dt <- data.table(x = factor("a"), y = 1)
 test(1639.142, x = split(dt, by = "x"), y = list(a = dt))
 test(1639.143, x = split(dt, by = "y"), y = list(`1` = dt))
+
+# retain a custom class after splitting, #7105
+DT = data.table(x=letters[1:10], y=1:10, z=rnorm(10))
+setattr(DT, "class", c("my_class", class(DT)))
+test(1639.144, "my_class" %in% unlist(lapply(split(DT, by="x"), class)))
+test(1639.145, "my_class" %in% unlist(lapply(split(DT, ~x), class)))
+test(1639.146, "my_class" %in% unlist(lapply(split(DT, by=c("x", "y")), class)))
+test(1639.147, "my_class" %in% unlist(lapply(split(DT, ~x+y), class)))
 rm_all()
 
 # allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax
@@ -14277,7 +14284,7 @@ test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2,
 test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`')
 DF = as.data.frame(DT)
 test(1984.27, identical(shallow(DF), DF))  # shallow (which is not exported) works on DF from v1.14.2. identical() to force checking the selfref attribute for #5286.
-test(1984.28, split.data.table(DF), error='argument must be a data.table')
+# 1984.28 was a coverage test converted to 'nocov' of an internal_error instead
 test(1984.29, split(DT, by='a', f='a'), error="passing 'f' argument together with 'by' is not allowed")
 test(1984.30, split(DT), error="Either 'by' or 'f' argument must be supplied")
 setnames(DT, '.ll.tech.split')
@@ -15583,6 +15590,11 @@ test(2060.154, fcoalesce(list(x)), x)
 test(2060.155, setcoalesce(list(x)), x)
 test(2060.156, setcoalesce(list(x,y,z)), ans)
 test(2060.157, x, ans)  # setcoalesce updated the first item (x) by reference
+# nan parameter, #4567
+test(2060.158, fcoalesce(c(NA_real_, NaN), 0, nan=NA), c(0, 0))
+test(2060.159, fcoalesce(c(NA_real_, NaN), 0, nan=NaN), c(0, NaN))
+test(2060.160, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NA), c(1, 2))
+test(2060.161, fcoalesce(c(NA_real_, NaN), c(1, 2), nan=NaN), c(1, NaN))
 # factor of different levels
 x = factor(c('a','b',NA,NA,'b'))
 y = factor(c('b','b','a',NA,'b'))
@@ -21541,3 +21553,11 @@ f = tempfile()
 writeLines(c('a', rep('0x1.ffffp0', 10000L), '0x1.ff\x9fp0', rep('0x1.ffffp0', 20000L)), f)
 test(2334, names(fread(f)), "a")
 unlink(f)
+
+# Tests for new isoyear() helper (complement to isoweek) #7154
+test(2335.1, isoyear(as.IDate("2019-12-30")), 2020L)  # End of year edge case
+test(2335.2, isoyear(as.IDate("2016-01-01")), 2015L)  # Start of year edge case
+test(2335.3, isoyear(as.IDate("2023-08-15")), 2023L)  # Normal mid-year case
+test(2335.4, isoyear(as.IDate(c("2019-12-30", "2016-01-01", "2023-08-15"))),c(2020L, 2015L, 2023L))
+test(2335.5, isoyear("2019-12-30"), 2020L)
+test(2335.6, isoyear(as.Date("2019-12-30")), 2020L)
diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd
@@ -38,6 +38,7 @@
 \alias{mday}
 \alias{week}
 \alias{isoweek}
+\alias{isoyear}
 \alias{month}
 \alias{quarter}
 \alias{year}
@@ -92,6 +93,7 @@ wday(x)
 mday(x)
 week(x)
 isoweek(x)
+isoyear(x)     
 month(x)
 quarter(x)
 year(x)
@@ -187,6 +189,8 @@ which specify that the first week of the year is the one containing the first Th
 This convention ensures that week boundaries align consistently with year boundaries,
 accounting for both year transitions and varying day counts per week.
 
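+Similarly, \code{isoyear()} returns the ISO 8601 week-numbering year, i.e. the year to which the ISO week of the date belongs. This can differ from the calendar year returned by \code{year()} for dates close to 1 January (see examples).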
+
 }
 
 \value{
@@ -200,7 +204,7 @@ accounting for both year transitions and varying day counts per week.
    \code{itime} in \code{IDate} and \code{ITime} format.
 
    \code{second}, \code{minute}, \code{hour}, \code{yday}, \code{wday},
-   \code{mday}, \code{week}, \code{month}, \code{quarter},
+   \code{mday}, \code{week}, \code{isoweek}, \code{isoyear}, \code{month}, \code{quarter},
    and \code{year} return integer values
    for second, minute, hour, day of year, day of week,
-   day of month, week, month, quarter, and year, respectively.
+   day of month, week, ISO week, ISO year, month, quarter, and year, respectively.
@@ -281,6 +285,17 @@ round(seqdates, "months")
 round(seqtimes, "hours")
 trunc(seqtimes, "hours")
 
+# Examples for isoyear() and isoweek()
+d1 = as.IDate("2019-12-30")
+year(d1)
+isoweek(d1)
+isoyear(d1)
+
+d2 = as.IDate("2016-01-01")
+year(d2)
+isoweek(d2)
+isoyear(d2)
+
 }
 \keyword{utilities}
 
diff --git a/man/coalesce.Rd b/man/coalesce.Rd
@@ -7,10 +7,11 @@ Fill in missing values in a vector by successively pulling from candidate vector
 Written in C, and multithreaded for numeric and factor types.
 }
 \usage{
-  fcoalesce(\dots)
+  fcoalesce(\dots, nan=NA)
 }
 \arguments{
   \item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. }
+  \item{nan}{ Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement (double columns only). }
 }
 \details{
 Factor type is supported only when the factor levels of each item are equal.
@@ -22,7 +23,7 @@ Atomic vector of the same type and length as the first vector, having \code{NA}
 If the first item is \code{NULL}, the result is \code{NULL}.
 }
 \seealso{
-  \code{\link{fifelse}}
+  \code{\link{fifelse}}, \code{\link{nafill}}
 }
 \examples{
 x = c(11L, NA, 13L, NA, 15L, NA)
@@ -31,6 +32,9 @@ z = c(11L, NA, 1L, 14L, NA, NA)
 fcoalesce(x, y, z)
 fcoalesce(list(x,y,z))   # same
 fcoalesce(x, list(y,z))  # same
+x_num = c(NaN, NA_real_, 3.0)
+fcoalesce(x_num, 1)           # default: NaN treated as missing -> c(1, 1, 3)
+fcoalesce(x_num, 1, nan=NaN)  # preserve NaN -> c(NaN, 1, 3)
 }
 \keyword{ data }
 
diff --git a/src/coalesce.c b/src/coalesce.c
@@ -6,10 +6,12 @@
     - The replacement of NAs with non-NA values from subsequent vectors
     - The conditional checks within parallelized loops
 */
-SEXP coalesce(SEXP x, SEXP inplaceArg) {
+SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) {
   if (TYPEOF(x)!=VECSXP) internal_error(__func__, "input is list(...) at R level"); // # nocov
   if (!IS_TRUE_OR_FALSE(inplaceArg)) internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); // # nocov
+  if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) internal_error(__func__, "argument 'nan_is_na_arg' must be TRUE or FALSE"); // # nocov
   const bool inplace = LOGICAL(inplaceArg)[0];
+  const bool nan_is_na = LOGICAL(nan_is_na_arg)[0];
   const bool verbose = GetVerbose();
   int nprotect = 0;
   if (length(x)==0 || isNull(VECTOR_ELT(x,0))) return R_NilValue;  // coalesce(NULL, "foo") return NULL even though character type mismatches type NULL
@@ -102,23 +104,44 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
     } else {
       double *xP = REAL(first), finalVal=NA_REAL;
       int k=0;
-      for (int j=0; j<nval; ++j) {
-        SEXP item = VECTOR_ELT(x, j+off);
-        if (length(item)==1) {
-          double tt = REAL(item)[0];
-          if (ISNAN(tt)) continue;
-          finalVal = tt;
-          break;
+      if (nan_is_na) {
+        for (int j=0; j<nval; ++j) {
+          SEXP item = VECTOR_ELT(x, j+off);
+          if (length(item)==1) {
+            double tt = REAL(item)[0];
+            if (ISNAN(tt)) continue;
+            finalVal = tt;
+            break;
+          }
+          valP[k++] = REAL_RO(item);
+        }
+        const bool final = !ISNAN(finalVal);
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
+        for (int i=0; i<nrow; ++i) {
+          double val=xP[i];
+          if (!ISNAN(val)) continue;
+          int j=0; while (ISNAN(val) && j<k) val=((double *)valP[j++])[i];
+          if (!ISNAN(val)) xP[i]=val; else if (final) xP[i]=finalVal;
+        }
+      } else {
+        for (int j=0; j<nval; ++j) {
+          SEXP item = VECTOR_ELT(x, j+off);
+          if (length(item)==1) {
+            double tt = REAL(item)[0];
+            if (ISNA(tt)) continue;
+            finalVal = tt;
+            break;
+          }
+          valP[k++] = REAL_RO(item);
+        }
+        const bool final = !ISNA(finalVal);
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
+        for (int i=0; i<nrow; ++i) {
+          double val=xP[i];
+          if (!ISNA(val)) continue;
+          int j=0; while (ISNA(val) && j<k) val=((double *)valP[j++])[i];
+          if (!ISNA(val)) xP[i]=val; else if (final) xP[i]=finalVal;
         }
-        valP[k++] = REAL_RO(item);
-      }
-      const bool final = !ISNAN(finalVal);
-      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
-      for (int i=0; i<nrow; ++i) {
-        double val=xP[i];
-        if (!ISNAN(val)) continue;
-        int j=0; while (ISNAN(val) && j<k) val=((double *)valP[j++])[i];
-        if (!ISNAN(val)) xP[i]=val; else if (final) xP[i]=finalVal;
       }
     }
   } break;
diff --git a/src/data.table.h b/src/data.table.h
@@ -251,7 +251,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
 SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP check);
 
 // coalesce.c
-SEXP coalesce(SEXP x, SEXP inplace);
+SEXP coalesce(SEXP x, SEXP inplace, SEXP nan_is_na_arg);
 
 // utils.c
 bool within_int32_repres(double x);
diff --git a/src/mergelist.c b/src/mergelist.c
@@ -32,7 +32,7 @@ void mergeIndexAttrib(SEXP to, SEXP from) {
 }
 
 SEXP cbindlist(SEXP x, SEXP copyArg) {
-  if (!isNewList(x) || isFrame(x))
+  if (!isNewList(x) || isDataFrame(x))
     error(_("'%s' must be a list"), "x");
   bool copy = (bool)LOGICAL(copyArg)[0];
   const bool verbose = GetVerbose();
diff --git a/src/nafill.c b/src/nafill.c
@@ -180,8 +180,13 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
     isInt64[i] = INHERITS(VECTOR_ELT(x, i), char_integer64);
   const void **fillp = (const void **)R_alloc(nx, sizeof(*fillp)); // fill is (or will be) a list of length nx of matching types, scalar values for each column, this pointer points to each of those columns data pointers
   if (hasFill) {
-    if (nx!=length(fill) && length(fill)!=1)
-      error(_("fill must be a vector of length 1 or a list of length of x"));
+    if (nx!=length(fill) && length(fill)!=1) {
+      if (itype == 0) {
+        error(_("fill must be a vector of length 1 or a list of length of x. Consider fcoalesce() to specify element-wise replacements."));
+      } else {
+        error(_("fill must be a vector of length 1 or a list of length of x."));
+      }
+    }
     if (!isNewList(fill)) {
       SEXP fill1 = fill;
       fill = PROTECT(allocVector(VECSXP, nx)); protecti++;
diff --git a/src/utils.c b/src/utils.c
@@ -533,10 +533,8 @@ bool isRectangularList(SEXP x) {
   return isRectangular(x);
 }
 
-// TODO: use isDataFrame (when included in any R release).
-// isDataTable(x) || isFrame(x) || isRectangularList(x)
 bool perhapsDataTable(SEXP x) {
-  return isDataTable(x) || isFrame(x) || isRectangularList(x);
+  return isDataTable(x) || isDataFrame(x) || isRectangularList(x);
 }
 SEXP perhapsDataTableR(SEXP x) {
   return ScalarLogical(perhapsDataTable(x));

Original file line number	Diff line number	Diff line change
`@@ -7,10 +7,11 @@ Fill in missing values in a vector by successively pulling from candidate vector`
`7`	`7`	`Written in C, and multithreaded for numeric and factor types.`
`8`	`8`	`}`
`9`	`9`	`\usage{`
`10`		`- fcoalesce(\dots)`
	`10`	`+ fcoalesce(\dots, nan=NA)`
`11`	`11`	`}`
`12`	`12`	`\arguments{`
`13`	`13`	`\item{\dots}{ A set of same-class vectors. These vectors can be supplied as separate arguments or as a single plain list, data.table or data.frame, see examples. }`
	`14`	`+ \item{nan}{ Either \code{NaN} or \code{NA}; if \code{NaN}, then \code{NaN} is treated as distinct from \code{NA}, otherwise they are treated the same during replacement (double columns only). }`
`14`	`15`	`}`
`15`	`16`	`\details{`
`16`	`17`	`Factor type is supported only when the factor levels of each item are equal.`
`@@ -22,7 +23,7 @@ Atomic vector of the same type and length as the first vector, having \code{NA}`
`22`	`23`	`If the first item is \code{NULL}, the result is \code{NULL}.`
`23`	`24`	`}`
`24`	`25`	`\seealso{`
`25`		`- \code{\link{fifelse}}`
	`26`	`+ \code{\link{fifelse}}, \code{\link{nafill}}`
`26`	`27`	`}`
`27`	`28`	`\examples{`
`28`	`29`	`x = c(11L, NA, 13L, NA, 15L, NA)`
`@@ -31,6 +32,9 @@ z = c(11L, NA, 1L, 14L, NA, NA)`
`31`	`32`	`fcoalesce(x, y, z)`
`32`	`33`	`fcoalesce(list(x,y,z)) # same`
`33`	`34`	`fcoalesce(x, list(y,z)) # same`
	`35`	`+x_num = c(NaN, NA_real_, 3.0)`
	`36`	`+fcoalesce(x_num, 1) # default: NaN treated as missing -> c(1, 1, 3)`
	`37`	`+fcoalesce(x_num, 1, nan=NaN) # preserve NaN -> c(NaN, 1, 3)`
`34`	`38`	`}`
`35`	`39`	`\keyword{ data }`
`36`	`40`