Allow double-integer64 joins when double is in (integer32 , integer64] range (#6626)

MichaelChirico · ben-schwen · web-flow · commit 96e89fad997c · 2024-12-06T08:08:14.000-08:00
* Allow double-integer64 joins when double is in (integer32 , integer64] range

* rename R-side argument for readability?

* Totally drop isReallyReal, just use isRealReallyInt with flavors for 32/64

* Error: flip result when changing to isRealReallyInt*

* Further simplify -- first* helpers not needed if we just return bool

* logical inversion

* Subtle difference vs isReallyReal (type check)

* Same subtle difference in .prepareFastSubset

* .prepareFastSubset fix

* fix test output

* Add duplicate bug number to NEWS

* amend a new call site for isReallyReal

* fix botched merge

* add codecov test

* add non exported function

* isRealReallyInt -&gt; fitsInInt

---------

Co-authored-by: Benjamin Schwendinger &lt;benjaminschwe@gmail.com&gt;
diff --git a/NEWS.md b/NEWS.md
@@ -117,6 +117,8 @@ rowwiseDT(
 
 15. The auto-printing suppression in `knitr` documents is now done by implementing a method for `knit_print` instead of looking up the call stack, [#6589](https://github.com/Rdatatable/data.table/pull/6589). Thanks to @jangorecki for the report [#6509](https://github.com/Rdatatable/data.table/issues/6509) and @aitap for the fix.
 
+16. Joins of `integer64` and `double` columns succeed when the `double` column has lossless `integer64` representation, [#4167](https://github.com/Rdatatable/data.table/issues/4167) and [#6625](https://github.com/Rdatatable/data.table/issues/6625). Previously, this only worked when the double column had lossless _32-bit_ integer representation. Thanks @MichaelChirico for the reports and fix.
+
 ## NOTES
 
 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181).
diff --git a/R/IDateTime.R b/R/IDateTime.R
@@ -99,9 +99,9 @@ round.IDate = function(x, digits=c("weeks", "months", "quarters", "years"), ...)
   # TODO: investigate Ops.IDate method a la Ops.difftime
   if (inherits(e1, "difftime") || inherits(e2, "difftime"))
     internal_error("difftime objects may not be added to IDate, but Ops dispatch should have intervened to prevent this") # nocov
-  if (isReallyReal(e1) || isReallyReal(e2)) {
+  # IDate doesn't support fractional days; revert to base Date
+  if ((is.double(e1) && !fitsInInt32(e1)) || (is.double(e2) && !fitsInInt32(e2))) {
     return(`+.Date`(e1, e2))
-    # IDate doesn't support fractional days; revert to base Date
   }
   if (inherits(e1, "Date") && inherits(e2, "Date"))
     stopf("binary + is not defined for \"IDate\" objects")
@@ -120,7 +120,7 @@ round.IDate = function(x, digits=c("weeks", "months", "quarters", "years"), ...)
   if (inherits(e2, "difftime"))
     internal_error("difftime objects may not be subtracted from IDate, but Ops dispatch should have intervened to prevent this") # nocov
 
-  if ( isReallyReal(e2) ) {
+  if ( is.double(e2) && !fitsInInt32(e2) ) {
     # IDate deliberately doesn't support fractional days so revert to base Date
     return(base::`-.Date`(as.Date(e1), e2))
     # can't call base::.Date directly (last line of base::`-.Date`) as tried in PR#3168 because
diff --git a/R/bmerge.R b/R/bmerge.R
@@ -4,7 +4,7 @@ mergeType = function(x) {
   ans = typeof(x)
   if      (ans=="integer") { if (is.factor(x))             ans = "factor"    }
   else if (ans=="double")  { if (inherits(x, "integer64")) ans = "integer64" }
-  # do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides
+  # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides
   # are int-as-double, and ii) to save calling it until we really need it
   ans
 }
@@ -103,23 +103,23 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
     if (x_merge_type=="integer64" || i_merge_type=="integer64") {
       nm = c(iname, xname)
       if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) }  # w is which to coerce
-      if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) {
-        if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L])
+      if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) {
+        if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which has integer64 representation, e.g. no fractions)" else "", nm[2L])
         set(w, j=wc, value=bit64::as.integer64(w[[wc]]))
-      } else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L])
+      } else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L])
     } else {
       # just integer and double left
       ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602
       if (i_merge_type=="double") {
         coerce_x = FALSE
-        if (!isReallyReal(i[[icol]])) {
+        if (fitsInInt32(i[[icol]])) {
           coerce_x = TRUE
           # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
           # we've always coerced to int and returned int, for convenience.
           if (length(ic_idx)>1L) {
             xc_idx = xcols[ic_idx]
             for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
-              if (isReallyReal(x[[xb]])) {
+              if (!fitsInInt32(x[[xb]])) {
                 coerce_x = FALSE
                 break
               }
diff --git a/R/data.table.R b/R/data.table.R
@@ -3241,9 +3241,9 @@ is_constantish = function(q, check_singleton=FALSE) {
       if (length(RHS) != nrow(x)) stopf("RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %%in%% instead.", operator, length(RHS), nrow(x))
       return(NULL) # DT[colA == colB] regular element-wise vector scan
     }
-    if ( mode(x[[col]]) != mode(RHS) ||                # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS!isReallyReal are optimized (both sides mode 'numeric')
-         is.factor(x[[col]])+is.factor(RHS) == 1L ||   # but factor is also mode 'numeric' so treat that separately
-         is.integer(x[[col]]) && isReallyReal(RHS) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type
+    if ( (mode(x[[col]]) != mode(RHS)) ||                # mode() so that doubleLHS/integerRHS and integerLHS/doubleRHS&fitsInInt32 are optimized (both sides mode 'numeric')
+         (is.factor(x[[col]])+is.factor(RHS) == 1L) ||   # but factor is also mode 'numeric' so treat that separately
+         (is.integer(x[[col]]) && is.double(RHS) && !fitsInInt32(RHS)) ) { # and if RHS contains fractions then don't optimize that as bmerge truncates the fractions to match to the target integer type
       # re-direct non-matching type cases to base R, as data.table's binary
       # search based join is strict in types. #957, #961 and #1361
       # the mode() checks also deals with NULL since mode(NULL)=="NULL" and causes this return, as one CRAN package (eplusr 0.9.1) relies on
diff --git a/R/wrappers.R b/R/wrappers.R
@@ -17,7 +17,7 @@ colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(Ccoln
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
-isRealReallyInt = function(x) .Call(CisRealReallyIntR, x)
-isReallyReal = function(x) .Call(CisReallyReal, x)
+fitsInInt32 = function(x) .Call(CfitsInInt32R, x)
+fitsInInt64 = function(x) .Call(CfitsInInt64R, x)
 
 coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -56,8 +56,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   internal_error = data.table:::internal_error
   is_na = data.table:::is_na
   is.sorted = data.table:::is.sorted
-  isReallyReal = data.table:::isReallyReal
-  isRealReallyInt = data.table:::isRealReallyInt
+  fitsInInt32 = data.table:::fitsInInt32
+  fitsInInt64 = data.table:::fitsInInt64
   is_utc = data.table:::is_utc
   melt.data.table = data.table:::melt.data.table  # for test 1953.4
   messagef = data.table:::messagef
@@ -7356,22 +7356,23 @@ if (test_bit64) {
 X = list(a = 4:1, b=runif(4))
 test(1513, setkey(as.data.table(X), a), setDT(X, key="a"))
 
-# Adding tests for `isReallyReal`
+# Adding tests for `fitsInInt32`
 x = as.numeric(sample(10))
-test(1514.1, isReallyReal(x), 0L)
+test(1514.1, fitsInInt32(x), TRUE)
 x = as.numeric(sample(c(1:5, NA)))
-test(1514.2, isReallyReal(x), 0L) # NAs in numeric can be coerced to integer NA without loss
+test(1514.2, fitsInInt32(x), TRUE) # NAs in numeric can be coerced to integer NA without loss
 x = c(1:2, NaN, NA)
-test(1514.3, isReallyReal(x), 3L)
+test(1514.3, fitsInInt32(x), FALSE)
 x = c(1:2, Inf, NA)
-test(1514.4, isReallyReal(x), 3L)
+test(1514.4, fitsInInt32(x), FALSE)
 x = c(1:2, -Inf, NA)
-test(1514.5, isReallyReal(x), 3L)
+test(1514.5, fitsInInt32(x), FALSE)
 x = runif(2)
-test(1514.6, isReallyReal(x), 1L)
+test(1514.6, fitsInInt32(x), FALSE)
 x = numeric()
-test(1514.7, isReallyReal(x), 0L)
-test(1514.8, isReallyReal(9L), 0L)
+test(1514.7, fitsInInt32(x), TRUE)
+test(1514.8, fitsInInt32(9L), FALSE) # b/c not double input
+test(1514.9, fitsInInt64(9L), FALSE)
 
 # #1091
 options(datatable.prettyprint.char = 5L)
@@ -15150,19 +15151,19 @@ if (test_bit64) {
                 output = "Coercing integer column x.int to type integer64 to match type of i.int64")
   test(2044.67, dt1[dt2, ..cols, on="doubleInt==int64", nomatch=0L, verbose=TRUE],
                 ans,
-                output = "Coercing double column x.doubleInt (which contains no fractions) to type integer64 to match type of i.int64")
+                output = "Coercing double column x.doubleInt (which has integer64 representation, e.g. no fractions) to type integer64 to match type of i.int64")
   test(2044.68, dt1[dt2, ..cols, on="realDouble==int64", nomatch=0L, verbose=TRUE],
-                error="Incompatible join types: i.int64 is type integer64 but x.realDouble is type double and contains fractions")
+                error="Incompatible join types: i.int64 is type integer64 but x.realDouble is type double and cannot be coerced to integer64")
   # int64 in x
   test(2044.69, dt1[dt2, ..cols, on="int64==int", nomatch=0L, verbose=TRUE],
                 ans<-data.table(x.int=1:5, x.doubleInt=as.double(1:5), x.realDouble=c(0.5,1.0,1.5,2.0,2.5), x.int64=as.integer64(1:5),
                                 i.int=1:5, i.doubleInt=as.double(1:5), i.realDouble=c(0.5,1.0,1.5,2.0,2.5), i.int64=as.integer64(c(1:4, 3000000000))),
                 output = "Coercing integer column i.int to type integer64 to match type of x.int64")
   test(2044.70, dt1[dt2, ..cols, on="int64==doubleInt", nomatch=0L, verbose=TRUE],
                 ans,
-                output = "Coercing double column i.doubleInt (which contains no fractions) to type integer64 to match type of x.int64")
+                output = "Coercing double column i.doubleInt (which has integer64 representation, e.g. no fractions) to type integer64 to match type of x.int64")
   test(2044.71, dt1[dt2, ..cols, on="int64==realDouble", nomatch=0L, verbose=TRUE],
-                error="Incompatible join types: x.int64 is type integer64 but i.realDouble is type double and contains fractions")
+                error="Incompatible join types: x.int64 is type integer64 but i.realDouble is type double and cannot be coerced to integer64")
 }
 # coercion of all-NA
 dt1 = data.table(a=1,  b=NA_character_)
@@ -17603,18 +17604,18 @@ test(2204, as.data.table(mtcars, keep.rownames='model', key='model'),
 
 # 2205 tested nanotime moved to other.Rraw 27, #5516
 
-# isRealReallyInt, #3966
-test(2206.01, isRealReallyInt(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE)
-test(2206.02, isRealReallyInt(2147483648.0), FALSE) # >INT_MAX
-test(2206.03, isRealReallyInt(-2147483648.0), FALSE) # <=INT_MIN since INT_MIN==NA_integer_
-test(2206.04, isRealReallyInt(c(5,-5,2147483648)), FALSE)  # test real last position
-test(2206.05, isRealReallyInt(NaN), FALSE)
-test(2206.06, isRealReallyInt(+Inf), FALSE)
-test(2206.07, isRealReallyInt(-Inf), FALSE)
-test(2206.08, isRealReallyInt(0.1), FALSE)
-test(2206.09, isRealReallyInt(numeric()), TRUE)
-test(2206.10, isRealReallyInt(9L), FALSE) # must be type double
-test(2206.11, isRealReallyInt(integer()), FALSE)
+# fitsInInt32, #3966
+test(2206.01, fitsInInt32(c(-2147483647.0, NA, 0.0, 2147483647.0)), TRUE)
+test(2206.02, fitsInInt32(2147483648.0), FALSE) # >INT_MAX
+test(2206.03, fitsInInt32(-2147483648.0), FALSE) # <=INT_MIN since INT_MIN==NA_integer_
+test(2206.04, fitsInInt32(c(5,-5,2147483648)), FALSE)  # test real last position
+test(2206.05, fitsInInt32(NaN), FALSE)
+test(2206.06, fitsInInt32(+Inf), FALSE)
+test(2206.07, fitsInInt32(-Inf), FALSE)
+test(2206.08, fitsInInt32(0.1), FALSE)
+test(2206.09, fitsInInt32(numeric()), TRUE)
+test(2206.10, fitsInInt32(9L), FALSE) # must be type double
+test(2206.11, fitsInInt32(integer()), FALSE)
 
 # dcast supports complex value to cast, #4855
 DT = CJ(x=1:3, y=letters[1:2])
@@ -20664,3 +20665,15 @@ test(2299.09, format_list_item(data.table(a=numeric(), b=numeric())), output="<d
 test(2299.10, data.table(a=1),                                        output="a\n1: *1")
 test(2299.11, data.table(a=list(data.frame(b=1))),                    output="a\n1: <data.frame[1x1]>")
 test(2299.12, data.table(a=list(data.table(b=1))),                    output="a\n1: <data.table[1x1]>")
+
+if (test_bit64) {
+  # Join to integer64 doesn't require integer32 representation, just integer64, #6625
+  i64_val = .Machine$integer.max + 1
+  DT1 = data.table(id = as.integer64(i64_val))
+  DT2 = data.table(id = i64_val)
+  test(2300.1, DT1[DT2, on='id', verbose=TRUE], DT2, output="has integer64 representation")
+  test(2300.2, DT2[DT1, on='id', verbose=TRUE], DT1, output="has integer64 representation")
+  DT2[, id := id+.01]
+  test(2300.3, DT1[DT2, on='id'], error="Incompatible join types")
+  test(2300.4, DT2[DT1, on='id'], error="Incompatible join types")
+}
diff --git a/src/between.c b/src/between.c
@@ -30,8 +30,8 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S
   const bool verbose = GetVerbose();
 
   if (isInteger(x)) {
-    if ((isInteger(lower) || isRealReallyInt(lower)) &&
-        (isInteger(upper) || isRealReallyInt(upper))) { // #3517 coerce to num to int when possible
+    if ((isInteger(lower) || fitsInInt32(lower)) &&
+        (isInteger(upper) || fitsInInt32(upper))) { // #3517 coerce to num to int when possible
       if (!isInteger(lower)) {
         lower = PROTECT(coerceVector(lower, INTSXP)); nprotect++;
       }
diff --git a/src/data.table.h b/src/data.table.h
@@ -243,9 +243,10 @@ SEXP coalesce(SEXP x, SEXP inplace);
 // utils.c
 bool within_int32_repres(double x);
 bool within_int64_repres(double x);
-bool isRealReallyInt(SEXP x);
-SEXP isRealReallyIntR(SEXP x);
-SEXP isReallyReal(SEXP x);
+bool fitsInInt32(SEXP x);
+SEXP fitsInInt32R(SEXP x);
+bool fitsInInt64(SEXP x);
+SEXP fitsInInt64R(SEXP x);
 bool allNA(SEXP x, bool errorForBadType);
 SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent);
 bool INHERITS(SEXP x, SEXP char_);
diff --git a/src/forder.c b/src/forder.c
@@ -595,12 +595,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
       if (INHERITS(x, char_integer64)) {
         range_i64((int64_t *)REAL(x), nrow, &min, &max, &na_count);
       } else {
-        if (verbose && INHERITS(x, char_Date) && INTEGER(isReallyReal(x))[0]==0) {
-          Rprintf(_("\n*** Column %d passed to forder is a date stored as an 8 byte double but no fractions are present. Please consider a 4 byte integer date such as IDate to save space and time.\n"), col+1);
-          // Note the (slightly expensive) isReallyReal will only run when verbose is true. Prefix '***' just to make it stand out in verbose output
+        if (verbose && INHERITS(x, char_Date) && fitsInInt32(x)) {
+          // Note the (slightly expensive) fitsInInt32 will only run when verbose is true. Prefix '***' just to make it stand out in verbose output
           // In future this could be upgraded to option warning. But I figured that's what we use verbose to do (to trace problems and look for efficiencies).
           // If an automatic coerce is desired (see discussion in #1738) then this is the point to do that in this file. Move the INTSXP case above to be
           // next, do the coerce of Date to integer now to a tmp, and then let this case fall through to INTSXP in the same way as CPLXSXP falls through to REALSXP.
+          Rprintf(_("\n*** Column %d passed to forder is a date stored as an 8 byte double but no fractions are present. Please consider a 4 byte integer date such as IDate to save space and time.\n"), col+1);
         }
         range_d(REAL(x), nrow, &min, &max, &na_count, &infnan_count);
         if (min==0 && na_count<nrow) { min=3; max=4; } // column contains no finite numbers and is not-all NA; create dummies to yield positive min-2 later
diff --git a/src/frollR.c b/src/frollR.c
@@ -229,7 +229,7 @@ SEXP frollapplyR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP align, SEXP rho) {
 
   if (!isInteger(k)) {
     if (isReal(k)) {
-      if (isRealReallyInt(k)) {
+      if (fitsInInt32(k)) {
         SEXP ik = PROTECT(coerceVector(k, INTSXP)); protecti++;
         k = ik;
       } else {
diff --git a/src/init.c b/src/init.c
@@ -101,8 +101,8 @@ R_CallMethodDef callMethods[] = {
 {"Cshift", (DL_FUNC) &shift, -1},
 {"Ctranspose", (DL_FUNC) &transpose, -1},
 {"CanyNA", (DL_FUNC) &anyNA, -1},
-{"CisReallyReal", (DL_FUNC) &isReallyReal, -1},
-{"CisRealReallyIntR", (DL_FUNC) &isRealReallyIntR, -1},
+{"CfitsInInt32R", (DL_FUNC) &fitsInInt32R, -1},
+{"CfitsInInt64R", (DL_FUNC) &fitsInInt64R, -1},
 {"Csetlevels", (DL_FUNC) &setlevels, -1},
 {"Crleid", (DL_FUNC) &rleid, -1},
 {"Cgmedian", (DL_FUNC) &gmedian, -1},
diff --git a/src/utils.c b/src/utils.c
@@ -11,30 +11,40 @@ bool within_int64_repres(double x) {
   return R_FINITE(x) && x <= (double)INT64_MAX && x >= (double)INT64_MIN;
 }
 
-static R_xlen_t firstNonInt(SEXP x) {
+// used to error if not passed type double but this needed extra is.double() calls in calling R code
+// which needed a repeat of the argument. Hence simpler and more robust to return false when not type double.
+bool fitsInInt32(SEXP x) {
+  if (!isReal(x))
+    return false;
   R_xlen_t n=xlength(x), i=0;
   const double *dx = REAL(x);
   while (i<n &&
          ( ISNA(dx[i]) ||
          (within_int32_repres(dx[i]) && dx[i]==(int)(dx[i])))) {
     i++;
   }
-  return i==n ? 0 : i+1;
+  return i==n;
 }
 
-bool isRealReallyInt(SEXP x) {
-  return isReal(x) ? firstNonInt(x)==0 : false;
-  // used to error if not passed type double but this needed extra is.double() calls in calling R code
-  // which needed a repeat of the argument. Hence simpler and more robust to return false when not type double.
+SEXP fitsInInt32R(SEXP x) {
+  return ScalarLogical(fitsInInt32(x));
 }
 
-SEXP isRealReallyIntR(SEXP x) {
-  return ScalarLogical(isRealReallyInt(x));
+bool fitsInInt64(SEXP x) {
+  if (!isReal(x))
+    return false;
+  R_xlen_t n=xlength(x), i=0;
+  const double *dx = REAL(x);
+  while (i<n &&
+         ( ISNA(dx[i]) ||
+         (within_int64_repres(dx[i]) && dx[i]==(int64_t)(dx[i])))) {
+    i++;
+  }
+  return i==n;
 }
 
-SEXP isReallyReal(SEXP x) {
-  return ScalarInteger(isReal(x) ? firstNonInt(x) : 0);
-  // return the 1-based location of first element which is really real (i.e. not an integer) otherwise 0 (false)
+SEXP fitsInInt64R(SEXP x) {
+  return ScalarLogical(fitsInInt64(x));
 }
 
 bool allNA(SEXP x, bool errorForBadType) {
@@ -125,7 +135,7 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) {
       } else
         ricols = cols;
     } else if (isReal(cols)) {
-      if (!isRealReallyInt(cols))
+      if (!fitsInInt32(cols))
         error(_("argument specifying columns is type 'double' and one or more items in it are not whole integers"));
       ricols = PROTECT(coerceVector(cols, INTSXP)); protecti++;
     }