From 4b8028d180a4e831ac14061ddb8ae4ac6b327173 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 23:17:32 +0200 Subject: [PATCH 1/7] add warning for encodings other than utf8 in unique and duplicated --- NEWS.md | 2 ++ R/duplicated.R | 5 +++++ inst/tests/tests.Rraw | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8ae64da71c..45db652f09 100644 --- a/NEWS.md +++ b/NEWS.md @@ -357,6 +357,8 @@ 7. In rare situations a data.table object may lose its internal attribute that holds a self-reference. New helper function `.selfref.ok()` tests just that. It is only intended for technical use cases. See manual for examples. +8. `unique()` and `duplicated()` warn now if columns with encodings other than UTF-8 are present, since these are converted to UTF-8 for comparison, which may lead to unexpected results, [#469](https://github.com/Rdatatable/data.table/issues/469). Thanks to @arunsrinivasan for the request and @ben-schwen for the implementation. + ## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025) 1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070). diff --git a/R/duplicated.R b/R/duplicated.R index e1a04c9822..9a6594ef22 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -13,6 +13,8 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ if (fromLast) f = cumsum(uniqlengths(f, nrow(x))) } else { o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE) + if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) + warningf("Mixed encodings detected. 
Strings were coerced to UTF-8 before duplicated(x).") if (attr(o, 'maxgrpn', exact=TRUE) == 1L) return(rep.int(FALSE, nrow(x))) f = attr(o, "starts", exact=TRUE) if (fromLast) f = cumsum(uniqlengths(f, nrow(x))) @@ -31,6 +33,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(copy(x)) # unique(x)[, col := val] should not alter x, #5932 if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) + if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) { + warningf("Mixed encodings detected. Strings were coerced to UTF-8 before unique(x).") + } if (!is.null(cols)) { x = .shallow(x, c(by, cols), retain.key=TRUE) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e43843cbb6..0ceee0524b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21688,3 +21688,9 @@ d3 = unserialize(serialize(d2, NULL)) test(2340.05, .selfref.ok(d3), FALSE) setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) + +# warn about different encodings in unique and duplicated, #469 +dt = data.table(x=c(iconv("\u00E9","UTF-8","latin1"), "\u00E9")) +test(2341.1, unique(dt), data.table(x="\u00E9"), warning="Mixed encodings.*") +test(2341.2, duplicated(dt), c(FALSE, TRUE), warning="Mixed encodings.*") +test(2341.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9")) From 4e95497bbb014a2c5d06f7e8f64843520ebfc671 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:15:36 +0200 Subject: [PATCH 2/7] add UTF-8 as standard encoding --- NEWS.md | 3 ++- R/fread.R | 2 +- inst/tests/tests.Rraw | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 45db652f09..6bfed74403 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ 5. Negative and missing values of `n` argument of adaptive rolling functions trigger an error. +6. 
`unique()` and `duplicated()` warn now if columns with encodings other than UTF-8 are present, since these are converted to UTF-8 for comparison, which may lead to unexpected results, [#469](https://github.com/Rdatatable/data.table/issues/469). Additionally, `fread(encoding=)` now defaults to `"UTF-8"` (previously `"unknown"`) since most systems already use UTF-8 as the native encoding. Thanks to @arunsrinivasan for the request and @ben-schwen for the implementation. + ### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES 1. `data.table(x=1, <expr>)`, where `<expr>` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`, consistent with `data.table(x=1, <expr>)` where `<expr>` results in an atomic vector, for example `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1`, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname)` to `FALSE`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`. @@ -357,7 +359,6 @@ 7. In rare situations a data.table object may lose its internal attribute that holds a self-reference. New helper function `.selfref.ok()` tests just that. It is only intended for technical use cases. See manual for examples. -8. `unique()` and `duplicated()` warn now if columns with encodings other than UTF-8 are present, since these are converted to UTF-8 for comparison, which may lead to unexpected results, [#469](https://github.com/Rdatatable/data.table/issues/469). Thanks to @arunsrinivasan for the request and @ben-schwen for the implementation. 
## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025) diff --git a/R/fread.R b/R/fread.R index 2ea6cb796a..80ec7890ec 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0ceee0524b..f2c700d81b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7900,9 +7900,7 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in windows, #563 f = testDir("issue_563_fread.txt") ans1 <- fread(f, sep=",", header=TRUE) -ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") -test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") -test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") +test(1548.1, unique(unlist(lapply(ans2, Encoding))), "UTF-8") # 1549 moved to benchmark.Rraw, #5517 From 88b1081e168994c462aa145cdb9991f3dc6db6e8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:19:40 +0200 Subject: [PATCH 3/7] remove spilled newline --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 6bfed74403..e060488818 
100644 --- a/NEWS.md +++ b/NEWS.md @@ -359,7 +359,6 @@ 7. In rare situations a data.table object may lose its internal attribute that holds a self-reference. New helper function `.selfref.ok()` tests just that. It is only intended for technical use cases. See manual for examples. - ## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025) 1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070). From e35d5add9ce2254c8e398ac57a0397e9cb5448b9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:20:56 +0200 Subject: [PATCH 4/7] update fread man page --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index 38352662f2..a6b48d8ed2 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -51,7 +51,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC" \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} - \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. 
Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } + \item{encoding}{ default is \code{"UTF-8"}. Other possible options are \code{"unknown"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.} \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. 
} From 48e021e0717f0bf72a38a920795f5350c64b4c9f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:37:44 +0200 Subject: [PATCH 5/7] fix typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b23bf94acf..daf861f4a8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7900,7 +7900,7 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in windows, #563 f = testDir("issue_563_fread.txt") ans1 <- fread(f, sep=",", header=TRUE) -test(1548.1, unique(unlist(lapply(ans2, Encoding))), "UTF-8") +test(1548.1, unique(unlist(lapply(ans1, Encoding))), "UTF-8") # 1549 moved to benchmark.Rraw, #5517 From cc01889c5209eb0cb85a3b97dc85ee227a4675f0 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:42:03 +0200 Subject: [PATCH 6/7] add info about enc2utf8 --- R/duplicated.R | 4 ++-- inst/tests/tests.Rraw | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/duplicated.R b/R/duplicated.R index 9a6594ef22..14d3bed3c7 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -14,7 +14,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ } else { o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE) if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) - warningf("Mixed encodings detected. Strings were coerced to UTF-8 before duplicated(x).") + warningf("Mixed encodings detected. Strings were coerced to UTF-8 before duplicated(x). 
Use enc2utf8() to avoid this warning.") if (attr(o, 'maxgrpn', exact=TRUE) == 1L) return(rep.int(FALSE, nrow(x))) f = attr(o, "starts", exact=TRUE) if (fromLast) f = cumsum(uniqlengths(f, nrow(x))) @@ -34,7 +34,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) { - warningf("Mixed encodings detected. Strings were coerced to UTF-8 before unique(x).") + warningf("Mixed encodings detected. Strings were coerced to UTF-8 before unique(x). Use enc2utf8() to avoid this warning.") } if (!is.null(cols)) { x = .shallow(x, c(by, cols), retain.key=TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index daf861f4a8..1570bbc036 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21818,4 +21818,4 @@ b dt = data.table(x=c(iconv("\u00E9","UTF-8","latin1"), "\u00E9")) test(2342.1, unique(dt), data.table(x="\u00E9"), warning="Mixed encodings.*") test(2342.2, duplicated(dt), c(FALSE, TRUE), warning="Mixed encodings.*") -test(2342.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9")) \ No newline at end of file +test(2342.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9")) From 2f49a9a73851993e9ebe5809fc04c0e95091f98d Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 22 Oct 2025 11:55:38 +0200 Subject: [PATCH 7/7] update docs --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index 0b69b2aff8..518c54fbc7 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -16,7 +16,7 @@ stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, -check.names=FALSE, encoding="unknown", +check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, 
index=NULL, showProgress=getOption("datatable.showProgress", interactive()),