diff --git a/NEWS.md b/NEWS.md index 60cb1dd0c..4690cd349 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ 5. Negative and missing values of `n` argument of adaptive rolling functions trigger an error. +6. `unique()` and `duplicated()` warn now if columns with encodings other than UTF-8 are present, since these are converted to UTF-8 for comparison, which may lead to unexpected results, [#469](https://github.com/Rdatatable/data.table/issues/469). Additionally, `fread(encoding=)` now defaults to `"UTF-8"` (previously `"unknown"`) since most systems already use UTF-8 as the native encoding. Thanks to @arunsrinivasan for the request and @ben-schwen for the implementation. + ### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES 1. `data.table(x=1, <expr>)`, where `<expr>` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`, consistent with `data.table(x=1, <expr>)` where `<expr>` results in an atomic vector, for example `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1`, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname)` to `FALSE`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`. diff --git a/R/duplicated.R b/R/duplicated.R index e1a04c982..14d3bed3c 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -13,6 +13,8 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ if (fromLast) f = cumsum(uniqlengths(f, nrow(x))) } else { o = forderv(x, by=query$by, sort=FALSE, retGrp=TRUE) + if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) + warningf("Mixed encodings detected. Strings were coerced to UTF-8 before duplicated(x). 
Use enc2utf8() to avoid this warning.") if (attr(o, 'maxgrpn', exact=TRUE) == 1L) return(rep.int(FALSE, nrow(x))) f = attr(o, "starts", exact=TRUE) if (fromLast) f = cumsum(uniqlengths(f, nrow(x))) @@ -31,6 +33,9 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon if (nrow(x) <= 1L) return(copy(x)) # unique(x)[, col := val] should not alter x, #5932 if (!length(by)) by = NULL #4594 o = forderv(x, by=by, sort=FALSE, retGrp=TRUE) + if (isTRUE(as.logical(attr(o, "anynotutf8", exact=TRUE)))) { + warningf("Mixed encodings detected. Strings were coerced to UTF-8 before unique(x). Use enc2utf8() to avoid this warning.") + } if (!is.null(cols)) { x = .shallow(x, c(by, cols), retain.key=TRUE) } diff --git a/R/fread.R b/R/fread.R index 16a72ed24..031ae3249 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2e1284aea..1570bbc03 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7900,9 +7900,7 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in 
windows, #563 f = testDir("issue_563_fread.txt") ans1 <- fread(f, sep=",", header=TRUE) -ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") -test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") -test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") +test(1548.1, unique(unlist(lapply(ans1, Encoding))), "UTF-8") # 1549 moved to benchmark.Rraw, #5517 @@ -21815,3 +21813,9 @@ test(2341.24, fread('a # leading cmnt b ', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b"))) + +# warn about different encodings in unique and duplicated, #469 +dt = data.table(x=c(iconv("\u00E9","UTF-8","latin1"), "\u00E9")) +test(2342.1, unique(dt), data.table(x="\u00E9"), warning="Mixed encodings.*") +test(2342.2, duplicated(dt), c(FALSE, TRUE), warning="Mixed encodings.*") +test(2342.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9")) diff --git a/man/fread.Rd b/man/fread.Rd index 40d2cfcc9..518c54fbc 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -16,7 +16,7 @@ stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, -check.names=FALSE, encoding="unknown", +check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), @@ -51,7 +51,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC" \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. 
} \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} - \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } + \item{encoding}{ default is \code{"UTF-8"}. Other possible options are \code{"unknown"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.} \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }