Merge branch 'master' into omp_limits

ben-schwen · web-flow · commit 5921b4685a12 · 2025-11-01T16:55:23.000+01:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -104,5 +104,6 @@ Authors@R: c(
   person("Reino", "Bruner",        role="ctb"),
   person(given="@badasahog",       role="ctb", comment="GitHub user"),
   person("Vinit", "Thakur",        role="ctb"),
-  person("Mukul", "Kumar",         role="ctb")
+  person("Mukul", "Kumar",         role="ctb"),
+  person("Ildikó", "Czeller",      role="ctb")
   )
diff --git a/NEWS.md b/NEWS.md
@@ -340,7 +340,9 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
 
 20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix.
 
-21. `setDTthreads(percent=)` and `setDTthreads(threads=)` now respect `OMP_NUM_THREADS` and `omp_get_max_threads()`, ensuring consistency with `setDTthreads()` (no arguments) when OpenMP environment variables are set, [#7165](https://github.com/Rdatatable/data.table/issues/7165). Previously, explicitly setting a thread count or percentage would ignore these OpenMP limits, potentially exceeding the user's intended thread cap. Thanks to @bastistician for the report and @ben-schwen for the fix.
+21. `[` now preserves existing key(s) when new columns are added before them, instead of incorrectly setting a new column as key, [#7364](https://github.com/Rdatatable/data.table/issues/7364). Thanks @czeildi for the bug report and the fix.
+
+22. `setDTthreads(percent=)` and `setDTthreads(threads=)` now respect `OMP_NUM_THREADS` and `omp_get_max_threads()`, ensuring consistency with `setDTthreads()` (no arguments) when OpenMP environment variables are set, [#7165](https://github.com/Rdatatable/data.table/issues/7165). Previously, explicitly setting a thread count or percentage would ignore these OpenMP limits, potentially exceeding the user's intended thread cap. Thanks to @bastistician for the report and @ben-schwen for the fix.
 
 ### NOTES
 
@@ -538,6 +540,8 @@ rowwiseDT(
 
 21. `setDT(get0('var'))` now correctly modifies `var` by reference, consistent with the long-standing behavior of `setDT(get('var'))`, [#6864](https://github.com/Rdatatable/data.table/issues/6864). Thanks to @rikivillalba for the report and @venom1204 for the fix.
 
+22. `fread()` could fail to read Mac CSV files (with `\r` line endings) if the file contained any `\n` character, such as a final `\r\n`. This was fixed by detecting the predominant line ending in a sample of the file, [#4186](https://github.com/Rdatatable/data.table/issues/4186). Thanks to @MPagel for the report and @ben-schwen for the fix.
+
 ### NOTES
 
 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181).
diff --git a/R/data.table.R b/R/data.table.R
@@ -1448,7 +1448,7 @@ replace_dot_alias = function(e) {
         if (SD_only)
           jvnames = jnames = sdvars
         else
-          jnames = as.character(Filter(is.name, jsub)[-1L])
+          jnames = vapply_1c(jsub, function(x) if (is.name(x)) as.character(x) else NA_character_)[-1L]
         key_idx = chmatch(key, jnames)
         missing_keys = which(is.na(key_idx))
         if (length(missing_keys) && missing_keys[1L] == 1L) return(NULL)
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -11952,6 +11952,8 @@ tt = setDT(read.csv(f, stringsAsFactors=FALSE))
 tt[2, B:=gsub("\n","\r",B)]  # base R changes the \r to a \n, so restore that
 test(1778.4, tt, DT)
 unlink(f)
+# fread used to fail on Mac-style input mixing \r and \r\n line endings, #4186
+test(1778.5, fread(text="Col1;Col2\r1;data1\r2;data2\r3;data3\r\n", verbose=TRUE), data.table(Col1=1:3, Col2=c("data1","data2","data3")), output="An \\r")
 
 # #1392 IDate ITime new methods for faster conversion
 # conversion in-out match for UTC
@@ -21835,3 +21837,21 @@ DT[, V1000 := 20:1]
 test(2343.1, forderv(DT, by=names(DT), sort=FALSE, retGrp=TRUE), forderv(DT, by=c("V1", "V1000"), sort=FALSE, retGrp=TRUE))
 x = c(rep(0, 7e5), 1e6)
 test(2343.2, forderv(list(x)), integer(0))
+
+# Keep key when new column added before existing key in j
+# Incorrect key can lead to incorrect join result #7364
+DT = data.table(V1 = 1:2, key = "V1")
+test(2344.00, key(DT[, .(V2 = c("b", "a"), V1)]), "V1")
+test(2344.01, key(DT[, .(V2 = -V1, V1)]), "V1")
+
+d1 = data.table(V1 = c(1L, 0L, 1L), V2 = c("a", "a", "b"), key = "V2")
+d2 = d1[, .(V1, label = c("one", "zero", "one"), V2)]
+r = d2[data.table(label = "one"), on = "label", allow.cartesian = TRUE]
+test(2344.02, nrow(r), 2L)
+# join result of keyed input is the same as unkeyed input
+test(2344.03, setkey(d1[, .(V1, label = c("one", "zero", "one"), V2)][data.table(label = "one"), on = "label", allow.cartesian = TRUE], NULL),
+              setkey(d1, NULL)[, .(V1, label = c("one", "zero", "one"), V2)][data.table(label = "one"), on = "label", allow.cartesian = TRUE])
+
+# keep sub-key in case of multiple keys, even with new columns and changing column order
+DT = data.table(V1 = 1:2, V2 = 3:4, V3 = 5:6, key = c("V1", "V2", "V3"))
+test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", "V2"))
diff --git a/man/measure.Rd b/man/measure.Rd
@@ -83,7 +83,7 @@ melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)
 print(melt(who, measure.vars = measure(
   diagnosis, gender, ages,
   ymin=as.numeric,
-  ymax=function(y)ifelse(y=="", Inf, as.numeric(y)),
+  ymax=function(y)ifelse(nzchar(y), as.numeric(y), Inf),
   pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))"
 )), class=TRUE)
 }
diff --git a/src/fread.c b/src/fread.c
@@ -1628,12 +1628,35 @@ int freadMain(freadMainArgs _args)
   if (verbose) DTPRINT(_("[04] Arrange mmap to be \\0 terminated\n"));
 
   // First, set 'eol_one_r' for use by eol() to know if \r-only line ending is allowed, #2371
+  // Count different line ending types to handle mixed endings (e.g. Mac CSV with mostly \r and final \r\n) #4186
+  int count_r_only = 0;   // \r not followed by \n
+  int count_with_n = 0;   // \n with or without \r
   ch = sof;
-  while (ch < eof && *ch != '\n') ch++;
-  eol_one_r = (ch == eof);
+  const char *sample_end = eof;
+  if ((size_t)(eof - sof) > 100000) sample_end = sof + 100000; // Sample first 100KB or whole file if smaller
+  while (ch < sample_end) {
+    if (*ch == '\r') {
+      // Skip consecutive \r to avoid miscounting \r\r\n as multiple line endings
+      while (ch < sample_end && *ch == '\r') ch++;
+      if (ch < sample_end && *ch == '\n') {
+        count_with_n++;
+        ch++;
+      } else {
+        count_r_only++;
+      }
+    } else if (*ch == '\n') {
+      count_with_n++;
+      ch++;
+    } else {
+      ch++;
+    }
+  }
+  // If file has mostly \r-only line endings, treat \r as line ending
+  eol_one_r = (count_r_only > count_with_n);
   if (verbose) DTPRINT(eol_one_r ?
-    _("  No \\n exists in the file at all, so single \\r (if any) will be taken as one line ending. This is unusual but will happen normally when there is no \\r either; e.g. a single line missing its end of line.\n") :
-    _("  \\n has been found in the input and different lines can end with different line endings (e.g. mixed \\n and \\r\\n in one file). This is common and ideal.\n"));
+    _("  An \\r by itself will be taken as one line ending (counts: %d \\r by themselves vs %d [\\r]*\\n). This happens with old Mac CSV or when there is no \\r at all.\n") :
+    _("  \\n has been found in the input (counts: %d \\r by themselves vs %d [\\r]*\\n) and different lines can end with different line endings (e.g. mixed \\n and \\r\\n in one file). This is common and ideal.\n"),
+    count_r_only, count_with_n);
 
   bool lastEOLreplaced = false;
   if (args.filename) {
diff --git a/vignettes/datatable-reshape.Rmd b/vignettes/datatable-reshape.Rmd
@@ -279,7 +279,7 @@ melt(who, measure.vars = measure(
 
 When using the `pattern` argument, it must be a Perl-compatible
 regular expression containing the same number of capture groups
-(parenthesized sub-expressions) as the number other arguments (group
+(parenthesized sub-expressions) as the number of other arguments (group
 names). The code below shows how to use a more complex regex with five
 groups, two numeric output columns, and an anonymous type conversion
 function,

Original file line number	Diff line number	Diff line change
`@@ -104,5 +104,6 @@ Authors@R: c(`
`104`	`104`	`person("Reino", "Bruner", role="ctb"),`
`105`	`105`	`person(given="@badasahog", role="ctb", comment="GitHub user"),`
`106`	`106`	`person("Vinit", "Thakur", role="ctb"),`
`107`		`- person("Mukul", "Kumar", role="ctb")`
	`107`	`+ person("Mukul", "Kumar", role="ctb"),`
	`108`	`+ person("Ildikó", "Czeller", role="ctb")`
`108`	`109`	`)`
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ melt(who, measure.vars = measure(diagnosis, gender, ages, pattern="new_?(.*)_(.)`
`83`	`83`	`print(melt(who, measure.vars = measure(`
`84`	`84`	`diagnosis, gender, ages,`
`85`	`85`	`ymin=as.numeric,`
`86`		`- ymax=function(y)ifelse(y=="", Inf, as.numeric(y)),`
	`86`	`+ ymax=function(y)ifelse(nzchar(y), as.numeric(y), Inf),`
`87`	`87`	`pattern="new_?(.*)_(.)(([0-9]{2})([0-9]{0,2}))"`
`88`	`88`	`)), class=TRUE)`
`89`	`89`	`}`