Skip to content

Commit 358caa2

Browse files
authored
Fix forder segfault on long radix (#6111)
* fix recursion version * exit loop * add coverage * finetune NEWS
1 parent 089b957 commit 358caa2

File tree

3 files changed

+39
-4
lines changed

3 files changed

+39
-4
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
338338
339339
19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.
340340
341+
20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix.
342+
341343
### NOTES
342344
343345
1. The following in-progress deprecations have proceeded:

inst/tests/tests.Rraw

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21823,4 +21823,15 @@ test(2342.3, options = c(datatable.week = "sequential"), week(as.IDate(c("2019-1
2182321823
test(2342.4, options = c(datatable.week = "sequential"), week(as.IDate(c("2020-12-31","2021-01-01"))), c(53L,1L))
2182421824
test(2342.5, options = c(datatable.week = "sequential"), week(as.IDate("2021-01-06") + 0:6), c(1L,1L,2L,2L,2L,2L,2L))
2182521825
test(2342.6, options = c(datatable.week = "sequential"), week(as.IDate(c("2016-02-27","2016-02-28","2016-02-29","2016-03-01","2016-03-02"))), c(9L,9L,9L,9L,9L))
21826-
test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing")
21826+
test(2342.7, options = c(datatable.week = "default"), week(as.IDate("1970-01-07")), 2L, warning = "The default behavior of week() is changing")
21827+
21828+
# forderv should not segfault on long single group keys due to recursion #4300
21829+
N = 1e4
21830+
set.seed(1)
21831+
idx = sort(sample(10, 20, TRUE))
21832+
x = matrix(rnorm(N), nrow=10)
21833+
DT = as.data.table(x[idx,])
21834+
DT[, V1000 := 20:1]
21835+
test(2343.1, forderv(DT, by=names(DT), sort=FALSE, retGrp=TRUE), forderv(DT, by=c("V1", "V1000"), sort=FALSE, retGrp=TRUE))
21836+
x = c(rep(0, 7e5), 1e6)
21837+
test(2343.2, forderv(list(x)), integer(0))

src/forder.c

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ uint64_t dtwiddle(double x) //const void *p, int i)
437437
STOP(_("Unknown non-finite value; not NA, NaN, -Inf or +Inf")); // # nocov
438438
}
439439

440-
void radix_r(const int from, const int to, const int radix);
440+
void radix_r(const int from, const int to, int radix);
441441

442442
/*
443443
OpenMP is used here to parallelize multiple operations that come together to
@@ -901,7 +901,8 @@ static bool sort_ugrp(uint8_t *x, const int n)
901901
return skip;
902902
}
903903

904-
void radix_r(const int from, const int to, const int radix) {
904+
void radix_r(const int from, const int to, int radix) {
905+
for (;;) {
905906
TBEG();
906907
const int my_n = to-from+1;
907908
if (my_n==1) { // minor TODO: batch up the 1's instead in caller (and that's only needed when retgrp anyway)
@@ -1025,6 +1026,11 @@ void radix_r(const int from, const int to, const int radix) {
10251026
TEND(9)
10261027
if (radix+1==nradix || ngrp==my_n) { // ngrp==my_n => unique groups all size 1 and we can stop recursing now
10271028
push(my_gs, ngrp);
1029+
} else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300
1030+
// no need to push since gs stays the same
1031+
free(my_gs);
1032+
radix++;
1033+
continue;
10281034
} else {
10291035
for (int i=0, f=from; i<ngrp; i++) {
10301036
radix_r(f, f+my_gs[i]-1, radix+1);
@@ -1128,6 +1134,11 @@ void radix_r(const int from, const int to, const int radix) {
11281134
if (radix+1==nradix) {
11291135
// aside: cannot be all size 1 (a saving used in my_n<=256 case above) because my_n>256 and ngrp<=256
11301136
push(my_gs, ngrp);
1137+
} else if (ngrp==1) { // ngrp == 1 => all same byte at this radix; go to next radix with same splits #4300
1138+
// no need to push since gs stays the same
1139+
free(my_gs);
1140+
radix++;
1141+
continue;
11311142
} else {
11321143
// this single thread will now descend and resolve all groups, now that the groups are close in cache
11331144
for (int i=0, my_from=from; i<ngrp; i++) {
@@ -1313,6 +1324,16 @@ void radix_r(const int from, const int to, const int radix) {
13131324
push(my_gs, ngrp);
13141325
TEND(23)
13151326
}
1327+
else if (ngrp==1) {
1328+
// no need to push since gs stays the same
1329+
free(my_gs);
1330+
free(counts);
1331+
free(starts);
1332+
free(ugrps);
1333+
free(ngrps);
1334+
radix++;
1335+
continue;
1336+
}
13161337
else {
13171338
// TODO: explicitly repeat parallel batch for any skew bins
13181339
bool anyBig = false;
@@ -1364,7 +1385,8 @@ void radix_r(const int from, const int to, const int radix) {
13641385
free(ugrps);
13651386
free(ngrps);
13661387
TEND(26)
1367-
}
1388+
return;
1389+
}}
13681390

13691391

13701392
SEXP issorted(SEXP x, SEXP by)

0 commit comments

Comments
 (0)