@@ -50,12 +50,20 @@ static int ustr_alloc = 0;
5050static int ustr_n = 0 ;
5151static int ustr_maxlen = 0 ;
5252static int sortType = 0 ; // 0 just group; -1 descending, +1 ascending
53- static int nalast = 0 ; // 1 (true i.e. last), 0 (false i.e. first), -1 (na i.e. remove)
53+
5454static int nradix = 0 ;
5555static uint8_t * * key = NULL ;
5656static int * anso = NULL ;
5757static bool notFirst = false;
5858
59+ static int lgl_arg_to_int (int lgl_arg ) {
60+ if (lgl_arg == NA_LOGICAL ) {
61+ return -1 ;
62+ }
63+ return lgl_arg ;
64+ }
65+
66+
5967static char msg [1001 ];
6068// use STOP in this file (not error()) to ensure cleanup() is called first
6169// snprintf to msg first in case nrow (just as an example) is provided in the message because cleanup() sets nrow to 0
@@ -440,7 +448,7 @@ uint64_t dtwiddle(double x) //const void *p, int i)
440448 STOP (_ ("Unknown non-finite value; not NA, NaN, -Inf or +Inf" )); // # nocov
441449}
442450
443- void radix_r (const int from , const int to , int radix );
451+ void radix_r (const int from , const int to , int radix , bool na_remove );
444452
445453/*
446454 OpenMP is used here to parallelize multiple operations that come together to
@@ -527,7 +535,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
527535 STOP (_ ("At least one of retGrp= or sort= must be TRUE" ));
528536 if (!isLogical (naArg ) || LENGTH (naArg ) != 1 )
529537 STOP (_ ("na.last must be logical TRUE, FALSE or NA of length 1" )); // # nocov # covered in reuseSorting forder
530- nalast = (LOGICAL (naArg )[0 ] == NA_LOGICAL ) ? -1 : LOGICAL (naArg )[0 ]; // 1=na last, 0=na first (default), -1=remove na
538+ const int nalast = lgl_arg_to_int (LOGICAL (naArg )[0 ]); // 1=na last, 0=na first (default), -1=remove na
539+ const bool na_remove = (nalast == -1 );
531540
532541 if (nrow == 0 ) {
533542 // empty vector or 0-row DT is always sorted
@@ -803,7 +812,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
803812 free (TMP ); free (UGRP ); // # nocov
804813 STOP (_ ("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d" ), nth ); // # nocov
805814 }
806-
815+
807816 if (retgrp ) {
808817 gs_thread = calloc (nth , sizeof (* gs_thread )); // thread private group size buffers
809818 gs_thread_alloc = calloc (nth , sizeof (* gs_thread_alloc ));
@@ -814,7 +823,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
814823 }
815824 }
816825 if (nradix ) {
817- radix_r (0 , nrow - 1 , 0 ); // top level recursive call: (from, to, radix)
826+ radix_r (0 , nrow - 1 , 0 , na_remove ); // top level recursive call: (from, to, radix)
818827 } else {
819828 push (& nrow , 1 );
820829 }
@@ -906,7 +915,7 @@ static bool sort_ugrp(uint8_t *x, const int n)
906915 return skip ;
907916}
908917
909- void radix_r (const int from , const int to , int radix ) {
918+ void radix_r (const int from , const int to , int radix , bool na_remove ) {
910919 for (;;) {
911920 TBEG ();
912921 const int my_n = to - from + 1 ;
@@ -1038,7 +1047,7 @@ void radix_r(const int from, const int to, int radix) {
10381047 continue ;
10391048 } else {
10401049 for (int i = 0 , f = from ; i < ngrp ; i ++ ) {
1041- radix_r (f , f + my_gs [i ]- 1 , radix + 1 );
1050+ radix_r (f , f + my_gs [i ]- 1 , radix + 1 , na_remove );
10421051 f += my_gs [i ];
10431052 }
10441053 }
@@ -1097,7 +1106,7 @@ void radix_r(const int from, const int to, int radix) {
10971106 }
10981107
10991108 int * restrict my_TMP = TMP + omp_get_thread_num ()* UINT16_MAX ; // Allocated up front to save malloc calls which i) block internally and ii) could fail
1100- if (radix == 0 && nalast != -1 ) {
1109+ if (radix == 0 && ! na_remove ) {
11011110 // anso contains 1:n so skip reading and copying it. Only happens when nrow<65535. Saving worth the branch (untested) when user repeatedly calls a small-n small-cardinality order.
11021111 for (int i = 0 ; i < my_n ; i ++ ) anso [my_starts [my_key [i ]]++ ] = i + 1 ; // +1 as R is 1-based.
11031112 // The loop counter could be uint_fast16_t since max i here will be UINT16_MAX-1 (65534), hence ++ after last iteration won't overflow 16bits. However, have chosen signed
@@ -1147,7 +1156,7 @@ void radix_r(const int from, const int to, int radix) {
11471156 } else {
11481157 // this single thread will now descend and resolve all groups, now that the groups are close in cache
11491158 for (int i = 0 , my_from = from ; i < ngrp ; i ++ ) {
1150- radix_r (my_from , my_from + my_gs [i ]- 1 , radix + 1 );
1159+ radix_r (my_from , my_from + my_gs [i ]- 1 , radix + 1 , na_remove );
11511160 my_from += my_gs [i ];
11521161 }
11531162 }
@@ -1368,7 +1377,7 @@ void radix_r(const int from, const int to, int radix) {
13681377 // each in parallel here and they're all dealt with in parallel. There is no nestedness here.
13691378 for (int i = 0 ; i < ngrp ; i ++ ) {
13701379 int start = from + starts [ugrp [i ]];
1371- radix_r (start , start + my_gs [i ]- 1 , radix + 1 );
1380+ radix_r (start , start + my_gs [i ]- 1 , radix + 1 , na_remove );
13721381 flush ();
13731382 }
13741383 TEND (24 )
@@ -1380,7 +1389,7 @@ void radix_r(const int from, const int to, int radix) {
13801389 #pragma omp parallel for ordered schedule(dynamic) num_threads(MIN(nth, ngrp)) // #5077
13811390 for (int i = 0 ; i < ngrp ; i ++ ) {
13821391 int start = from + starts [ugrp [i ]];
1383- radix_r (start , start + my_gs [i ]- 1 , radix + 1 );
1392+ radix_r (start , start + my_gs [i ]- 1 , radix + 1 , na_remove );
13841393 #pragma omp ordered
13851394 flush ();
13861395 }
@@ -1389,7 +1398,7 @@ void radix_r(const int from, const int to, int radix) {
13891398 #pragma omp parallel for schedule(dynamic) num_threads(MIN(nth, ngrp)) // #5077
13901399 for (int i = 0 ; i < ngrp ; i ++ ) {
13911400 int start = from + starts [ugrp [i ]];
1392- radix_r (start , start + my_gs [i ]- 1 , radix + 1 );
1401+ radix_r (start , start + my_gs [i ]- 1 , radix + 1 , na_remove );
13931402 }
13941403 }
13951404 TEND (25 )
@@ -1682,8 +1691,15 @@ bool GetAutoIndex(void) {
16821691}
16831692
16841693// attr(idx, "anyna")>0 || attr(idx, "anyinfnan")>0
1685- bool idxAnyNF (SEXP idx ) {
1686- return INTEGER (getAttrib (idx , sym_anyna ))[0 ]> 0 || INTEGER (getAttrib (idx , sym_anyinfnan ))[0 ]> 0 ;
1694+ static inline bool idxAnyNF (SEXP idx ) {
1695+ SEXP any_na = getAttrib (idx , sym_anyna );
1696+ // Defensive: #7519 to avoid segfaults on CRAN
1697+ if (!isInteger (any_na ) || xlength (any_na ) != 1 ) return true;
1698+
1699+ SEXP any_infnan = getAttrib (idx , sym_anyinfnan );
1700+ if (!isInteger (any_infnan ) || xlength (any_infnan ) != 1 ) return true;
1701+
1702+ return INTEGER (any_na )[0 ]> 0 || INTEGER (any_infnan )[0 ]> 0 ;
16871703}
16881704
16891705// forder, re-use existing key or index if possible, otherwise call forder
@@ -1708,7 +1724,11 @@ SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP
17081724 bool sortGroups = (bool )LOGICAL (sortGroupsArg )[0 ];
17091725 if (!isLogical (naArg ) || LENGTH (naArg ) != 1 )
17101726 error (_ ("na.last must be logical TRUE, FALSE or NA of length 1" ));
1711- bool na = (bool )LOGICAL (naArg )[0 ];
1727+ int na_arg = lgl_arg_to_int (LOGICAL (naArg )[0 ]);
1728+
1729+ const bool na_first = (na_arg == 0 ); // na.last=FALSE
1730+ const bool na_last = (na_arg == 1 ); // na.last=TRUE
1731+ const bool na_remove = (na_arg == -1 ); // na.last=NA
17121732 if (!isInteger (ascArg ))
17131733 error (_ ("order must be integer" )); // # nocov # coerced to int in R
17141734 if (!isLogical (reuseSortingArg ) || LENGTH (reuseSortingArg ) != 1 )
@@ -1739,18 +1759,18 @@ SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP
17391759 opt = 0 ;
17401760 }
17411761 SEXP ans = R_NilValue ;
1742- if (opt == -1 && ! na && !retGrp && colsKeyHead (DT , by )) {
1762+ if (opt == -1 && na_first && !retGrp && colsKeyHead (DT , by )) {
17431763 opt = 1 ; // keyOpt
17441764 ans = PROTECT (allocVector (INTSXP , 0 )); protecti ++ ;
17451765 if (verbose )
17461766 Rprintf (_ ("forderReuseSorting: using key: %s\n" ), CHAR (STRING_ELT (idxName (DT , by ), 0 )));
17471767 }
17481768 if (opt == -1 && GetUseIndex ()) {
17491769 SEXP idx = getIndex (DT , by );
1750- if (!isNull (idx )) {
1751- bool hasStats = !isNull (getAttrib (idx , sym_anyna ));
1752- if (! na || // na.last=FALSE
1753- (hasStats && !idxAnyNF (idx ))) { // na.last=TRUE && !anyNA
1770+ if (!isNull (idx ) && ! na_remove ) {
1771+ bool hasStats = !isNull (getAttrib (idx , sym_anyna )) && ! isNull ( getAttrib ( idx , sym_anyinfnan )) ;
1772+ if (na_first || // na.last=FALSE
1773+ (na_last && hasStats && !idxAnyNF (idx ))) { // na.last=TRUE && !anyNA
17541774 bool hasGrp = !isNull (getAttrib (idx , sym_starts ));
17551775 if (hasGrp && !hasStats )
17561776 internal_error_with_cleanup (__func__ , "index has 'starts' attribute but not 'anyna', please report to issue tracker" ); // # nocov
@@ -1809,7 +1829,7 @@ SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP
18091829 if (opt < 1 ) {
18101830 ans = PROTECT (forder (DT , by , retGrpArg , retStatsArg , sortGroupsArg , ascArg , naArg )); protecti ++ ;
18111831 if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or reuseSorting=FALSE
1812- (! na || (retStats && !idxAnyNF (ans ))) && // lets create index even if na.last=T used but no NAs detected!
1832+ (na_first || (retStats && !idxAnyNF (ans ))) && // lets create index even if na.last=T used but no NAs detected!
18131833 GetUseIndex () &&
18141834 GetAutoIndex ()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only
18151835 putIndex (DT , by , ans );
0 commit comments