From 847f94782c905854ccb6e9d26360ec30c37d8691 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 22 Apr 2025 08:22:22 -0600 Subject: [PATCH 1/4] locale.c: Change name of macro This is in preparation for the next commit where it will be split out to be a stand-alone macro. --- locale.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/locale.c b/locale.c index 9376e2ed81b0..2b904ea97298 100644 --- a/locale.c +++ b/locale.c @@ -10096,7 +10096,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Store the collation id */ *(PERL_UINTMAX_T *)xbuf = PL_collation_ix; -# define CLEANUP_STRXFRM_COMMON \ +# define CLEANUP_NON_STRXFRM \ STMT_START { \ Safefree(free_me); \ Safefree(sans_nuls); \ @@ -10119,7 +10119,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, STMT_START { \ if (constructed_locale != (locale_t) 0) \ freelocale(constructed_locale); \ - CLEANUP_STRXFRM_COMMON; \ + CLEANUP_NON_STRXFRM; \ } STMT_END # else # define my_strxfrm(dest, src, n) strxfrm(dest, src, n) @@ -10130,10 +10130,10 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, # define CLEANUP_STRXFRM \ STMT_START { \ restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \ - CLEANUP_STRXFRM_COMMON; \ + CLEANUP_NON_STRXFRM; \ } STMT_END # else -# define CLEANUP_STRXFRM CLEANUP_STRXFRM_COMMON +# define CLEANUP_STRXFRM CLEANUP_NON_STRXFRM # endif # endif From 89520a4d1f0be755e449d07b880a1ab882cec3ed Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 22 Apr 2025 08:24:36 -0600 Subject: [PATCH 2/4] locale.c: Don't do asymmetric back out on failure This fixes #23519 When something goes wrong doing locale-aware string collation, the code attempts to carry on as well as can be expected. Prior to this commit the backout code was asymmetric, trying to undo things that had not been done. This happened when the failure was early on. In the case of this ticket, the platform has a defective locale that was detectable before getting very far along. The solution adopted here is to jump to a different label for those early failures that does less backout than for later failures. --- locale.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/locale.c b/locale.c index 2b904ea97298..de4cb356884a 100644 --- a/locale.c +++ b/locale.c @@ -9663,7 +9663,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, if (PL_collxfrm_base != 0) { /* bad collation => skip */ DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: locale's collation is defective\n")); - goto bad; + goto bad_no_strxfrm; } /* (mult, base) == (0,0) means we need to calculate mult and base @@ -9789,7 +9789,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: Couldn't find any character to replace" " embedded NULs in locale %s with", PL_collation_name)); - goto bad; + goto bad_no_strxfrm; } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -9930,7 +9930,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, "mem_collxfrm_: Couldn't find any character to" " replace above-Latin1 chars in locale %s with", PL_collation_name)); - goto bad; + goto bad_no_strxfrm; } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -10090,7 +10090,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, if (UNLIKELY(! xbuf)) { DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc)); - goto bad; + goto bad_no_strxfrm; } /* Store the collation id */ @@ -10119,7 +10119,6 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, STMT_START { \ if (constructed_locale != (locale_t) 0) \ freelocale(constructed_locale); \ - CLEANUP_NON_STRXFRM; \ } STMT_END # else # define my_strxfrm(dest, src, n) strxfrm(dest, src, n) @@ -10128,12 +10127,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); # define CLEANUP_STRXFRM \ - STMT_START { \ - restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \ - CLEANUP_NON_STRXFRM; \ - } STMT_END -# else -# define CLEANUP_STRXFRM CLEANUP_NON_STRXFRM + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); # endif # endif @@ -10275,6 +10269,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8)); CLEANUP_STRXFRM; + CLEANUP_NON_STRXFRM; /* Free up unneeded space; retain enough for trailing NUL */ Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); @@ -10286,9 +10281,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); CLEANUP_STRXFRM; + bad_no_strxfrm: /* Found a problem before strxfrm() got called */ + DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); + Safefree(xbuf); *xlen = 0; - + CLEANUP_NON_STRXFRM; return NULL; } From c3be9b188fe2bd0a7575d0da7e24c6a92ce243af Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 20 Apr 2025 11:57:19 -0600 Subject: [PATCH 3/4] mem_collxfrm(): Return early if locale collation not sane This changes a subsidiary function's return value from void to bool, returning false if it finds the locale doesn't have sane collation. The calling function is changed to check this, and give up immediately if the locale isn't sane. --- locale.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/locale.c b/locale.c index de4cb356884a..864b1652c76a 100644 --- a/locale.c +++ b/locale.c @@ -9451,11 +9451,32 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #undef GET_DESCRIPTION #ifdef USE_LOCALE_COLLATE -STATIC void +STATIC bool S_compute_collxfrm_coefficients(pTHX) { - - /* A locale collation definition includes primary, secondary, tertiary, + /* This is called from mem_collxfrm() the first time the latter is called + * on the current locale to do initialization for it. + * + * This returns true and initializes the coefficients for a linear equation + * that, given a string of some length, predicts how much memory it will + * take to hold the result of calling mem_collxfrm() on that string. The + * equation is of the form: + * m * length + b + * where m = PL_collxfrm_mult and b = PL_collxfrm_base + * + * It returns false if the locale does not appear to be sane. + * + * The prediction is just an educated guess to save time and, + * mem_collxrfm() may adjust it based on experience with strings it + * encounters. + * + * This function also: + * sets 'PL_in_utf8_COLLATE_locale' to indicate if the locale is a + * UTF-8 one + * initializes 'PL_strxfrm_NUL_replacement' to NUL + * initializes 'PL_strxfrm_max_cp' = 0; + * + * A locale collation definition includes primary, secondary, tertiary, * etc. weights for each character. To sort, the primary weights are used, * and only if they compare equal, then the secondary weights are used, and * only if they compare equal, then the tertiary, etc. @@ -9564,8 +9585,9 @@ S_compute_collxfrm_coefficients(pTHX) "Disabling locale collation for LC_COLLATE='%s';" " length for shorter sample=%zu; longer=%zu\n", PL_collation_name, x_len_shorter, x_len_longer)); + return false; } - else { + SSize_t base; /* Temporary */ /* We have both: m * strlen(longer) + b = x_len_longer @@ -9597,7 +9619,6 @@ S_compute_collxfrm_coefficients(pTHX) /* Add 1 for the trailing NUL */ PL_collxfrm_base = base + 1; - } DEBUG_L(PerlIO_printf(Perl_debug_log, "?UTF-8 locale=%d; x_len_shorter=%zu, " @@ -9606,6 +9627,7 @@ S_compute_collxfrm_coefficients(pTHX) PL_in_utf8_COLLATE_locale, x_len_shorter, x_len_longer, PL_collxfrm_mult, PL_collxfrm_base)); + return true; } char * @@ -9668,7 +9690,9 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* (mult, base) == (0,0) means we need to calculate mult and base * before proceeding */ - S_compute_collxfrm_coefficients(aTHX); + if (! S_compute_collxfrm_coefficients(aTHX)) { + return NULL; /* locale collation not sane */ + } } /* Replace any embedded NULs with the control that sorts before any others. From 13d2b29c1478c27a08b49d78d57a67fcef2b8f00 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 20 Apr 2025 12:01:00 -0600 Subject: [PATCH 4/4] mem_collxfrm(): White space, comments, only The previous commit removed a block; so can outdent --- locale.c | 75 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/locale.c b/locale.c index 864b1652c76a..106de8e35df8 100644 --- a/locale.c +++ b/locale.c @@ -9538,13 +9538,13 @@ S_compute_collxfrm_coefficients(pTHX) PL_strxfrm_NUL_replacement = '\0'; PL_strxfrm_max_cp = 0; - /* mem_collxfrm_() is used get the transformation (though here we are - * interested only in its length). It is used because it has the - * intelligence to handle all cases, but to work, it needs some values of - * 'm' and 'b' to get it started. For the purposes of this calculation we - * use a very conservative estimate of 'm' and 'b'. This assumes a weight - * can be multiple bytes, enough to hold any UV on the platform, and there - * are 5 levels, 4 weight bytes, and a trailing NUL. */ + /* mem_collxfrm_() is used recursively to get the transformation (though + * here we are interested only in its length). It is used because it has + * the intelligence to handle all cases, but to work, it needs some values + * of 'm' and 'b' to get it started. For the purposes of this calculation + * we use a very conservative estimate of 'm' and 'b'. This assumes a + * weight can be multiple bytes, enough to hold any UV on the platform, and + * there are 5 levels, 4 weight bytes, and a trailing NUL. */ PL_collxfrm_base = 5; PL_collxfrm_mult = 5 * sizeof(UV); @@ -9588,37 +9588,36 @@ S_compute_collxfrm_coefficients(pTHX) return false; } - SSize_t base; /* Temporary */ + SSize_t base; /* Temporary */ - /* We have both: m * strlen(longer) + b = x_len_longer - * m * strlen(shorter) + b = x_len_shorter; - * subtracting yields: - * m * (strlen(longer) - strlen(shorter)) - * = x_len_longer - x_len_shorter - * But we have set things up so that 'shorter' is 1 byte smaller than - * 'longer'. Hence: - * m = x_len_longer - x_len_shorter - * - * But if something went wrong, make sure the multiplier is at least 1. - */ - if (x_len_longer > x_len_shorter) { - PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; - } - else { - PL_collxfrm_mult = 1; - } + /* We have both: m * strlen(longer) + b = x_len_longer + * m * strlen(shorter) + b = x_len_shorter; + * subtracting yields: + * m * (strlen(longer) - strlen(shorter)) + * = x_len_longer - x_len_shorter + * But we have set things up so that 'shorter' is 1 byte smaller than + * 'longer'. Hence: + * m = x_len_longer - x_len_shorter + * + * But if something went wrong, make sure the multiplier is at least 1. + */ + if (x_len_longer > x_len_shorter) { + PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; + } + else { + PL_collxfrm_mult = 1; + } - /* mx + b = len - * so: b = len - mx - * but in case something has gone wrong, make sure it is non-negative - * */ - base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); - if (base < 0) { - base = 0; - } + /* mx + b = len + * so: b = len - mx + * but in case something has gone wrong, make sure it is non-negative */ + base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); + if (base < 0) { + base = 0; + } - /* Add 1 for the trailing NUL */ - PL_collxfrm_base = base + 1; + /* Add 1 for the trailing NUL */ + PL_collxfrm_base = base + 1; DEBUG_L(PerlIO_printf(Perl_debug_log, "?UTF-8 locale=%d; x_len_shorter=%zu, " @@ -9765,7 +9764,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Create a 1-char string of the current code point */ cur_source[0] = (char) j; - /* Then transform it */ + /* Then transform it using a recursive call */ x = mem_collxfrm_(cur_source, trial_len, &x_len, 0 /* The string is not in UTF-8 */); @@ -9925,7 +9924,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Create a 1-char string of the current code point */ cur_source[0] = (char) j; - /* Then transform it */ + /* Then transform it (recursively) */ x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); /* If something went wrong (which it shouldn't), just @@ -10221,7 +10220,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, PL_collxfrm_mult = new_m; PL_collxfrm_base = 1; /* +1 For trailing NUL */ computed_guess = PL_collxfrm_base - + (PL_collxfrm_mult * length_in_chars); + + (PL_collxfrm_mult * length_in_chars); if (computed_guess < needed) { PL_collxfrm_base += needed - computed_guess; }