diff --git a/locale.c b/locale.c index 9376e2ed81b0..106de8e35df8 100644 --- a/locale.c +++ b/locale.c @@ -9451,11 +9451,32 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #undef GET_DESCRIPTION #ifdef USE_LOCALE_COLLATE -STATIC void +STATIC bool S_compute_collxfrm_coefficients(pTHX) { - - /* A locale collation definition includes primary, secondary, tertiary, + /* This is called from mem_collxfrm() the first time the latter is called + * on the current locale to do initialization for it. + * + * This returns true and initializes the coefficients for a linear equation + * that, given a string of some length, predicts how much memory it will + * take to hold the result of calling mem_collxfrm() on that string. The + * equation is of the form: + * m * length + b + * where m = PL_collxfrm_mult and b = PL_collxfrm_base + * + * It returns false if the locale does not appear to be sane. + * + * The prediction is just an educated guess to save time and, + * mem_collxrfm() may adjust it based on experience with strings it + * encounters. + * + * This function also: + * sets 'PL_in_utf8_COLLATE_locale' to indicate if the locale is a + * UTF-8 one + * initializes 'PL_strxfrm_NUL_replacement' to NUL + * initializes 'PL_strxfrm_max_cp' = 0; + * + * A locale collation definition includes primary, secondary, tertiary, * etc. weights for each character. To sort, the primary weights are used, * and only if they compare equal, then the secondary weights are used, and * only if they compare equal, then the tertiary, etc. @@ -9517,13 +9538,13 @@ S_compute_collxfrm_coefficients(pTHX) PL_strxfrm_NUL_replacement = '\0'; PL_strxfrm_max_cp = 0; - /* mem_collxfrm_() is used get the transformation (though here we are - * interested only in its length). It is used because it has the - * intelligence to handle all cases, but to work, it needs some values of - * 'm' and 'b' to get it started. For the purposes of this calculation we - * use a very conservative estimate of 'm' and 'b'. This assumes a weight - * can be multiple bytes, enough to hold any UV on the platform, and there - * are 5 levels, 4 weight bytes, and a trailing NUL. */ + /* mem_collxfrm_() is used recursively to get the transformation (though + * here we are interested only in its length). It is used because it has + * the intelligence to handle all cases, but to work, it needs some values + * of 'm' and 'b' to get it started. For the purposes of this calculation + * we use a very conservative estimate of 'm' and 'b'. This assumes a + * weight can be multiple bytes, enough to hold any UV on the platform, and + * there are 5 levels, 4 weight bytes, and a trailing NUL. */ PL_collxfrm_base = 5; PL_collxfrm_mult = 5 * sizeof(UV); @@ -9564,41 +9585,40 @@ S_compute_collxfrm_coefficients(pTHX) "Disabling locale collation for LC_COLLATE='%s';" " length for shorter sample=%zu; longer=%zu\n", PL_collation_name, x_len_shorter, x_len_longer)); + return false; } - else { - SSize_t base; /* Temporary */ - - /* We have both: m * strlen(longer) + b = x_len_longer - * m * strlen(shorter) + b = x_len_shorter; - * subtracting yields: - * m * (strlen(longer) - strlen(shorter)) - * = x_len_longer - x_len_shorter - * But we have set things up so that 'shorter' is 1 byte smaller than - * 'longer'. Hence: - * m = x_len_longer - x_len_shorter - * - * But if something went wrong, make sure the multiplier is at least 1. - */ - if (x_len_longer > x_len_shorter) { - PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; - } - else { - PL_collxfrm_mult = 1; - } - /* mx + b = len - * so: b = len - mx - * but in case something has gone wrong, make sure it is non-negative - * */ - base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); - if (base < 0) { - base = 0; - } + SSize_t base; /* Temporary */ - /* Add 1 for the trailing NUL */ - PL_collxfrm_base = base + 1; + /* We have both: m * strlen(longer) + b = x_len_longer + * m * strlen(shorter) + b = x_len_shorter; + * subtracting yields: + * m * (strlen(longer) - strlen(shorter)) + * = x_len_longer - x_len_shorter + * But we have set things up so that 'shorter' is 1 byte smaller than + * 'longer'. Hence: + * m = x_len_longer - x_len_shorter + * + * But if something went wrong, make sure the multiplier is at least 1. + */ + if (x_len_longer > x_len_shorter) { + PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; + } + else { + PL_collxfrm_mult = 1; } + /* mx + b = len + * so: b = len - mx + * but in case something has gone wrong, make sure it is non-negative */ + base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); + if (base < 0) { + base = 0; + } + + /* Add 1 for the trailing NUL */ + PL_collxfrm_base = base + 1; + DEBUG_L(PerlIO_printf(Perl_debug_log, "?UTF-8 locale=%d; x_len_shorter=%zu, " "x_len_longer=%zu," @@ -9606,6 +9626,7 @@ S_compute_collxfrm_coefficients(pTHX) PL_in_utf8_COLLATE_locale, x_len_shorter, x_len_longer, PL_collxfrm_mult, PL_collxfrm_base)); + return true; } char * @@ -9663,12 +9684,14 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, if (PL_collxfrm_base != 0) { /* bad collation => skip */ DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: locale's collation is defective\n")); - goto bad; + goto bad_no_strxfrm; } /* (mult, base) == (0,0) means we need to calculate mult and base * before proceeding */ - S_compute_collxfrm_coefficients(aTHX); + if (! S_compute_collxfrm_coefficients(aTHX)) { + return NULL; /* locale collation not sane */ + } } /* Replace any embedded NULs with the control that sorts before any others. @@ -9741,7 +9764,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Create a 1-char string of the current code point */ cur_source[0] = (char) j; - /* Then transform it */ + /* Then transform it using a recursive call */ x = mem_collxfrm_(cur_source, trial_len, &x_len, 0 /* The string is not in UTF-8 */); @@ -9789,7 +9812,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: Couldn't find any character to replace" " embedded NULs in locale %s with", PL_collation_name)); - goto bad; + goto bad_no_strxfrm; } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -9901,7 +9924,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Create a 1-char string of the current code point */ cur_source[0] = (char) j; - /* Then transform it */ + /* Then transform it (recursively) */ x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); /* If something went wrong (which it shouldn't), just @@ -9930,7 +9953,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, "mem_collxfrm_: Couldn't find any character to" " replace above-Latin1 chars in locale %s with", PL_collation_name)); - goto bad; + goto bad_no_strxfrm; } DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -10090,13 +10113,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, if (UNLIKELY(! xbuf)) { DEBUG_L(PerlIO_printf(Perl_debug_log, "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc)); - goto bad; + goto bad_no_strxfrm; } /* Store the collation id */ *(PERL_UINTMAX_T *)xbuf = PL_collation_ix; -# define CLEANUP_STRXFRM_COMMON \ +# define CLEANUP_NON_STRXFRM \ STMT_START { \ Safefree(free_me); \ Safefree(sans_nuls); \ @@ -10119,7 +10142,6 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, STMT_START { \ if (constructed_locale != (locale_t) 0) \ freelocale(constructed_locale); \ - CLEANUP_STRXFRM_COMMON; \ } STMT_END # else # define my_strxfrm(dest, src, n) strxfrm(dest, src, n) @@ -10128,12 +10150,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); # define CLEANUP_STRXFRM \ - STMT_START { \ - restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \ - CLEANUP_STRXFRM_COMMON; \ - } STMT_END -# else -# define CLEANUP_STRXFRM CLEANUP_STRXFRM_COMMON + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); # endif # endif @@ -10203,7 +10220,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, PL_collxfrm_mult = new_m; PL_collxfrm_base = 1; /* +1 For trailing NUL */ computed_guess = PL_collxfrm_base - + (PL_collxfrm_mult * length_in_chars); + + (PL_collxfrm_mult * length_in_chars); if (computed_guess < needed) { PL_collxfrm_base += needed - computed_guess; } @@ -10275,6 +10292,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8)); CLEANUP_STRXFRM; + CLEANUP_NON_STRXFRM; /* Free up unneeded space; retain enough for trailing NUL */ Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); @@ -10286,9 +10304,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); CLEANUP_STRXFRM; + bad_no_strxfrm: /* Found a problem before strxfrm() got called */ + DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); + Safefree(xbuf); *xlen = 0; - + CLEANUP_NON_STRXFRM; return NULL; }