Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 78 additions & 57 deletions locale.c
Original file line number Diff line number Diff line change
Expand Up @@ -9451,11 +9451,32 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
#undef GET_DESCRIPTION
#ifdef USE_LOCALE_COLLATE

STATIC void
STATIC bool
S_compute_collxfrm_coefficients(pTHX)
{

/* A locale collation definition includes primary, secondary, tertiary,
/* This is called from mem_collxfrm() the first time the latter is called
* on the current locale to do initialization for it.
*
* This returns true and initializes the coefficients for a linear equation
* that, given a string of some length, predicts how much memory it will
* take to hold the result of calling mem_collxfrm() on that string. The
* equation is of the form:
* m * length + b
* where m = PL_collxfrm_mult and b = PL_collxfrm_base
*
* It returns false if the locale does not appear to be sane.
*
* The prediction is just an educated guess to save time and,
* mem_collxrfm() may adjust it based on experience with strings it
* encounters.
*
* This function also:
* sets 'PL_in_utf8_COLLATE_locale' to indicate if the locale is a
* UTF-8 one
* initializes 'PL_strxfrm_NUL_replacement' to NUL
* initializes 'PL_strxfrm_max_cp' = 0;
*
* A locale collation definition includes primary, secondary, tertiary,
* etc. weights for each character. To sort, the primary weights are used,
* and only if they compare equal, then the secondary weights are used, and
* only if they compare equal, then the tertiary, etc.
Expand Down Expand Up @@ -9517,13 +9538,13 @@ S_compute_collxfrm_coefficients(pTHX)
PL_strxfrm_NUL_replacement = '\0';
PL_strxfrm_max_cp = 0;

/* mem_collxfrm_() is used get the transformation (though here we are
* interested only in its length). It is used because it has the
* intelligence to handle all cases, but to work, it needs some values of
* 'm' and 'b' to get it started. For the purposes of this calculation we
* use a very conservative estimate of 'm' and 'b'. This assumes a weight
* can be multiple bytes, enough to hold any UV on the platform, and there
* are 5 levels, 4 weight bytes, and a trailing NUL. */
/* mem_collxfrm_() is used recursively to get the transformation (though
* here we are interested only in its length). It is used because it has
* the intelligence to handle all cases, but to work, it needs some values
* of 'm' and 'b' to get it started. For the purposes of this calculation
* we use a very conservative estimate of 'm' and 'b'. This assumes a
* weight can be multiple bytes, enough to hold any UV on the platform, and
* there are 5 levels, 4 weight bytes, and a trailing NUL. */
PL_collxfrm_base = 5;
PL_collxfrm_mult = 5 * sizeof(UV);

Expand Down Expand Up @@ -9564,48 +9585,48 @@ S_compute_collxfrm_coefficients(pTHX)
"Disabling locale collation for LC_COLLATE='%s';"
" length for shorter sample=%zu; longer=%zu\n",
PL_collation_name, x_len_shorter, x_len_longer));
return false;
}
else {
SSize_t base; /* Temporary */

/* We have both: m * strlen(longer) + b = x_len_longer
* m * strlen(shorter) + b = x_len_shorter;
* subtracting yields:
* m * (strlen(longer) - strlen(shorter))
* = x_len_longer - x_len_shorter
* But we have set things up so that 'shorter' is 1 byte smaller than
* 'longer'. Hence:
* m = x_len_longer - x_len_shorter
*
* But if something went wrong, make sure the multiplier is at least 1.
*/
if (x_len_longer > x_len_shorter) {
PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
}
else {
PL_collxfrm_mult = 1;
}

/* mx + b = len
* so: b = len - mx
* but in case something has gone wrong, make sure it is non-negative
* */
base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
if (base < 0) {
base = 0;
}
SSize_t base; /* Temporary */

/* Add 1 for the trailing NUL */
PL_collxfrm_base = base + 1;
/* We have both: m * strlen(longer) + b = x_len_longer
* m * strlen(shorter) + b = x_len_shorter;
* subtracting yields:
* m * (strlen(longer) - strlen(shorter))
* = x_len_longer - x_len_shorter
* But we have set things up so that 'shorter' is 1 byte smaller than
* 'longer'. Hence:
* m = x_len_longer - x_len_shorter
*
* But if something went wrong, make sure the multiplier is at least 1.
*/
if (x_len_longer > x_len_shorter) {
PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
}
else {
PL_collxfrm_mult = 1;
}

/* mx + b = len
* so: b = len - mx
* but in case something has gone wrong, make sure it is non-negative */
base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
if (base < 0) {
base = 0;
}

/* Add 1 for the trailing NUL */
PL_collxfrm_base = base + 1;

DEBUG_L(PerlIO_printf(Perl_debug_log,
"?UTF-8 locale=%d; x_len_shorter=%zu, "
"x_len_longer=%zu,"
" collate multipler=%zu, collate base=%zu\n",
PL_in_utf8_COLLATE_locale,
x_len_shorter, x_len_longer,
PL_collxfrm_mult, PL_collxfrm_base));
return true;
}

char *
Expand Down Expand Up @@ -9663,12 +9684,14 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
if (PL_collxfrm_base != 0) { /* bad collation => skip */
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: locale's collation is defective\n"));
goto bad;
goto bad_no_strxfrm;
}

/* (mult, base) == (0,0) means we need to calculate mult and base
* before proceeding */
S_compute_collxfrm_coefficients(aTHX);
if (! S_compute_collxfrm_coefficients(aTHX)) {
return NULL; /* locale collation not sane */
}
}

/* Replace any embedded NULs with the control that sorts before any others.
Expand Down Expand Up @@ -9741,7 +9764,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
/* Create a 1-char string of the current code point */
cur_source[0] = (char) j;

/* Then transform it */
/* Then transform it using a recursive call */
x = mem_collxfrm_(cur_source, trial_len, &x_len,
0 /* The string is not in UTF-8 */);

Expand Down Expand Up @@ -9789,7 +9812,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: Couldn't find any character to replace"
" embedded NULs in locale %s with", PL_collation_name));
goto bad;
goto bad_no_strxfrm;
}

DEBUG_L(PerlIO_printf(Perl_debug_log,
Expand Down Expand Up @@ -9901,7 +9924,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
/* Create a 1-char string of the current code point */
cur_source[0] = (char) j;

/* Then transform it */
/* Then transform it (recursively) */
x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);

/* If something went wrong (which it shouldn't), just
Expand Down Expand Up @@ -9930,7 +9953,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
"mem_collxfrm_: Couldn't find any character to"
" replace above-Latin1 chars in locale %s with",
PL_collation_name));
goto bad;
goto bad_no_strxfrm;
}

DEBUG_L(PerlIO_printf(Perl_debug_log,
Expand Down Expand Up @@ -10090,13 +10113,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
if (UNLIKELY(! xbuf)) {
DEBUG_L(PerlIO_printf(Perl_debug_log,
"mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
goto bad;
goto bad_no_strxfrm;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a potential memory leak to me. At this point, sans_highs, sans_nuls, and free_me may have been allocated, but bad_no_strxfrm skips over the free() calls.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow I got it into my head that that label had code to do the necessary cleanup, and was blind to the fact that it didn't. But with the latest push, it should now.

}

/* Store the collation id */
*(PERL_UINTMAX_T *)xbuf = PL_collation_ix;

# define CLEANUP_STRXFRM_COMMON \
# define CLEANUP_NON_STRXFRM \
STMT_START { \
Safefree(free_me); \
Safefree(sans_nuls); \
Expand All @@ -10119,7 +10142,6 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
STMT_START { \
if (constructed_locale != (locale_t) 0) \
freelocale(constructed_locale); \
CLEANUP_STRXFRM_COMMON; \
} STMT_END
# else
# define my_strxfrm(dest, src, n) strxfrm(dest, src, n)
Expand All @@ -10128,12 +10150,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);

# define CLEANUP_STRXFRM \
STMT_START { \
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \
CLEANUP_STRXFRM_COMMON; \
} STMT_END
# else
# define CLEANUP_STRXFRM CLEANUP_STRXFRM_COMMON
restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
# endif
# endif

Expand Down Expand Up @@ -10203,7 +10220,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
PL_collxfrm_mult = new_m;
PL_collxfrm_base = 1; /* +1 For trailing NUL */
computed_guess = PL_collxfrm_base
+ (PL_collxfrm_mult * length_in_chars);
+ (PL_collxfrm_mult * length_in_chars);
if (computed_guess < needed) {
PL_collxfrm_base += needed - computed_guess;
}
Expand Down Expand Up @@ -10275,6 +10292,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,

DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
CLEANUP_STRXFRM;
CLEANUP_NON_STRXFRM;

/* Free up unneeded space; retain enough for trailing NUL */
Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
Expand All @@ -10286,9 +10304,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
CLEANUP_STRXFRM;

bad_no_strxfrm: /* Found a problem before strxfrm() got called */
DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));

Safefree(xbuf);
*xlen = 0;

CLEANUP_NON_STRXFRM;
return NULL;
}

Expand Down
Loading