Perl · khwilliamson · Apr 23, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 20, 2025
diff --git a/locale.c b/locale.c
@@ -9451,11 +9451,32 @@ Perl_init_i18nl10n(pTHX_ int printwarn)
 #undef GET_DESCRIPTION
 #ifdef USE_LOCALE_COLLATE
 
-STATIC void
+STATIC bool
 S_compute_collxfrm_coefficients(pTHX)
 {
-
-    /* A locale collation definition includes primary, secondary, tertiary,
+    /* This is called from mem_collxfrm() the first time the latter is called
+     * on the current locale to do initialization for it.
+     *
+     * This returns true and initializes the coefficients for a linear equation
+     * that, given a string of some length, predicts how much memory it will
+     * take to hold the result of calling mem_collxfrm() on that string.  The
+     * equation is of the form:
+     *      m * length + b
+     * where m = PL_collxfrm_mult and b = PL_collxfrm_base
+     *
+     * It returns false if the locale does not appear to be sane.
+     *
+     * The prediction is just an educated guess to save time and,
+     * mem_collxrfm() may adjust it based on experience with strings it
+     * encounters.
+     *
+     * This function also:
+     *      sets 'PL_in_utf8_COLLATE_locale' to indicate if the locale is a
+     *          UTF-8 one
+     *      initializes 'PL_strxfrm_NUL_replacement' to NUL
+     *      initializes 'PL_strxfrm_max_cp' = 0;
+     *
+     * A locale collation definition includes primary, secondary, tertiary,
      * etc. weights for each character.  To sort, the primary weights are used,
      * and only if they compare equal, then the secondary weights are used, and
      * only if they compare equal, then the tertiary, etc.
@@ -9517,13 +9538,13 @@ S_compute_collxfrm_coefficients(pTHX)
     PL_strxfrm_NUL_replacement = '\0';
     PL_strxfrm_max_cp = 0;
 
-    /* mem_collxfrm_() is used get the transformation (though here we are
-     * interested only in its length).  It is used because it has the
-     * intelligence to handle all cases, but to work, it needs some values of
-     * 'm' and 'b' to get it started.  For the purposes of this calculation we
-     * use a very conservative estimate of 'm' and 'b'.  This assumes a weight
-     * can be multiple bytes, enough to hold any UV on the platform, and there
-     * are 5 levels, 4 weight bytes, and a trailing NUL.  */
+    /* mem_collxfrm_() is used recursively to get the transformation (though
+     * here we are interested only in its length).  It is used because it has
+     * the intelligence to handle all cases, but to work, it needs some values
+     * of 'm' and 'b' to get it started.  For the purposes of this calculation
+     * we use a very conservative estimate of 'm' and 'b'.  This assumes a
+     * weight can be multiple bytes, enough to hold any UV on the platform, and
+     * there are 5 levels, 4 weight bytes, and a trailing NUL.  */
     PL_collxfrm_base = 5;
     PL_collxfrm_mult = 5 * sizeof(UV);
 
@@ -9564,48 +9585,48 @@ S_compute_collxfrm_coefficients(pTHX)
                 "Disabling locale collation for LC_COLLATE='%s';"
                 " length for shorter sample=%zu; longer=%zu\n",
                 PL_collation_name, x_len_shorter, x_len_longer));
+        return false;
     }
-    else {
-        SSize_t base;       /* Temporary */
-
-        /* We have both: m * strlen(longer)  + b = x_len_longer
-         *               m * strlen(shorter) + b = x_len_shorter;
-         * subtracting yields:
-         *          m * (strlen(longer) - strlen(shorter))
-         *                             = x_len_longer - x_len_shorter
-         * But we have set things up so that 'shorter' is 1 byte smaller than
-         * 'longer'.  Hence:
-         *          m = x_len_longer - x_len_shorter
-         *
-         * But if something went wrong, make sure the multiplier is at least 1.
-         */
-        if (x_len_longer > x_len_shorter) {
-            PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
-        }
-        else {
-            PL_collxfrm_mult = 1;
-        }
 
-        /*     mx + b = len
-         * so:      b = len - mx
-         * but in case something has gone wrong, make sure it is non-negative
-         * */
-        base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
-        if (base < 0) {
-            base = 0;
-        }
+    SSize_t base;       /* Temporary */
 
-        /* Add 1 for the trailing NUL */
-        PL_collxfrm_base = base + 1;
+    /* We have both: m * strlen(longer)  + b = x_len_longer
+     *               m * strlen(shorter) + b = x_len_shorter;
+     * subtracting yields:
+     *          m * (strlen(longer) - strlen(shorter))
+     *                              = x_len_longer - x_len_shorter
+     * But we have set things up so that 'shorter' is 1 byte smaller than
+     * 'longer'.  Hence:
+     *          m = x_len_longer - x_len_shorter
+     *
+     * But if something went wrong, make sure the multiplier is at least 1.
+     */
+    if (x_len_longer > x_len_shorter) {
+        PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter;
+    }
+    else {
+        PL_collxfrm_mult = 1;
     }
 
+    /*     mx + b = len
+     * so:      b = len - mx
+     * but in case something has gone wrong, make sure it is non-negative */
+    base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1);
+    if (base < 0) {
+        base = 0;
+    }
+
+    /* Add 1 for the trailing NUL */
+    PL_collxfrm_base = base + 1;
+
     DEBUG_L(PerlIO_printf(Perl_debug_log,
                           "?UTF-8 locale=%d; x_len_shorter=%zu, "
                           "x_len_longer=%zu,"
                           " collate multipler=%zu, collate base=%zu\n",
                           PL_in_utf8_COLLATE_locale,
                           x_len_shorter, x_len_longer,
                           PL_collxfrm_mult, PL_collxfrm_base));
+    return true;
 }
 
 char *
@@ -9663,12 +9684,14 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
         if (PL_collxfrm_base != 0) { /* bad collation => skip */
             DEBUG_L(PerlIO_printf(Perl_debug_log,
                           "mem_collxfrm_: locale's collation is defective\n"));
-            goto bad;
+            goto bad_no_strxfrm;
         }
 
         /* (mult, base) == (0,0) means we need to calculate mult and base
          * before proceeding */
-        S_compute_collxfrm_coefficients(aTHX);
+        if (! S_compute_collxfrm_coefficients(aTHX)) {
+            return NULL;    /* locale collation not sane */
+        }
     }
 
     /* Replace any embedded NULs with the control that sorts before any others.
@@ -9741,7 +9764,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                     /* Create a 1-char string of the current code point */
                     cur_source[0] = (char) j;
 
-                    /* Then transform it */
+                    /* Then transform it using a recursive call */
                     x = mem_collxfrm_(cur_source, trial_len, &x_len,
                                       0 /* The string is not in UTF-8 */);
 
@@ -9789,7 +9812,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                 DEBUG_L(PerlIO_printf(Perl_debug_log,
                     "mem_collxfrm_: Couldn't find any character to replace"
                     " embedded NULs in locale %s with", PL_collation_name));
-                goto bad;
+                goto bad_no_strxfrm;
             }
 
             DEBUG_L(PerlIO_printf(Perl_debug_log,
@@ -9901,7 +9924,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                     /* Create a 1-char string of the current code point */
                     cur_source[0] = (char) j;
 
-                    /* Then transform it */
+                    /* Then transform it (recursively) */
                     x = mem_collxfrm_(cur_source, 1, &x_len, FALSE);
 
                     /* If something went wrong (which it shouldn't), just
@@ -9930,7 +9953,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                         "mem_collxfrm_: Couldn't find any character to"
                         " replace above-Latin1 chars in locale %s with",
                         PL_collation_name));
-                    goto bad;
+                    goto bad_no_strxfrm;
                 }
 
                 DEBUG_L(PerlIO_printf(Perl_debug_log,
@@ -10090,13 +10113,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     if (UNLIKELY(! xbuf)) {
         DEBUG_L(PerlIO_printf(Perl_debug_log,
                       "mem_collxfrm_: Couldn't malloc %zu bytes\n", xAlloc));
-        goto bad;
+        goto bad_no_strxfrm;
     }
 
     /* Store the collation id */
     *(PERL_UINTMAX_T *)xbuf = PL_collation_ix;
 
-#  define CLEANUP_STRXFRM_COMMON                                        \
+#  define CLEANUP_NON_STRXFRM                                           \
         STMT_START {                                                    \
             Safefree(free_me);                                          \
             Safefree(sans_nuls);                                        \
@@ -10119,7 +10142,6 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
         STMT_START {                                                    \
             if (constructed_locale != (locale_t) 0)                     \
                 freelocale(constructed_locale);                         \
-            CLEANUP_STRXFRM_COMMON;                                     \
         } STMT_END
 #  else
 #    define my_strxfrm(dest, src, n)  strxfrm(dest, src, n)
@@ -10128,12 +10150,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name);
 
 #      define CLEANUP_STRXFRM                                           \
-        STMT_START {                                                    \
-                restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);  \
-                CLEANUP_STRXFRM_COMMON;                                 \
-        } STMT_END
-#    else
-#      define CLEANUP_STRXFRM  CLEANUP_STRXFRM_COMMON
+            restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale);
 #    endif
 #  endif
 
@@ -10203,7 +10220,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
                     PL_collxfrm_mult = new_m;
                     PL_collxfrm_base = 1;   /* +1 For trailing NUL */
                     computed_guess = PL_collxfrm_base
-                                    + (PL_collxfrm_mult * length_in_chars);
+                                   + (PL_collxfrm_mult * length_in_chars);
                     if (computed_guess < needed) {
                         PL_collxfrm_base += needed - computed_guess;
                     }
@@ -10275,6 +10292,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
 
     DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8));
     CLEANUP_STRXFRM;
+    CLEANUP_NON_STRXFRM;
 
     /* Free up unneeded space; retain enough for trailing NUL */
     Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char);
@@ -10286,9 +10304,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
     DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
     CLEANUP_STRXFRM;
 
+  bad_no_strxfrm:   /* Found a problem before strxfrm() got called */
+    DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8));
+
     Safefree(xbuf);
     *xlen = 0;
-
+    CLEANUP_NON_STRXFRM;
     return NULL;
 }