utf8_to_bytes_: Calculate needed malloc size

khwilliamson · khwilliamson · commit 8c15ff3a55ae · 2024-11-28T07:55:45.000-07:00
Prior to this commit, the size malloced was simply the length of the
input string, which is the worst-case scenario.  This commit changes
that, so that the new pass through the input (introduced in the previous
commit) also calculates the length actually needed.

The additional cost of doing this is minimal.  The benefit is a smaller
allocation for a very long string containing many convertible sequences,
since each such sequence is one byte shorter in the result.
diff --git a/utf8.c b/utf8.c
@@ -2404,11 +2404,12 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
     const U8 * const send = s0 + *lenp;
     U8 * s = first_variant;
     Size_t invariant_length = first_variant - s0;
+    Size_t variant_count = 0;
 
 #ifndef EBCDIC      /* The below relies on the bit patterns of UTF-8 */
 
     /* Do a first pass through the string to see if it actually is translatable
-     * into bytes.  On long strings this is
+     * into bytes, and if so, how big the result is.  On long strings this is
      * done a word at a time, so is relatively quick. (There is some
      * start-up/tear-down overhead with the per-word algorithm, so no real gain
      * unless the remaining portion of the string is long enough.  The current
@@ -2435,8 +2436,11 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
                     return false;
                 }
+
                 s++;
+                variant_count++;
             }
+
             s++;
         }
 
@@ -2486,6 +2490,12 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
                 return false;
             }
 
+            /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
+               explanation of how this works */
+            variant_count +=
+                (Size_t) (((((start_bytes)) >> 7) * PERL_COUNT_MULTIPLIER)
+                                      >> ((PERL_WORDSIZE - 1) * CHARBITS));
+
             s += PERL_WORDSIZE;
         } while (s + PERL_WORDSIZE <= send);
 
@@ -2494,6 +2504,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
          * first byte of the character  */
         if (s > first_variant && UTF8_IS_START(*(s-1))) {
             s--;
+            variant_count--;
         }
     }
 
@@ -2505,16 +2516,20 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
                 return false;
             }
             s++;
+            variant_count++;
         }
         s++;
     }
 
+    /* Here, we passed the tests above and know how many UTF-8 variant
+     * characters there are, which allows us to calculate the size to malloc
+     * for the non-destructive case */
     U8 *d0;
     if (result_as == PL_utf8_to_bytes_overwrite) {
         d0 = s0;
     }
     else {
-        Newx(d0, *lenp + 1, U8);
+        Newx(d0, (*lenp) + 1 - variant_count, U8);
         Copy(s0, d0, invariant_length, U8);
     }
 

Original file line number	Diff line number	Diff line change
`@@ -2404,11 +2404,12 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,`
`2404`	`2404`	`const U8 * const send = s0 + *lenp;`
`2405`	`2405`	`U8 * s = first_variant;`
`2406`	`2406`	`Size_t invariant_length = first_variant - s0;`
	`2407`	`+ Size_t variant_count = 0;`
`2407`	`2408`
`2408`	`2409`	`#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */`
`2409`	`2410`
`2410`	`2411`	`/* Do a first pass through the string to see if it actually is translatable`
`2411`		`- * into bytes. On long strings this is`
	`2412`	`+ * into bytes, and if so, how big the result is. On long strings this is`
`2412`	`2413`	`* done a word at a time, so is relatively quick. (There is some`
`2413`	`2414`	`* start-up/tear-down overhead with the per-word algorithm, so no real gain`
`2414`	`2415`	`* unless the remaining portion of the string is long enough. The current`
`@@ -2435,8 +2436,11 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,`
`2435`	`2436`	`if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {`
`2436`	`2437`	`return false;`
`2437`	`2438`	`}`
	`2439`	`+`
`2438`	`2440`	`s++;`
	`2441`	`+ variant_count++;`
`2439`	`2442`	`}`
	`2443`	`+`
`2440`	`2444`	`s++;`
`2441`	`2445`	`}`
`2442`	`2446`
`@@ -2486,6 +2490,12 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,`
`2486`	`2490`	`return false;`
`2487`	`2491`	`}`
`2488`	`2492`
	`2493`	`+ /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an`
	`2494`	`+ explanation of how this works */`
	`2495`	`+ variant_count +=`
	`2496`	`+ (Size_t) (((((start_bytes)) >> 7) * PERL_COUNT_MULTIPLIER)`
	`2497`	`+ >> ((PERL_WORDSIZE - 1) * CHARBITS));`
	`2498`	`+`
`2489`	`2499`	`s += PERL_WORDSIZE;`
`2490`	`2500`	`} while (s + PERL_WORDSIZE <= send);`
`2491`	`2501`
`@@ -2494,6 +2504,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,`
`2494`	`2504`	`* first byte of the character */`
`2495`	`2505`	`if (s > first_variant && UTF8_IS_START(*(s-1))) {`
`2496`	`2506`	`s--;`
	`2507`	`+ variant_count--;`
`2497`	`2508`	`}`
`2498`	`2509`	`}`
`2499`	`2510`
`@@ -2505,16 +2516,20 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,`
`2505`	`2516`	`return false;`
`2506`	`2517`	`}`
`2507`	`2518`	`s++;`
	`2519`	`+ variant_count++;`
`2508`	`2520`	`}`
`2509`	`2521`	`s++;`
`2510`	`2522`	`}`
`2511`	`2523`
	`2524`	`+ /* Here, we passed the tests above and know how many UTF-8 variant`
	`2525`	`+ * characters there are, which allows us to calculate the size to malloc`
	`2526`	`+ * for the non-destructive case */`
`2512`	`2527`	`U8 *d0;`
`2513`	`2528`	`if (result_as == PL_utf8_to_bytes_overwrite) {`
`2514`	`2529`	`d0 = s0;`
`2515`	`2530`	`}`
`2516`	`2531`	`else {`
`2517`		`- Newx(d0, *lenp + 1, U8);`
	`2532`	`+ Newx(d0, (*lenp) + 1 - variant_count, U8);`
`2518`	`2533`	`Copy(s0, d0, invariant_length, U8);`
`2519`	`2534`	`}`
`2520`	`2535`