utf8_to_bytes_: Add non-destructive write option

khwilliamson · khwilliamson · commit 0a5edc8f161a · 2024-11-28T07:55:45.000-07:00
This allows the function either to overwrite its input, as before, or
instead to return the result in newly allocated memory.  bytes_from_utf8()
is changed to use this new capability instead of being a near duplication
of the core code of this function.

Prior to this commit, bytes_from_utf8() just allocated memory the size
of the original string, and started copying into it.  When it came to a
sequence that wasn't convertible, it stopped, and freed up the copy.
The new behavior checks that the string is convertible before doing the
malloc.  The advantage is that there is no malloc without first being sure
it will be useful; the disadvantage is that there is an extra pass through
the input string, though that pass is done per-word.

The next commit will introduce another advantage.

Thanks to Tony Cook for the 'free_me' idea
diff --git a/embed.fnc b/embed.fnc
@@ -3700,7 +3700,12 @@ Adpx	|U8 *	|utf8_to_bytes	|NN U8 *s				\
 				|NN STRLEN *lenp
 Cp	|bool	|utf8_to_bytes_ |NN U8 **s_ptr				\
 				|NN STRLEN *lenp			\
+				|NN U8 **free_me			\
 				|Perl_utf8_to_bytes_arg result_as
+Admp	|bool	|utf8_to_bytes_new_pv					\
+				|NN U8 const **s_ptr			\
+				|NN STRLEN *lenp			\
+				|NN U8 **free_me
 Admp	|bool	|utf8_to_bytes_overwrite				\
 				|NN U8 **s_ptr				\
 				|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
@@ -859,7 +859,8 @@
 # define utf8_hop_safe                          Perl_utf8_hop_safe
 # define utf8_length(a,b)                       Perl_utf8_length(aTHX_ a,b)
 # define utf8_to_bytes(a,b)                     Perl_utf8_to_bytes(aTHX_ a,b)
-# define utf8_to_bytes_(a,b,c)                  Perl_utf8_to_bytes_(aTHX_ a,b,c)
+# define utf8_to_bytes_(a,b,c,d)                Perl_utf8_to_bytes_(aTHX_ a,b,c,d)
+# define utf8_to_bytes_new_pv(a,b,c)            Perl_utf8_to_bytes_new_pv(aTHX,a,b,c)
 # define utf8_to_bytes_overwrite(a,b)           Perl_utf8_to_bytes_overwrite(aTHX,a,b)
 # define utf8_to_uvchr_buf_helper(a,b,c)        Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
 # define utf8n_to_uvchr_msgs                    Perl_utf8n_to_uvchr_msgs
diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -2378,12 +2378,16 @@ If you need a copy of the string, see L</bytes_from_utf8>.
 */
 
 bool
-Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
+Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, U8 ** free_me,
                           Perl_utf8_to_bytes_arg result_as)
 {
     PERL_ARGS_ASSERT_UTF8_TO_BYTES_;
     PERL_UNUSED_CONTEXT;
 
+    if (result_as == PL_utf8_to_bytes_new_memory) {
+        *free_me = NULL;
+    }
+
     U8 * first_variant;
 
     /* This is a no-op if no variants at all in the input */
@@ -2505,7 +2509,15 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
         s++;
     }
 
-    U8 *d0 = s0;
+    U8 *d0;
+    if (result_as == PL_utf8_to_bytes_overwrite) {
+        d0 = s0;
+    }
+    else {
+        Newx(d0, *lenp + 1, U8);
+        Copy(s0, d0, invariant_length, U8);
+    }
+
     U8 * d = d0 + invariant_length;
 
     /* For the cases where the per-word algorithm wasn't used, everything is
@@ -2546,6 +2558,10 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
     *d = '\0';
     *lenp = d - d0;
 
+    if (result_as != PL_utf8_to_bytes_overwrite) {
+        *s_ptr = *free_me = d0;
+    }
+
     return true;
 
   cant_convert: ;
@@ -2556,10 +2572,16 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
      * text are C2 and C3, but didn't examine it to make sure each of those was
      * followed by precisely one continuation, for example.
      *
-     * We have to undo all we've done before, back down to the first UTF-8
-     * variant.  Note that each 2-byte variant we've done so far (converted to
-     * single byte) slides things to the left one byte, and so we have bytes
-     * that haven't been written over.
+     * If the result is in newly allocated memory, just free it */
+    if (result_as != PL_utf8_to_bytes_overwrite) {
+        Safefree(d0);
+        return false;
+    }
+
+    /* Otherwise, we have to undo all we've done before, back down to the first
+     * UTF-8 variant.  Note that each 2-byte variant we've done so far
+     * (converted to single byte) slides things to the left one byte, and so we
+     * have bytes that haven't been written over.
      *
      * Here, 'd' points to the next position to overwrite, and 's' points to
      * the first invalid byte.  That means 'd's contents haven't been changed
@@ -2641,57 +2663,25 @@ U8 *
 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 {
     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
-    PERL_UNUSED_CONTEXT;
-
-    if (! *is_utf8p) {
-        return (U8 *) s;
-    }
-
-    const U8 * const s0 = s;
-    const U8 * const send = s + *lenp;
-    const U8 * first_variant;
-
-    /* The initial portion of 's' that consists of invariants can be Copied
-     * as-is.  If it is entirely invariant, the whole thing can be Copied. */
-    if (is_utf8_invariant_string_loc(s, *lenp, &first_variant)) {
-        first_variant = send;
-    }
-
-    U8 *d;
-    Newx(d, (*lenp) + 1, U8);
-    Copy(s, d, first_variant - s, U8);
-
-    U8 *converted_start = d;
-    d += first_variant - s;
-    s = first_variant;
-
-    while (s < send) {
-        U8 c = *s++;
-        if (! UTF8_IS_INVARIANT(c)) {
 
-            /* Then it is multi-byte encoded.  If the code point is above 0xFF,
-             * have to stop now */
-            if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
-                    Safefree(converted_start);
-                    return (U8 *) s0;
+    if (*is_utf8p) {
+        U8 * new_memory = NULL;
+        if (utf8_to_bytes_new_pv(&s, lenp, &new_memory)) {
+            *is_utf8p = false;
+
+            /* Our callers are always expecting new memory upon success.  Give
+             * it to them, adding a trailing NUL if not already there */
+            if (new_memory == NULL) {
+                U8 * new_s;
+                Newx(new_s, *lenp + 1, U8);
+                Copy(s, new_s, *lenp, U8);
+                new_s[*lenp] = '\0';
+                s = new_s;
             }
-
-            c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
-            s++;
         }
-        *d++ = c;
     }
 
-    /* Here, converted the whole of the input */
-    *is_utf8p = FALSE;
-
-    *d = '\0';
-    *lenp = d - converted_start;
-
-    /* Trim unused space */
-    Renew(converted_start, *lenp + 1, U8);
-
-    return converted_start;
+    return (U8 *) s;
 }
 
 /*
diff --git a/utf8.h b/utf8.h
@@ -1312,8 +1312,15 @@ typedef enum {
     PL_utf8_to_bytes_use_temporary,
 } Perl_utf8_to_bytes_arg;
 
+/* INT2PTR() is used because this parameter should not be used in this case,
+ * but there is a NN assertion for it.  The dummy value lets that assertion
+ * pass, but still causes a segfault if the parameter wrongly gets used */
 #define Perl_utf8_to_bytes_overwrite(mTHX, s, l)                            \
-        Perl_utf8_to_bytes_(aTHX_ s, l, PL_utf8_to_bytes_overwrite)
+        Perl_utf8_to_bytes_(aTHX_ s, l, INT2PTR(U8 **, 1),                  \
+                                  PL_utf8_to_bytes_overwrite)
+#define Perl_utf8_to_bytes_new_pv(mTHX, s, l, f)                            \
+        Perl_utf8_to_bytes_(aTHX_ (U8 **) s, l, f,                          \
+                                  PL_utf8_to_bytes_new_memory)
 
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */