Add utf8_to_uv_msgs()

khwilliamson · khwilliamson · commit 0187f3d9c375 · 2024-12-02T10:48:02.000-07:00
This is the first of several functions named in the style of
utf8_to_uv(), which are designed to be used instead of the problematic
current ones named like utf8_to_uvchr().

The previous ones basically throw away crucial information in their
returns upon failure, creating hassles for the caller.  With them, it is
hard to recover from malformed input and continue parsing, which is what
modern UTF-8 handlers have settled on doing.

Originally I planned to replace just the most problematic one,
utf8_to_uvchr_buf(), but I realized that each level threw away
information, so it would be better to start at the base level one, which
utf8_to_uvchr_buf() eventually calls with a bunch of 0 parameters.  The
previous functions all had to disambiguate failure returns.  This stops
that at the root.

The new series all return a boolean as to their success, with a
consistent API throughout.  The old series had one outlier, again
utf8_to_uvchr_buf(), which had a different calling convention and
returns.

The basic logic in the base level function, which this commit handles,
was sound.  It just failed to return relevant information upon failure.

The new API has somewhat different formal parameter names and uses
Size_t instead of STRLEN for one of the parameters.  It also passes the
end of string position instead of a length.  A length is problematic
when it could go negative: it instead becomes a huge positive number.

The old base function now merely calls the new one, and throws away the
relevant information, as it always has.
diff --git a/embed.fnc b/embed.fnc
@@ -3684,13 +3684,6 @@ ATdip	|UV	|utf8n_to_uvchr_msgs					\
 				|const U32 flags			\
 				|NULLOK U32 *errors			\
 				|NULLOK AV **msgs
-CTp	|UV	|_utf8n_to_uvchr_msgs_helper				\
-				|NN const U8 *s 			\
-				|STRLEN curlen				\
-				|NULLOK STRLEN *retlen			\
-				|const U32 flags			\
-				|NULLOK U32 *errors			\
-				|NULLOK AV **msgs
 CDbdp	|UV	|utf8n_to_uvuni |NN const U8 *s 			\
 				|STRLEN curlen				\
 				|NULLOK STRLEN *retlen			\
@@ -3740,6 +3733,21 @@ AMdip	|UV	|utf8_to_uvchr_buf					\
 				|NN const U8 *s 			\
 				|NN const U8 *send			\
 				|NULLOK STRLEN *retlen
+ATip	|bool	|utf8_to_uv_msgs|NN const U8 * const s0 		\
+				|NN const U8 *e 			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flags			\
+				|NULLOK U32 *errors			\
+				|NULLOK AV **msgs
+CTp	|bool	|utf8_to_uv_msgs_helper_				\
+				|NN const U8 * const s0 		\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flags			\
+				|NULLOK U32 *errors			\
+				|NULLOK AV **msgs
 CDbdp	|UV	|utf8_to_uvuni	|NN const U8 *s 			\
 				|NULLOK STRLEN *retlen
 : Used in perly.y
diff --git a/embed.h b/embed.h
@@ -125,7 +125,6 @@
 # define _to_utf8_lower_flags(a,b,c,d,e)        Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
 # define _to_utf8_title_flags(a,b,c,d,e)        Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
 # define _to_utf8_upper_flags(a,b,c,d,e)        Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
-# define _utf8n_to_uvchr_msgs_helper            Perl__utf8n_to_uvchr_msgs_helper
 # define amagic_call(a,b,c,d)                   Perl_amagic_call(aTHX_ a,b,c,d)
 # define amagic_deref_call(a,b)                 Perl_amagic_deref_call(aTHX_ a,b)
 # define apply_attrs_string(a,b,c,d)            Perl_apply_attrs_string(aTHX_ a,b,c,d)
@@ -863,6 +862,8 @@
 # define utf8_to_bytes_new_pv(a,b,c)            Perl_utf8_to_bytes_new_pv(aTHX,a,b,c)
 # define utf8_to_bytes_overwrite(a,b)           Perl_utf8_to_bytes_overwrite(aTHX,a,b)
 # define utf8_to_bytes_temp_pv(a,b)             Perl_utf8_to_bytes_temp_pv(aTHX,a,b)
+# define utf8_to_uv_msgs                        Perl_utf8_to_uv_msgs
+# define utf8_to_uv_msgs_helper_                Perl_utf8_to_uv_msgs_helper_
 # define utf8n_to_uvchr                         Perl_utf8n_to_uvchr
 # define utf8n_to_uvchr_error                   Perl_utf8n_to_uvchr_error
 # define utf8n_to_uvchr_msgs                    Perl_utf8n_to_uvchr_msgs
diff --git a/inline.h b/inline.h
@@ -3048,21 +3048,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
 }
 
-PERL_STATIC_INLINE UV
-Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
-                         STRLEN curlen,
-                         STRLEN *retlen,
-                         const U32 flags,
-                         U32 * errors,
-                         AV ** msgs)
+PERL_STATIC_INLINE bool
+Perl_utf8_to_uv_msgs(const U8 * const s0,
+                     const U8 * const e,
+                     UV * cp_p,
+                     Size_t *advance_p,
+                     const U32 flags,
+                     U32 * errors,
+                     AV ** msgs)
 {
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+    PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS;
 
-    /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
-     * simple cases, and, if necessary calls a helper function to deal with the
-     * more complex ones.  Almost all well-formed non-problematic code points
-     * are considered simple, so that it's unlikely that the helper function
-     * will need to be called. */
+    /* This is the inlined portion of utf8_to_uv_msgs.  It handles the simple
+     * cases, and, if necessary calls a helper function to deal with the more
+     * complex ones.  Almost all well-formed non-problematic code points are
+     * considered simple, so that it's unlikely that the helper function will
+     * need to be called. */
 
     /* Assume that isn't malformed; the vast majority of calls won't be */
     if (errors) {
@@ -3075,25 +3076,25 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
 
     /* No calls from core pass in an empty string; non-core need a check */
 #ifdef PERL_CORE
-    assert(curlen > 0);
+    assert(e > s0);
 #else
-    if (LIKELY(curlen > 0))
+    if (LIKELY(e > s0))
 #endif
 
     {
         /* UTF-8 invariants are returned unchanged.  The code below is quite
          * capable of handling this, but this shortcuts this very common case
          * */
         if (UTF8_IS_INVARIANT(*s0)) {
-            if (retlen) {
-                *retlen = 1;
+            if (advance_p) {
+                *advance_p = 1;
             }
 
-            return *s0;
+            *cp_p = *s0;
+            return true;
         }
 
         const U8 * s = s0;
-        const U8 * send = s + curlen;
 
         /* This dfa is fast.  If it accepts the input, it was for a
          * well-formed, non-problematic code point, which can be returned
@@ -3116,27 +3117,52 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
         PERL_UINT_FAST16_T state = PL_strict_utf8_dfa_tab[256 + type];
         UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
 
-        while (state > 1 && ++s < send) {
+        while (state > 1 && ++s < e) {
             type  = PL_strict_utf8_dfa_tab[*s];
             state = PL_strict_utf8_dfa_tab[256 + state + type];
 
             uv = UTF8_ACCUMULATE(uv, *s);
         }
 
         if (LIKELY(state == 0)) {
-            if (retlen) {
-                *retlen = s - s0 + 1;
+            if (advance_p) {
+                *advance_p = s - s0 + 1;
             }
 
-            return UNI_TO_NATIVE(uv);
+            *cp_p = UNI_TO_NATIVE(uv);
+            return true;
         }
     }
 
     /* Here is potentially problematic.  Use the full mechanism */
-    return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
-                                       errors, msgs);
+    return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs);
+}
+
+PERL_STATIC_INLINE UV
+Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
+                         STRLEN curlen,
+                         STRLEN *retlen,
+                         const U32 flags,
+                         U32 * errors,
+                         AV ** msgs)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+
+    UV cp;
+    if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors,
+                                                                        msgs)))
+    {
+        return cp;
+    }
+
+    if ((flags & UTF8_CHECK_ONLY) && retlen) {
+        *retlen = ((STRLEN) -1);
+    }
+
+    return 0;
 }
 
+
 PERL_STATIC_INLINE UV
 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 {
diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -1318,28 +1318,30 @@ The caller, of course, is responsible for freeing any returned AV.
 =cut
 */
 
-UV
-Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
-                               STRLEN curlen,
-                               STRLEN *retlen,
-                               const U32 flags,
-                               U32 * errors,
-                               AV ** msgs)
+bool
+Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
+                             const U8 * const e,
+                             UV *cp_p,
+                             Size_t *advance_p,
+                             const U32 flags,
+                             U32 * errors,
+                             AV ** msgs)
 {
-    const U8 * const s0 = s;
-    const U8 * send = s0 + curlen;
+    PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_;
+
+    const U8 * s = s0;
+    const U8 * send = e;
+    SSize_t curlen = send - s0;
     U32 possible_problems;  /* A bit is set here for each potential problem
                                found as we go along */
     UV uv;
-    STRLEN expectlen;     /* How long should this sequence be? */
-    STRLEN avail_len;     /* When input is too short, gives what that is */
+    SSize_t expectlen;    /* How long should this sequence be? */
+    SSize_t avail_len;    /* When input is too short, gives what that is */
     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
                              gets set and discarded */
 
     dTHX;
 
-    PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
-
     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
@@ -1356,13 +1358,14 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     /* Each of the affected Hanguls starts with \xED */
 
     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
-        if (retlen) {
-            *retlen = 3;
+        if (advance_p) {
+            *advance_p = 3;
         }
 
-        return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
-             | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
-             |  (s0[2] & UTF_CONTINUATION_MASK);
+        *cp_p = ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+            | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+            |  (s0[2] & UTF_CONTINUATION_MASK);
+        return true;
     }
 
     /* In conjunction with the exhaustive tests that can be enabled in
@@ -1403,7 +1406,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      * We also should not consume too few bytes, otherwise someone could inject
      * things.  For example, an input could be deliberately designed to
      * overflow, and if this code bailed out immediately upon discovering that,
-     * returning to the caller C<*retlen> pointing to the very next byte (one
+     * returning to the caller C<*advance_p> pointing to the very next byte (one
      * which is actually part of the overflowing sequence), that could look
      * legitimate to the caller, which could discard the initial partial
      * sequence and process the rest, inappropriately.
@@ -1415,7 +1418,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      * allowed one, we could allow in something that shouldn't have been.
      */
 
-    if (UNLIKELY(curlen == 0)) {
+    if (UNLIKELY(curlen <= 0)) {
         possible_problems |= UTF8_GOT_EMPTY;
         curlen = 0;
         uv = UNICODE_REPLACEMENT;
@@ -1433,8 +1436,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      * function will be for, has this expected length.  For efficiency, set
      * things up here to return it.  It will be overridden only in those rare
      * cases where a malformation is found */
-    if (retlen) {
-        *retlen = expectlen;
+    if (advance_p) {
+        *advance_p = expectlen;
     }
 
     /* A continuation character can't start a valid sequence */
@@ -1606,7 +1609,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
         }
     }
 
-  ready_to_handle_errors:
+  ready_to_handle_errors: ;
 
     /* At this point:
      * curlen               contains the number of bytes in the sequence that
@@ -1629,6 +1632,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      *                      us should be in it, but no further than s0 +
      *                      avail_len
      */
+    bool success = true;
 
     if (UNLIKELY(possible_problems)) {
         bool disallowed = FALSE;
@@ -2047,19 +2051,18 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
         /* Since there was a possible problem, the returned length may need to
          * be changed from the one stored at the beginning of this function.
          * Instead of trying to figure out if it has changed, just do it. */
-        if (retlen) {
-            *retlen = curlen;
+        if (advance_p) {
+            *advance_p = curlen;
         }
 
         if (disallowed) {
-            if (flags & UTF8_CHECK_ONLY && retlen) {
-                *retlen = ((STRLEN) -1);
-            }
-            return 0;
+            success = false;
+            uv = UNICODE_REPLACEMENT;
         }
     }
 
-    return UNI_TO_NATIVE(uv);
+    *cp_p = UNI_TO_NATIVE(uv);
+    return success;
 }
 
 /*