utf8_to_bytes: Refactor loop

khwilliamson · khwilliamson · commit 9d310cca8256 · 2024-11-28T07:55:45.000-07:00
The previous version did not ensure, in all cases, that it wasn't
reading beyond the end of the buffer.  Also, the first pass through the
input string has already ruled out most of the problems it could have.
Thus we don't need the full generality of the macro
UTF8_IS_DOWNGRADEABLE_START here, and dropping it simplifies things.
diff --git a/utf8.c b/utf8.c
@@ -2518,23 +2518,26 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
     U8 * d = s = first_variant;
 
     while (s < send) {
+        U8 c = *s++;
+        if (! UVCHR_IS_INVARIANT(c)) {
+
+            /* Then it is a multi-byte character.  The first pass above
+             * determined that the string contains only invariants, the C2 and
+             * C3 start bytes, and continuation bytes.  The condition above
+             * excluded this from being an invariant.  To be well formed, it
+             * needs to be a start byte followed by a continuation byte. */
+            if (   UNLIKELY(  UTF8_IS_CONTINUATION(c))
+                || UNLIKELY(  s >= send)
+                || UNLIKELY(! UTF8_IS_CONTINUATION(*s)))
+            {
+                goto cant_convert;
+            }
 
-        if (UVCHR_IS_INVARIANT(*s)) {
-            *d++ = *s++;
-            continue;
-        }
-
-        /* Here it is two-byte encoded. */
-        if (   LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
-            && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
-        {
-            U8 first_byte = *s++;
-            *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
+            c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
             s++;
-            continue;
         }
 
-                goto cant_convert;
+        *d++ = c;
     }
 
     /* Success! */