Unroll valid_utf8_to_uv loop

khwilliamson · khwilliamson · commit 2cb0034ef56d · 2025-09-20T10:21:33.000-06:00
This gives a bit of performance boost in this function that can be
called during pattern matching.

Here are some cachegrind comparisons with blead:

Key:
    Ir   Instruction read
    Dr   Data read
    Dw   Data write
    COND conditional branches
    IND  indirect branches

The numbers represent relative counts per loop iteration, compared to
blead at 100.0%.
Higher is better: for example, using half as many instructions gives 200%,
while using twice as many gives 50%.

               GCC                     CLANG

valid_utf8_to_uv(0x007f), length is 1

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00      100.69        Ir 100.00  99.11
    Dr 100.00      101.47        Dr 100.00  99.74
    Dw 100.00      100.00        Dw 100.00  99.57
  COND 100.00      101.20        COND 100.00 100.00
   IND 100.00      100.00        IND 100.00  94.12

valid_utf8_to_uv(0x07ff), length is 2

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00      100.68        Ir 100.00  99.04
    Dr 100.00      101.47        Dr 100.00  99.74
    Dw 100.00      100.00        Dw 100.00  99.57
  COND 100.00      102.40        COND 100.00 101.23
   IND 100.00      100.00        IND 100.00  94.12

valid_utf8_to_uv(0xfffd), length is 3

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00      100.83        Ir 100.00  99.04
    Dr 100.00      101.47        Dr 100.00  99.75
    Dw 100.00      100.00        Dw 100.00  99.57
  COND 100.00      102.99        COND 100.00 101.84
   IND 100.00      100.00        IND 100.00  94.12

valid_utf8_to_uv(0xffffd), length is 4

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00      100.91        Ir 100.00  99.13
    Dr 100.00      101.46        Dr 100.00  99.75
    Dw 100.00      100.00        Dw 100.00  99.57
  COND 100.00      103.59        COND 100.00 102.45
   IND 100.00      100.00        IND 100.00  94.12

valid_utf8_to_uv(0x3ffffff), length is 5

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00      101.28        Ir 100.00  99.29
    Dr 100.00      101.46        Dr 100.00  99.75
    Dw 100.00      100.00        Dw 100.00  99.57
  COND 100.00      104.19        COND 100.00 103.07
   IND 100.00      100.00        IND 100.00  94.12

valid_utf8_to_uv(0x7fffffff), length is 6

        blead      hacked        blead      hacked
       ------ -----------        ------     ------
    Ir 100.00       89.83        Ir 100.00  88.83
    Dr 100.00       95.22        Dr 100.00  92.94
    Dw 100.00       92.44        Dw 100.00  91.63
  COND 100.00       86.21        COND 100.00  87.11
   IND 100.00      100.00        IND 100.00  88.89

Clang gives slightly worse results than gcc.  But there is an
improvement in both cases for conditionals for two- through five-byte
characters.

This shows that the performance is significantly worse for code points
that take 6 bytes (or more, which I didn't include) to represent.  These
are all well outside the Unicode range; hence are very rarely
encountered.  Performance is improved a bit for the typical cases.

The algorithm used could handle 6 and 7 byte characters, but that
increases memory usage, and can lead to the compiler choosing to not
inline this function.  In blead, experiments with clang gave these
results:
    Max bytes inlined   Instances in the code where not inlined
        3                 14
        4                 19
        5                 19
        6                 19
        7                 57

We really need to accommodate any Unicode code point, which is 4 bytes (5
on EBCDIC).  But the others we don't care about.  Even though 6 bytes
doesn't show as being worse than 4, I chose to not include it, because
we don't care about performance for these rare non-Unicode code points,
and it just might cause non-inlining for different compilers or clang
versions.
diff --git a/embed.fnc b/embed.fnc
@@ -1858,6 +1858,9 @@ CTopr	|void	|locale_panic	|NN const char *msg				\
 : Used in perly.y
 p	|OP *	|localize	|NN OP *o				\
 				|I32 lex
+CTp	|UV	|long_valid_utf8_to_uv					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e
 ARdp	|I32	|looks_like_number					\
 				|NN SV * const sv
 CRTip	|unsigned|lsbit_pos32	|U32 word
diff --git a/embed.h b/embed.h
@@ -358,6 +358,7 @@
 # define lex_stuff_pvn(a,b,c)                   Perl_lex_stuff_pvn(aTHX_ a,b,c)
 # define lex_stuff_sv(a,b)                      Perl_lex_stuff_sv(aTHX_ a,b)
 # define lex_unstuff(a)                         Perl_lex_unstuff(aTHX_ a)
+# define long_valid_utf8_to_uv                  Perl_long_valid_utf8_to_uv
 # define looks_like_number(a)                   Perl_looks_like_number(aTHX_ a)
 # define lsbit_pos32                            Perl_lsbit_pos32
 # define magic_dump(a)                          Perl_magic_dump(aTHX_ a)
diff --git a/inline.h b/inline.h
@@ -1334,31 +1334,99 @@ Perl_valid_utf8_to_uv(const U8 *s, STRLEN *retlen)
 
     const UV expectlen = UTF8SKIP(s);
     ASSUME(inRANGE(expectlen, 1, UTF8_MAXBYTES));
-    const U8* send = s + expectlen;
-    UV uv = *s;
+    UV uv = 0;
 
-    if (retlen) {
-        *retlen = expectlen;
-    }
-
-    /* An invariant is trivially returned */
-    if (expectlen == 1) {
-        return uv;
+    /* Note that this is branchless except for the switch() jump table, and
+     * checking that the caller wants a *retlen returned.
+     *
+     * There is wasted effort for length 1 inputs of initializing 'uv' to 0 
+     * and calculating 'full_shift' (unless the compiler optimizes that out).
+     * Benchmarks indicate this is acceptable.
+     * See GH #23690 */
+
+    /* Consider a 4-byte UTF-8-encoded character.  On ASCII platforms it looks
+     * like:
+     * 1st Byte   2nd Byte   3rd Byte   4th Byte
+     * 1111 0ddd  10cc cccc  10bb bbbb  10aa aaaa
+     *
+     * And the code point it represents is dddccccccbbbbbbaaaaaa
+     * Each continuation byte contributes its lower 6 bits to the total.  For
+     * generality call that number 'L'.
+     *
+     * You get that code point by masking off the top bits of each byte, then
+     * or'ing together:
+     * the start byte shifted left by 3*L bits,
+     * with  byte [1] shifted left by 2*L bits
+     * with  byte [2] shifted left by 1*L bits
+     * with  byte [3] shifted left by 0*L bits
+     *
+     * The order is immaterial, so we can rewrite that as
+     * 'or' together byte [3] shifted left by 0*L bits
+     *          with byte [2] shifted left by 1*L bits
+     *          with byte [1] shifted left by 2*L bits
+     *          with byte [0] shifted left by 3*L bits,
+     *
+     * All share the paradigm that for byte n you mask off the top bits and
+     * shift the remainder left by (4 - 1 - n) * L bits.  So we get
+     *      (s[n] & mask) << (4 - 1 - n) * L
+     * For a three-byte character it would be
+     *      (s[n] & mask) << (3 - 1 - n) * L
+     * and generally
+     *      (s[n] & mask) << (expectlen - 1 - n) * L
+     * which can be rewritten
+     *      (s[n] & mask) << ((expectlen - 1) * L - n * L)
+     * Calculate the term once that isn't compile-time constant and is the same
+     * for all n */
+    U8 full_shift = (expectlen - 1) * UTF_ACCUMULATION_SHIFT;
+
+    /* Then create a macro that does the full calculation given n.  For EBCDIC,
+     * we need to transform s[n] to I8 */
+#define PERL_VALID_UTF8_NEXT_ACCUMULATION(n)         \
+    (( (UV) (   NATIVE_UTF8_TO_I8( s[n] ) & UTF_CONTINUATION_MASK))         \
+             << (full_shift - (n) * UTF_ACCUMULATION_SHIFT))
+
+    switch (expectlen) {
+      default:
+        uv = long_valid_utf8_to_uv(s, s + expectlen);
+        break;
+
+#if 0   /* See GH #23690 */
+      /* These cases give the correct results, but the extra memory used lowers
+       * the chances of the compiler actually inlining this, and we only care
+       * about performance for Unicode code points, all of which can be
+       * expressed with 4 bytes (5 on EBCDIC).  Experiments with clang showed
+       * no difference between 4,5,6, but a huge drop off with 7. */
+      case 7: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(6);
+              /* FALLTHROUGH */
+      case 6: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(5);
+              /* FALLTHROUGH */
+#endif
+      case 5: uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(4);
+              /* FALLTHROUGH */
+      case 4:
+        uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(3);
+              /* FALLTHROUGH */
+      case 3:
+        uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(2);
+              /* FALLTHROUGH */
+      case 2:
+        uv |= PERL_VALID_UTF8_NEXT_ACCUMULATION(1);
+
+        uv = UNI_TO_NATIVE(uv | (  ((UV)(  NATIVE_UTF8_TO_I8(s[0])
+                                         & UTF_START_MASK(expectlen))
+                                 << full_shift)));
+        break;
+
+      case 1:
+        uv = s[0];
+        break;
     }
 
-    /* Remove the leading bits that indicate the number of bytes, leaving just
-     * the bits that are part of the value */
-    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
-
-    /* Now, loop through the remaining bytes, accumulating each into the
-     * working total as we go.  (I khw tried unrolling the loop for up to 4
-     * bytes, but there was no performance improvement) */
-    for (++s; s < send; s++) {
-        uv = UTF8_ACCUMULATE(uv, *s);
+    if (retlen) {
+        *retlen = expectlen;
     }
 
-    return UNI_TO_NATIVE(uv);
-
+    return uv;
 }
 
 /* This looks like 0x010101... */
diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -37,6 +37,21 @@ static const char malformed_text[] = "Malformed UTF-8 character";
 static const char unees[] =
                         "Malformed UTF-8 character (unexpected end of string)";
 
+UV
+Perl_long_valid_utf8_to_uv(const U8 * const s, const U8 * const e)
+{
+    PERL_ARGS_ASSERT_LONG_VALID_UTF8_TO_UV;
+
+    /* This exists entirely to make the inlined 'valid_utf8_to_uv' smaller, to
+     * increase its chances of actually getting inlined.  For the code points
+     * it doesn't handle, it calls utf8_to_uv_or_die(), which is also inlined.
+     * So the compiler would try to inline both, getting a too-large-to-inline
+     * result.  So this non-inlined routine acts as an intermediary, to avoid
+     * that */
+
+    return utf8_to_uv_or_die(s, e, NULL);
+}
+
 /*
 These are various utility functions for manipulating UTF8-encoded
 strings.  For the uninitiated, this is a method of representing arbitrary