Perl · khwilliamson · Sep 23, 2025 · Dec 7, 2024 · Jan 16, 2025 · Mar 8, 2025
diff --git a/utf8.c b/utf8.c
@@ -2189,8 +2189,9 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
              * depend on earlier actions.  Also the ordering tries to cause any
              * messages to be displayed in kind of decreasing severity order.
              * */
-            U32 this_problem = 1U << lsbit_pos32(possible_problems);
 
+            U8 this_problem_bit = lsbit_pos32(possible_problems);
+            U32 this_problem = 1U << this_problem_bit;
             U32 this_flag_bit = this_problem;
 
             /* All cases set this */
@@ -2215,7 +2216,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
              * would be to handle the message.
              */
 
-            switch (this_problem) {
+            switch (this_problem_bit) {
               default:
                 croak("panic: Unexpected case value in utf8_to_uv_msgs() %"
                       U32uf, this_problem);
@@ -2232,7 +2233,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                     continue;                                               \
                 }                                                           \
 
-              case UTF8_GOT_EMPTY:
+              case UTF8_GOT_EMPTY_BIT_POS_:
                 COMMON_DEFAULT_REJECTS(,);
 
                 /* This so-called malformation is now treated as a bug in the
@@ -2243,7 +2244,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                 message = Perl_form(aTHX_ "%s (empty string)", malformed_text);
                 break;
 
-              case UTF8_GOT_CONTINUATION:
+              case UTF8_GOT_CONTINUATION_BIT_POS_:
                 COMMON_DEFAULT_REJECTS(,);
                 message = form(
                                 "%s: %s (unexpected continuation byte 0x%02x,"
@@ -2253,7 +2254,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                                 *s0);
                 break;
 
-              case UTF8_GOT_SHORT:
+              case UTF8_GOT_SHORT_BIT_POS_:
                 COMMON_DEFAULT_REJECTS(,);
                 message = form(
                              "%s: %s (too short; %d byte%s available, need %d)",
@@ -2264,7 +2265,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                              (int)expectlen);
                 break;
 
-              case UTF8_GOT_NON_CONTINUATION:
+              case UTF8_GOT_NON_CONTINUATION_BIT_POS_:
                {
                 COMMON_DEFAULT_REJECTS(,);
 
@@ -2282,8 +2283,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                 break;
                }
 
-              case UTF8_GOT_LONG:
-              case UTF8_GOT_LONG_WITH_VALUE:
+              case UTF8_GOT_LONG_BIT_POS_:
+              case UTF8_GOT_LONG_WITH_VALUE_BIT_POS_:
                 COMMON_DEFAULT_REJECTS(,);
 
                 /* These error types cause 'input_uv' to be something that
@@ -2356,7 +2357,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
                     continue;                                               \
                 }
 
-              case UTF8_GOT_SURROGATE:
+              case UTF8_GOT_SURROGATE_BIT_POS_:
                 COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_SURROGATE,
                                          WARN_SURROGATE,,);
 
@@ -2374,7 +2375,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
 
                 break;
 
-              case UTF8_GOT_NONCHAR:
+              case UTF8_GOT_NONCHAR_BIT_POS_:
                 COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_NONCHAR, WARN_NONCHAR,,);
 
                 /* The code above should have guaranteed that we don't get here
@@ -2406,21 +2407,21 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
               bool overflows;
               bool is_extended;
 
-              case UTF8_GOT_OVERFLOW:
+              case UTF8_GOT_OVERFLOW_BIT_POS_:
                 COMMON_DEFAULT_REJECTS(ckWARN_d, WARN_NON_UNICODE);
                 overflows = true;
                 is_extended = true;
                 goto super_common;
 
-              case UTF8_GOT_PERL_EXTENDED:
+              case UTF8_GOT_PERL_EXTENDED_BIT_POS_:
                 COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_PERL_EXTENDED,
                                          WARN_NON_UNICODE, ckWARN_d,
                                          WARN_PORTABLE);
                 overflows = orig_problems & UTF8_GOT_OVERFLOW;
                 is_extended = true;
                 goto super_common;
 
-              case UTF8_GOT_SUPER:
+              case UTF8_GOT_SUPER_BIT_POS_:
                 COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_SUPER, WARN_NON_UNICODE,,);
                 overflows = orig_problems & UTF8_GOT_OVERFLOW;
                 is_extended = UTF8_IS_PERL_EXTENDED(s0);

diff --git a/utf8.h b/utf8.h
@@ -1173,27 +1173,56 @@ point's representation.
 
 /* The ordering of these bits is important to a switch() statement in utf8.c
  * for handling problems in converting UTF-8 to a UV */
-#define UTF8_ALLOW_OVERFLOW             0x0001
-#define UTF8_GOT_OVERFLOW               UTF8_ALLOW_OVERFLOW
+#define UTF8_GOT_OVERFLOW_BIT_POS_              0
+#define UTF8_GOT_EMPTY_BIT_POS_                 1
+#define UTF8_GOT_CONTINUATION_BIT_POS_          2
+#define UTF8_GOT_SHORT_BIT_POS_                 3
+#define UTF8_GOT_NON_CONTINUATION_BIT_POS_      4
 
-#define UTF8_ALLOW_EMPTY		0x0002	/* Allow a zero length string */
-#define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
+#define UTF8_GOT_SURROGATE_BIT_POS_             5
+#define UTF8_WARN_SURROGATE_BIT_POS_            6
+
+#define UTF8_GOT_PERL_EXTENDED_BIT_POS_         7
+#define UTF8_WARN_PERL_EXTENDED_BIT_POS_        8
+
+#define UTF8_GOT_SUPER_BIT_POS_                 9
+#define UTF8_WARN_SUPER_BIT_POS_               10
+
+#define UTF8_GOT_NONCHAR_BIT_POS_              11
+#define UTF8_WARN_NONCHAR_BIT_POS_             12
+
+#define UTF8_GOT_LONG_BIT_POS_                 13
+#define UTF8_GOT_LONG_WITH_VALUE_BIT_POS_      14
+
+/* Remaining flag bits; positions match the former literals 0x8000..0x40000 */
+#define UTF8_CHECK_ONLY_BIT_POS_               15
+#define UTF8_NO_CONFIDENCE_IN_CURLEN_BIT_POS_  16
+#define UTF8_DIE_IF_MALFORMED_BIT_POS_         17
+#define UTF8_FORCE_WARN_IF_MALFORMED_BIT_POS_  18
+
+#define UTF8_GOT_OVERFLOW               (1U << UTF8_GOT_OVERFLOW_BIT_POS_)
+#define UTF8_ALLOW_OVERFLOW                    UTF8_GOT_OVERFLOW
+
+/* Allow a zero length string */
+#define UTF8_GOT_EMPTY                  (1U << UTF8_GOT_EMPTY_BIT_POS_)
+#define UTF8_ALLOW_EMPTY                       UTF8_GOT_EMPTY
 
 /* Allow first byte to be a continuation byte */
-#define UTF8_ALLOW_CONTINUATION		0x0004
-#define UTF8_GOT_CONTINUATION		UTF8_ALLOW_CONTINUATION
+#define UTF8_GOT_CONTINUATION           (1U << UTF8_GOT_CONTINUATION_BIT_POS_)
+#define UTF8_ALLOW_CONTINUATION                UTF8_GOT_CONTINUATION
 
 /* expecting more bytes than were available in the string */
-#define UTF8_ALLOW_SHORT		0x0008
-#define UTF8_GOT_SHORT		        UTF8_ALLOW_SHORT
+#define UTF8_GOT_SHORT                  (1U << UTF8_GOT_SHORT_BIT_POS_)
+#define UTF8_ALLOW_SHORT                       UTF8_GOT_SHORT
 
 /* Unexpected non-continuation byte */
-#define UTF8_ALLOW_NON_CONTINUATION	0x0010
-#define UTF8_GOT_NON_CONTINUATION	UTF8_ALLOW_NON_CONTINUATION
+#define UTF8_GOT_NON_CONTINUATION     (1U << UTF8_GOT_NON_CONTINUATION_BIT_POS_)
+#define UTF8_ALLOW_NON_CONTINUATION          UTF8_GOT_NON_CONTINUATION
 
-#define UTF8_DISALLOW_SURROGATE		0x0020	/* Unicode surrogates */
-#define UTF8_GOT_SURROGATE		UTF8_DISALLOW_SURROGATE
-#define UTF8_WARN_SURROGATE		0x0040
+/* Unicode surrogates */
+#define UTF8_GOT_SURROGATE              (1U << UTF8_GOT_SURROGATE_BIT_POS_)
+#define UTF8_DISALLOW_SURROGATE                UTF8_GOT_SURROGATE
+#define UTF8_WARN_SURROGATE             (1U << UTF8_WARN_SURROGATE_BIT_POS_)
 
 /* The original UTF-8 standard did not define UTF-8 with start bytes of 0xFE or
  * 0xFF, though UTF-EBCDIC did.  This allowed both versions to represent code
@@ -1204,27 +1233,27 @@ point's representation.
  * extensions, and not likely to be interchangeable with other languages.  Note
  * that on ASCII platforms, FE overflows a signed 32-bit word, and FF an
  * unsigned one. */
-#define UTF8_DISALLOW_PERL_EXTENDED     0x0080
-#define UTF8_GOT_PERL_EXTENDED          UTF8_DISALLOW_PERL_EXTENDED
-#define UTF8_WARN_PERL_EXTENDED         0x0100
+#define UTF8_GOT_PERL_EXTENDED          (1U << UTF8_GOT_PERL_EXTENDED_BIT_POS_)
+#define UTF8_DISALLOW_PERL_EXTENDED            UTF8_GOT_PERL_EXTENDED
+#define UTF8_WARN_PERL_EXTENDED         (1U << UTF8_WARN_PERL_EXTENDED_BIT_POS_)
 
 /* Super-set of Unicode: code points above the legal max */
-#define UTF8_DISALLOW_SUPER		0x0200
-#define UTF8_GOT_SUPER		        UTF8_DISALLOW_SUPER
-#define UTF8_WARN_SUPER		        0x0400
+#define UTF8_GOT_SUPER                  (1U << UTF8_GOT_SUPER_BIT_POS_)
+#define UTF8_DISALLOW_SUPER                    UTF8_GOT_SUPER
+#define UTF8_WARN_SUPER                 (1U << UTF8_WARN_SUPER_BIT_POS_)
 
 /* Unicode non-character  code points */
-#define UTF8_DISALLOW_NONCHAR           0x0800
-#define UTF8_GOT_NONCHAR                UTF8_DISALLOW_NONCHAR
-#define UTF8_WARN_NONCHAR               0x1000
+#define UTF8_GOT_NONCHAR                (1U << UTF8_GOT_NONCHAR_BIT_POS_)
+#define UTF8_DISALLOW_NONCHAR                  UTF8_GOT_NONCHAR
+#define UTF8_WARN_NONCHAR               (1U << UTF8_WARN_NONCHAR_BIT_POS_)
 
 /* Overlong sequence; i.e., the code point can be specified in fewer bytes.
  * First one will convert the overlong to the REPLACEMENT CHARACTER; second
  * will return what the overlong evaluates to */
-#define UTF8_ALLOW_LONG                 0x2000
-#define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
-#define UTF8_ALLOW_LONG_AND_ITS_VALUE   0x4000
-#define UTF8_GOT_LONG_WITH_VALUE        UTF8_ALLOW_LONG_AND_ITS_VALUE
+#define UTF8_GOT_LONG                   (1U << UTF8_GOT_LONG_BIT_POS_)
+#define UTF8_ALLOW_LONG                        UTF8_GOT_LONG
+#define UTF8_GOT_LONG_WITH_VALUE     (1U << UTF8_GOT_LONG_WITH_VALUE_BIT_POS_)
+#define UTF8_ALLOW_LONG_AND_ITS_VALUE       UTF8_GOT_LONG_WITH_VALUE
 
 /* For back compat, these old names are misleading for overlongs and
  * UTF_EBCDIC. */
@@ -1234,10 +1263,12 @@ point's representation.
 #define UTF8_DISALLOW_FE_FF             UTF8_DISALLOW_PERL_EXTENDED
 #define UTF8_WARN_FE_FF                 UTF8_WARN_PERL_EXTENDED
 
-#define UTF8_CHECK_ONLY			0x8000
-#define UTF8_NO_CONFIDENCE_IN_CURLEN_   0x10000  /* Internal core use only */
-#define UTF8_DIE_IF_MALFORMED           0x20000
-#define UTF8_FORCE_WARN_IF_MALFORMED    0x40000
+#define UTF8_CHECK_ONLY                 (1U << UTF8_CHECK_ONLY_BIT_POS_)
+#define UTF8_NO_CONFIDENCE_IN_CURLEN_   /* Internal core use only */        \
+                                (1U << UTF8_NO_CONFIDENCE_IN_CURLEN_BIT_POS_)
+#define UTF8_DIE_IF_MALFORMED           (1U << UTF8_DIE_IF_MALFORMED_BIT_POS_)
+#define UTF8_FORCE_WARN_IF_MALFORMED                                        \
+                                (1U << UTF8_FORCE_WARN_IF_MALFORMED_BIT_POS_)
 
 /* For backwards source compatibility.  They do nothing, as the default now
  * includes what they used to mean.  The first one's meaning was to allow the