diff --git a/utf8.c b/utf8.c index 90ec77026412..4f36b89ffbee 100644 --- a/utf8.c +++ b/utf8.c @@ -2189,8 +2189,9 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * depend on earlier actions. Also the ordering tries to cause any * messages to be displayed in kind of decreasing severity order. * */ - U32 this_problem = 1U << lsbit_pos32(possible_problems); + U8 this_problem_bit = lsbit_pos32(possible_problems); + U32 this_problem = 1U << this_problem_bit; U32 this_flag_bit = this_problem; /* All cases set this */ @@ -2215,7 +2216,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * would be to handle the message. */ - switch (this_problem) { + switch (this_problem_bit) { default: croak("panic: Unexpected case value in utf8_to_uv_msgs() %" U32uf, this_problem); @@ -2232,7 +2233,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, continue; \ } \ - case UTF8_GOT_EMPTY: + case UTF8_GOT_EMPTY_BIT_POS_: COMMON_DEFAULT_REJECTS(,); /* This so-called malformation is now treated as a bug in the @@ -2243,7 +2244,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, message = Perl_form(aTHX_ "%s (empty string)", malformed_text); break; - case UTF8_GOT_CONTINUATION: + case UTF8_GOT_CONTINUATION_BIT_POS_: COMMON_DEFAULT_REJECTS(,); message = form( "%s: %s (unexpected continuation byte 0x%02x," @@ -2253,7 +2254,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, *s0); break; - case UTF8_GOT_SHORT: + case UTF8_GOT_SHORT_BIT_POS_: COMMON_DEFAULT_REJECTS(,); message = form( "%s: %s (too short; %d byte%s available, need %d)", @@ -2264,7 +2265,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, (int)expectlen); break; - case UTF8_GOT_NON_CONTINUATION: + case UTF8_GOT_NON_CONTINUATION_BIT_POS_: { COMMON_DEFAULT_REJECTS(,); @@ -2282,8 +2283,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, break; } - case UTF8_GOT_LONG: - case UTF8_GOT_LONG_WITH_VALUE: + case UTF8_GOT_LONG_BIT_POS_: + case UTF8_GOT_LONG_WITH_VALUE_BIT_POS_: COMMON_DEFAULT_REJECTS(,); /* These error types cause 'input_uv' to be something that @@ -2356,7 +2357,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, continue; \ } - case UTF8_GOT_SURROGATE: + case UTF8_GOT_SURROGATE_BIT_POS_: COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_SURROGATE, WARN_SURROGATE,,); @@ -2374,7 +2375,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, break; - case UTF8_GOT_NONCHAR: + case UTF8_GOT_NONCHAR_BIT_POS_: COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_NONCHAR, WARN_NONCHAR,,); /* The code above should have guaranteed that we don't get here @@ -2406,13 +2407,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, bool overflows; bool is_extended; - case UTF8_GOT_OVERFLOW: + case UTF8_GOT_OVERFLOW_BIT_POS_: COMMON_DEFAULT_REJECTS(ckWARN_d, WARN_NON_UNICODE); overflows = true; is_extended = true; goto super_common; - case UTF8_GOT_PERL_EXTENDED: + case UTF8_GOT_PERL_EXTENDED_BIT_POS_: COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_PERL_EXTENDED, WARN_NON_UNICODE, ckWARN_d, WARN_PORTABLE); @@ -2420,7 +2421,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, is_extended = true; goto super_common; - case UTF8_GOT_SUPER: + case UTF8_GOT_SUPER_BIT_POS_: COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_SUPER, WARN_NON_UNICODE,,); overflows = orig_problems & UTF8_GOT_OVERFLOW; is_extended = UTF8_IS_PERL_EXTENDED(s0); diff --git a/utf8.h b/utf8.h index 97c31f1d33b1..e2a403ae4666 100644 --- a/utf8.h +++ b/utf8.h @@ -1173,27 +1173,56 @@ point's representation. /* The ordering of these bits is important to a switch() statement in utf8.c * for handling problems in converting UTF-8 to a UV */ -#define UTF8_ALLOW_OVERFLOW 0x0001 -#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW +#define UTF8_GOT_OVERFLOW_BIT_POS_ 0 +#define UTF8_GOT_EMPTY_BIT_POS_ 1 +#define UTF8_GOT_CONTINUATION_BIT_POS_ 2 +#define UTF8_GOT_SHORT_BIT_POS_ 3 +#define UTF8_GOT_NON_CONTINUATION_BIT_POS_ 4 -#define UTF8_ALLOW_EMPTY 0x0002 /* Allow a zero length string */ -#define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY +#define UTF8_GOT_SURROGATE_BIT_POS_ 5 +#define UTF8_WARN_SURROGATE_BIT_POS_ 6 + +#define UTF8_GOT_PERL_EXTENDED_BIT_POS_ 7 +#define UTF8_WARN_PERL_EXTENDED_BIT_POS_ 8 + +#define UTF8_GOT_SUPER_BIT_POS_ 9 +#define UTF8_WARN_SUPER_BIT_POS_ 10 + +#define UTF8_GOT_NONCHAR_BIT_POS_ 11 +#define UTF8_WARN_NONCHAR_BIT_POS_ 12 + +#define UTF8_GOT_LONG_BIT_POS_ 13 +#define UTF8_GOT_LONG_WITH_VALUE_BIT_POS_ 14 + +#define UTF8_CHECK_ONLY_BIT_POS_ 15 +#define UTF8_DIE_IF_MALFORMED_BIT_POS_ 16 +#define UTF8_FORCE_WARN_IF_MALFORMED_BIT_POS_ 17 + +#define UTF8_NO_CONFIDENCE_IN_CURLEN_BIT_POS_ 18 + +#define UTF8_GOT_OVERFLOW (1U << UTF8_GOT_OVERFLOW_BIT_POS_) +#define UTF8_ALLOW_OVERFLOW UTF8_GOT_OVERFLOW + +/* Allow a zero length string */ +#define UTF8_GOT_EMPTY (1U << UTF8_GOT_EMPTY_BIT_POS_) +#define UTF8_ALLOW_EMPTY UTF8_GOT_EMPTY /* Allow first byte to be a continuation byte */ -#define UTF8_ALLOW_CONTINUATION 0x0004 -#define UTF8_GOT_CONTINUATION UTF8_ALLOW_CONTINUATION +#define UTF8_GOT_CONTINUATION (1U << UTF8_GOT_CONTINUATION_BIT_POS_) +#define UTF8_ALLOW_CONTINUATION UTF8_GOT_CONTINUATION /* expecting more bytes than were available in the string */ -#define UTF8_ALLOW_SHORT 0x0008 -#define UTF8_GOT_SHORT UTF8_ALLOW_SHORT +#define UTF8_GOT_SHORT (1U << UTF8_GOT_SHORT_BIT_POS_) +#define UTF8_ALLOW_SHORT UTF8_GOT_SHORT /* Unexpected non-continuation byte */ -#define UTF8_ALLOW_NON_CONTINUATION 0x0010 -#define UTF8_GOT_NON_CONTINUATION UTF8_ALLOW_NON_CONTINUATION +#define UTF8_GOT_NON_CONTINUATION (1U << UTF8_GOT_NON_CONTINUATION_BIT_POS_) +#define UTF8_ALLOW_NON_CONTINUATION UTF8_GOT_NON_CONTINUATION -#define UTF8_DISALLOW_SURROGATE 0x0020 /* Unicode surrogates */ -#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE -#define UTF8_WARN_SURROGATE 0x0040 +/* Unicode surrogates */ +#define UTF8_GOT_SURROGATE (1U << UTF8_GOT_SURROGATE_BIT_POS_) +#define UTF8_DISALLOW_SURROGATE UTF8_GOT_SURROGATE +#define UTF8_WARN_SURROGATE (1U << UTF8_WARN_SURROGATE_BIT_POS_) /* The original UTF-8 standard did not define UTF-8 with start bytes of 0xFE or * 0xFF, though UTF-EBCDIC did. This allowed both versions to represent code @@ -1204,27 +1233,27 @@ point's representation. * extensions, and not likely to be interchangeable with other languages. Note * that on ASCII platforms, FE overflows a signed 32-bit word, and FF an * unsigned one. */ -#define UTF8_DISALLOW_PERL_EXTENDED 0x0080 -#define UTF8_GOT_PERL_EXTENDED UTF8_DISALLOW_PERL_EXTENDED -#define UTF8_WARN_PERL_EXTENDED 0x0100 +#define UTF8_GOT_PERL_EXTENDED (1U << UTF8_GOT_PERL_EXTENDED_BIT_POS_) +#define UTF8_DISALLOW_PERL_EXTENDED UTF8_GOT_PERL_EXTENDED +#define UTF8_WARN_PERL_EXTENDED (1U << UTF8_WARN_PERL_EXTENDED_BIT_POS_) /* Super-set of Unicode: code points above the legal max */ -#define UTF8_DISALLOW_SUPER 0x0200 -#define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER -#define UTF8_WARN_SUPER 0x0400 +#define UTF8_GOT_SUPER (1U << UTF8_GOT_SUPER_BIT_POS_) +#define UTF8_DISALLOW_SUPER UTF8_GOT_SUPER +#define UTF8_WARN_SUPER (1U << UTF8_WARN_SUPER_BIT_POS_) /* Unicode non-character code points */ -#define UTF8_DISALLOW_NONCHAR 0x0800 -#define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR -#define UTF8_WARN_NONCHAR 0x1000 +#define UTF8_GOT_NONCHAR (1U << UTF8_GOT_NONCHAR_BIT_POS_) +#define UTF8_DISALLOW_NONCHAR UTF8_GOT_NONCHAR +#define UTF8_WARN_NONCHAR (1U << UTF8_WARN_NONCHAR_BIT_POS_) /* Overlong sequence; i.e., the code point can be specified in fewer bytes. * First one will convert the overlong to the REPLACEMENT CHARACTER; second * will return what the overlong evaluates to */ -#define UTF8_ALLOW_LONG 0x2000 -#define UTF8_GOT_LONG UTF8_ALLOW_LONG -#define UTF8_ALLOW_LONG_AND_ITS_VALUE 0x4000 -#define UTF8_GOT_LONG_WITH_VALUE UTF8_ALLOW_LONG_AND_ITS_VALUE +#define UTF8_GOT_LONG (1U << UTF8_GOT_LONG_BIT_POS_) +#define UTF8_ALLOW_LONG UTF8_GOT_LONG +#define UTF8_GOT_LONG_WITH_VALUE (1U << UTF8_GOT_LONG_WITH_VALUE_BIT_POS_) +#define UTF8_ALLOW_LONG_AND_ITS_VALUE UTF8_GOT_LONG_WITH_VALUE /* For back compat, these old names are misleading for overlongs and * UTF_EBCDIC. */ @@ -1234,10 +1263,12 @@ point's representation. #define UTF8_DISALLOW_FE_FF UTF8_DISALLOW_PERL_EXTENDED #define UTF8_WARN_FE_FF UTF8_WARN_PERL_EXTENDED -#define UTF8_CHECK_ONLY 0x8000 -#define UTF8_NO_CONFIDENCE_IN_CURLEN_ 0x10000 /* Internal core use only */ -#define UTF8_DIE_IF_MALFORMED 0x20000 -#define UTF8_FORCE_WARN_IF_MALFORMED 0x40000 +#define UTF8_CHECK_ONLY (1U << UTF8_CHECK_ONLY_BIT_POS_) +#define UTF8_NO_CONFIDENCE_IN_CURLEN_ /* Internal core use only */ \ + (1U << UTF8_NO_CONFIDENCE_IN_CURLEN_BIT_POS_) +#define UTF8_DIE_IF_MALFORMED (1U << UTF8_DIE_IF_MALFORMED_BIT_POS_) +#define UTF8_FORCE_WARN_IF_MALFORMED \ + (1U <