Skip to content

Commit 238a42b

Browse files
committed
utf8_to_uv_msgs: Revamp handling of above-Unicode code points
As stated in a recent commit message, this is complex and problematic. This commit revamps it, simplifying it and fixing the known remaining bugs.
1 parent 22d8ec6 commit 238a42b

File tree

1 file changed

+93
-196
lines changed

1 file changed

+93
-196
lines changed

utf8.c

Lines changed: 93 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -2076,6 +2076,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20762076
const UV input_uv = uv;
20772077
U32 error_flags_return = 0;
20782078
AV * msgs_return = NULL;
2079+
Size_t super_msgs_count = 0;
20792080

20802081
/* The conditions that are rejected by default are the ones for which
20812082
* you need a flag to accept. There is a good reason for them being
@@ -2155,10 +2156,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
21552156

21562157
U32 this_flag_bit = this_problem;
21572158

2158-
/* All cases but these two set this; it makes the cases simpler
2159-
* to do it here */
2160-
error_flags_return |= this_problem & ~( UTF8_GOT_PERL_EXTENDED
2161-
|UTF8_GOT_SUPER);
2159+
/* All cases set this */
2160+
error_flags_return |= this_problem;
21622161

21632162
/* Turn off so next iteration doesn't retry this */
21642163
possible_problems &= ~this_problem;
@@ -2356,213 +2355,111 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23562355

23572356
break;
23582357

2359-
/* The remaining cases all involve non-Unicode code points.
2360-
* These come in three increasingly restrictive flavors.
2361-
* SUPERs are simply all the ones above Unicode;
2362-
* PERL_EXTENDED_UTF8 are the subset of these that are
2363-
* expressed in a non-standard extension to UTF-8. Unless also
2364-
* overlong, these have a very high ordinal value. Finally
2365-
* OVERFLOWS are for such a high code point that they don't fit
2366-
* into the word size of this platform. Perl extended-UTF-8 is
2367-
* required to express code points this high. So an overflow
2368-
* is a member of all three flavors; besides overflowing, it
2369-
* also is using perl extended UTF-8 and is also plain
2370-
* non-Unicode.
2371-
*
2372-
* There are cases in this switch for each of the three types.
2373-
* Because they are related, there are tests of the input flags
2374-
* to see what combination of these require warnings and/or
2375-
* rejection. And there a jumps between the cases. The task
2376-
* is simpler because the code earlier in the function has set
2377-
* things up so that at most one problem flag bit is set for
2378-
* any of them, the most restrictive case the input matches.
2379-
* Also, for the non-overflow cases, there is no problem flag
2380-
* bit if the caller doesn't want special handling for it.
2381-
*
2382-
* Each type has its own warning category and text,
2383-
* corresponding to the specific problem. Whenever a warning
2384-
* is generated, it uses the one for the most dire type the
2385-
* code point fits into. Suppose the flags say we warn on all
2386-
* non-Unicode code points, but not on overflowing and we get a
2387-
* code point too large for the platform. The generated
2388-
* warning will be the text that says it overflowed, while the
2389-
* returned bit will be for the SUPER type. To accomplish
2390-
* this, the formats are shared between the cases. 'cp_format'
2391-
* is used if there is a specific representable code point that
2392-
* the input translates to; if not, instead a more generic
2393-
* format, 'non_cp_format' is used */
2394-
const char * cp_format;
2395-
const char * non_cp_format;
2358+
/* The final three cases are all closely related. They are
2359+
* ordered in execution by severity of the corresponding
2360+
* condition */
2361+
STATIC_ASSERT_STMT( UTF8_GOT_OVERFLOW
2362+
< UTF8_GOT_PERL_EXTENDED);
2363+
STATIC_ASSERT_STMT(UTF8_GOT_PERL_EXTENDED < UTF8_GOT_SUPER);
2364+
2365+
/* And each is a subset of the next. The code does a bit of
2366+
* setup for each and then jumps to common handling. This
2367+
* structure comes from the desire to use the most dire warning
2368+
* suitable for the condition even if the only warning class
2369+
* that is enabled is a less severe one. It just makes sense
2370+
* that if someone wants to be warned about all above-Unicode
2371+
* code points, and this one is so far above that it won't fit
2372+
* in the platform's word size, that the overflow warning would
2373+
* be output instead of the more mild one. */
2374+
2375+
bool overflows;
2376+
bool is_extended;
23962377

23972378
case UTF8_GOT_OVERFLOW:
2398-
/* For this overflow case, any format and message text are set
2399-
* up to create the warning for it. If overflows are to be
2400-
* rejected, the warning is simply created, and we break to the
2401-
* end of the switch() (where code common to all cases will
2402-
* finish the job). Otherwise it looks to see if either the
2403-
* perl-extended or plain super cases are supposed to handle
2404-
* things. If so, it jumps into the code of the most
2405-
* restrictive one so that that they will use this more dire
2406-
* warning. If neither handle it, the code just breaks; doing
2407-
* nothing. */
2408-
non_cp_format = MALFORMED_TEXT ": %s (overflows)";
2409-
2410-
/* We can't exactly specify such a large code point, so can't
2411-
* output it */
2412-
cp_format = NULL;
2413-
2414-
/* In the unlikely case that the caller has asked to "allow"
2415-
* this malformation, we transfer to the next lower severity of
2416-
* code that handles the case; or just 'break' if none. */
2417-
if (UNLIKELY(flags & UTF8_ALLOW_OVERFLOW)) {
2418-
if (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2419-
|UTF8_WARN_PERL_EXTENDED))
2420-
{
2421-
this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2422-
goto join_perl_extended;
2423-
}
2424-
if (flags & (UTF8_DISALLOW_SUPER|UTF8_WARN_SUPER)) {
2425-
this_flag_bit = UTF8_GOT_SUPER;
2426-
goto join_plain_supers;
2427-
}
2379+
COMMON_DEFAULT_REJECTS(ckWARN_d, WARN_NON_UNICODE);
2380+
overflows = true;
2381+
is_extended = true;
2382+
goto super_common;
24282383

2429-
break;
2430-
}
2384+
case UTF8_GOT_PERL_EXTENDED:
2385+
COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_PERL_EXTENDED,
2386+
WARN_NON_UNICODE, ckWARN_d,
2387+
WARN_PORTABLE);
2388+
overflows = orig_problems & UTF8_GOT_OVERFLOW;
2389+
is_extended = true;
2390+
goto super_common;
24312391

2432-
/* Here, overflow is disallowed; handle everything in this
2433-
* case: */
2434-
2435-
/* Overflow is a hybrid. If the word size on this platform
2436-
* were wide enough for this to not overflow, a non-Unicode
2437-
* code point would have been generated. If the caller wanted
2438-
* warnings for such code points, the warning category would be
2439-
* WARN_NON_UNICODE, On the other hand, overflow is considered
2440-
* a malformation, which is serious, and the category would be
2441-
* just WARN_UTF8. We clearly should warn if either category
2442-
* is enabled, but which category to use? Historically, we've
2443-
* used 'utf8' if it is enabled; and that seems like the more
2444-
* severe category, more befitting a malformation. */
2445-
pack_warn = PACK_WARN(WARN_UTF8, ckWARN_d, WARN_NON_UNICODE);
2446-
if (pack_warn) {
2447-
message = Perl_form(aTHX_ non_cp_format,
2448-
_byte_dump_string(s0, curlen, 0));
2449-
}
2392+
case UTF8_GOT_SUPER:
2393+
COMMON_DEFAULT_ACCEPTEDS(UTF8_WARN_SUPER, WARN_NON_UNICODE,,);
2394+
overflows = orig_problems & UTF8_GOT_OVERFLOW;
2395+
is_extended = UTF8_IS_PERL_EXTENDED(s0);
24502396

2451-
/* But the API says we flag all errors found that the calling
2452-
* flags indicate should be */
2453-
if (flags & ( UTF8_WARN_PERL_EXTENDED
2454-
|UTF8_DISALLOW_PERL_EXTENDED))
2455-
{
2456-
error_flags_return |= UTF8_GOT_PERL_EXTENDED;
2457-
}
2458-
if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
2459-
error_flags_return |= UTF8_GOT_SUPER;
2397+
super_common:
2398+
{
2399+
/* To get here the COMMON macros above determined that a
2400+
* warning message needs to be generated for this case.
2401+
* (Otherwise they would have executed a 'continue' statement
2402+
* to try the next case.). But they don't always catch if a
2403+
* message has already been generated for the underlying
2404+
* condition. Skip if so. */
2405+
if (super_msgs_count++) {
2406+
continue;
24602407
}
24612408

2462-
break;
2463-
2464-
case UTF8_GOT_PERL_EXTENDED:
2465-
2466-
/* We get here when the input uses Perl extended UTF-8, and the
2467-
* caller has indicated that above-Unicode code points (of
2468-
* which these are a subset) are to be disallowed and/or warned
2469-
* about
2409+
/* Now generate the message text. We can't include the code
2410+
* point in it if there isn't a specific one, either because
2411+
* this overflowed, or there weren't enough bytes to form a
2412+
* complete character.
24702413
*
2471-
* Set up the formats. We can include the code point in the
2472-
* message if we have an exact one (input not too short) and
2473-
* it's not an overlong that reduces down to something too low.
2474-
* (Otherwise, the message could say something untrue like
2475-
* "Code point 0x41 is not Unicode ...". But this would be a
2476-
* lie; 0x41 is Unicode. It was expressed in a non-standard
2477-
* form of UTF-8 that Unicode doesn't approve of.) */
2478-
cp_format = ( (orig_problems & (UTF8_GOT_TOO_SHORT))
2479-
|| ! UNICODE_IS_PERL_EXTENDED(input_uv))
2480-
? NULL
2481-
: PL_extended_cp_format;
2482-
non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2483-
" is a Perl extension, and so is not portable";
2484-
2485-
/* We know here that the caller indicated at least one of the
2486-
* EXTENDED or SUPER flags. If it's not EXTENDED, use SUPER */
2487-
if (! (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2488-
|UTF8_WARN_PERL_EXTENDED)))
2489-
{
2490-
this_flag_bit = UTF8_GOT_SUPER;
2414+
* We also can't include it if the resultant message would be
2415+
* misleading. This can happen when a sequence is an overlong,
2416+
* using Perl extended UTF-8. That could evaluate to a
2417+
* character in the Unicode range, say the letter "A"; we don't
2418+
* want a message saying that "A" isn't Unicode, because this
2419+
* would be a lie. "A" definitely is Unicode. It was just
2420+
* expressed in a non-standard form of UTF-8 that we warn
2421+
* about. If the sequence uses extended UTF-8 but the
2422+
* resulting code point isn't for above Unicode, we know we
2423+
* have this situation. */
2424+
2425+
if (overflows) {
2426+
message = Perl_form(aTHX_ "%s: %s (overflows)",
2427+
malformed_text,
2428+
_byte_dump_string(s0, curlen, 0));
24912429
}
2492-
2493-
join_perl_extended:
2494-
2495-
/* Here this level is to warn, reject, or both. The format has
2496-
* been set up to be for this level, or maybe the overflow
2497-
* case set up a more dire warning and jumped to the label just
2498-
* above (after determining that warning/rejecting here was
2499-
* enabled). We warn at this level if either it is supposed to
2500-
* warn, or plain supers are supposed to. In the latter case,
2501-
* we get this higher severity warning */
2502-
if (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) {
2503-
error_flags_return |= this_flag_bit;
2504-
2505-
/* These code points are non-portable, so warn if either
2506-
* category is enabled */
2507-
if (PACK_WARN(WARN_NON_UNICODE, ckWARN, WARN_PORTABLE)) {
2508-
if (cp_format) {
2509-
message = Perl_form(aTHX_ cp_format, input_uv);
2510-
}
2511-
else {
2512-
message = Perl_form(aTHX_
2513-
non_cp_format,
2514-
_byte_dump_string(s0, curlen, 0));
2515-
}
2430+
else if ( (orig_problems & UTF8_GOT_TOO_SHORT)
2431+
|| ( UTF8_IS_PERL_EXTENDED(s0)
2432+
&& ! UNICODE_IS_SUPER(input_uv)))
2433+
{
2434+
if (is_extended) {
2435+
message = Perl_form(aTHX_
2436+
"Any UTF-8 sequence that starts with"
2437+
" \"%s\" is a Perl extension, and so"
2438+
" is not portable",
2439+
_byte_dump_string(s0, curlen, 0));
2440+
}
2441+
else {
2442+
message = Perl_form(aTHX_
2443+
"Any UTF-8 sequence that starts with"
2444+
" \"%s\" is for a non-Unicode code"
2445+
" point, may not be portable",
2446+
_byte_dump_string(s0, curlen, 0));
25162447
}
25172448
}
2518-
2519-
/* Similarly if either of the two levels reject this, do it */
2520-
if (flags & (UTF8_DISALLOW_PERL_EXTENDED|UTF8_DISALLOW_SUPER)) {
2521-
error_flags_return |= this_flag_bit;
2449+
else if (is_extended) {
2450+
message = Perl_form(aTHX_ PL_extended_cp_format, input_uv);
25222451
}
2523-
2524-
break;
2525-
2526-
case UTF8_GOT_SUPER:
2527-
2528-
/* We get here when the input is for an above Unicode code
2529-
* point, but it does not use Perl extended UTF-8, and the
2530-
* caller has indicated that these are to be disallowed and/or
2531-
* warned about */
2532-
2533-
non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2534-
" is for a non-Unicode code point, may not be"
2535-
" portable";
2536-
2537-
/* We can include the code point in the message if we have an
2538-
* exact one (input not too short) */
2539-
cp_format = (orig_problems & (UTF8_GOT_TOO_SHORT))
2540-
? NULL
2541-
: super_cp_format;
2542-
2543-
join_plain_supers:
2544-
2545-
/* Here this level is to warn, reject, or both. The format has
2546-
* been set up to be for this level, or maybe the overflow
2547-
* case set up a more dire warning and jumped to the label just
2548-
* above (after determining that warning/rejecting here was
2549-
* enabled). */
2550-
error_flags_return |= this_flag_bit;
2551-
if (flags & UTF8_WARN_SUPER) {
2552-
if (PACK_WARN(WARN_NON_UNICODE,,)) {
2553-
if (cp_format) {
2554-
message = Perl_form(aTHX_ cp_format, input_uv);
2555-
}
2556-
else {
2557-
message = Perl_form(aTHX_
2558-
non_cp_format,
2559-
_byte_dump_string(s0, curlen, 0));
2560-
}
2561-
}
2452+
else {
2453+
message = Perl_form(aTHX_ super_cp_format, input_uv);
25622454
}
25632455

2564-
break;
2456+
/* This message only needs to output once. Ww can potentially
2457+
* save some loop iterations by turning off looking for
2458+
* warnings for it. */
2459+
flags &= ~(UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER);
25652460

2461+
break;
2462+
}
25662463
} /* End of switch() on the possible problems */
25672464

25682465
/* Display or save the message (if any) for the problem being

0 commit comments

Comments
 (0)