@@ -2076,6 +2076,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2076
2076
const UV input_uv = uv ;
2077
2077
U32 error_flags_return = 0 ;
2078
2078
AV * msgs_return = NULL ;
2079
+ Size_t super_msgs_count = 0 ;
2079
2080
2080
2081
/* The conditions that are rejected by default are the ones for which
2081
2082
* you need a flag to accept. There is a good reason for them being
@@ -2155,10 +2156,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2155
2156
2156
2157
U32 this_flag_bit = this_problem ;
2157
2158
2158
- /* All cases but these two set this; it makes the cases simpler
2159
- * to do it here */
2160
- error_flags_return |= this_problem & ~( UTF8_GOT_PERL_EXTENDED
2161
- |UTF8_GOT_SUPER );
2159
+ /* All cases set this */
2160
+ error_flags_return |= this_problem ;
2162
2161
2163
2162
/* Turn off so next iteration doesn't retry this */
2164
2163
possible_problems &= ~this_problem ;
@@ -2356,213 +2355,111 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2356
2355
2357
2356
break ;
2358
2357
2359
- /* The remaining cases all involve non-Unicode code points.
2360
- * These come in three increasingly restrictive flavors.
2361
- * SUPERs are simply all the ones above Unicode;
2362
- * PERL_EXTENDED_UTF8 are the subset of these that are
2363
- * expressed in a non-standard extension to UTF-8. Unless also
2364
- * overlong, these have a very high ordinal value. Finally
2365
- * OVERFLOWS are for such a high code point that they don't fit
2366
- * into the word size of this platform. Perl extended-UTF-8 is
2367
- * required to express code points this high. So an overflow
2368
- * is a member of all three flavors; besides overflowing, it
2369
- * also is using perl extended UTF-8 and is also plain
2370
- * non-Unicode.
2371
- *
2372
- * There are cases in this switch for each of the three types.
2373
- * Because they are related, there are tests of the input flags
2374
- * to see what combination of these require warnings and/or
2375
- * rejection. And there a jumps between the cases. The task
2376
- * is simpler because the code earlier in the function has set
2377
- * things up so that at most one problem flag bit is set for
2378
- * any of them, the most restrictive case the input matches.
2379
- * Also, for the non-overflow cases, there is no problem flag
2380
- * bit if the caller doesn't want special handling for it.
2381
- *
2382
- * Each type has its own warning category and text,
2383
- * corresponding to the specific problem. Whenever a warning
2384
- * is generated, it uses the one for the most dire type the
2385
- * code point fits into. Suppose the flags say we warn on all
2386
- * non-Unicode code points, but not on overflowing and we get a
2387
- * code point too large for the platform. The generated
2388
- * warning will be the text that says it overflowed, while the
2389
- * returned bit will be for the SUPER type. To accomplish
2390
- * this, the formats are shared between the cases. 'cp_format'
2391
- * is used if there is a specific representable code point that
2392
- * the input translates to; if not, instead a more generic
2393
- * format, 'non_cp_format' is used */
2394
- const char * cp_format ;
2395
- const char * non_cp_format ;
2358
+ /* The final three cases are all closely related. They are
2359
+ * ordered in execution by severity of the corresponding
2360
+ * condition */
2361
+ STATIC_ASSERT_STMT ( UTF8_GOT_OVERFLOW
2362
+ < UTF8_GOT_PERL_EXTENDED );
2363
+ STATIC_ASSERT_STMT (UTF8_GOT_PERL_EXTENDED < UTF8_GOT_SUPER );
2364
+
2365
+ /* And each is a subset of the next. The code does a bit of
2366
+ * setup for each and then jumps to common handling. This
2367
+ * structure comes from the desire to use the most dire warning
2368
+ * suitable for the condition even if the only warning class
2369
+ * that is enabled is a less severe one. It just makes sense
2370
+ * that if someone wants to be warned about all above-Unicode
2371
+ * code points, and this one is so far above that it won't fit
2372
+ * in the platform's word size, that the overflow warning would
2373
+ * be output instead of the more mild one. */
2374
+
2375
+ bool overflows ;
2376
+ bool is_extended ;
2396
2377
2397
2378
case UTF8_GOT_OVERFLOW :
2398
- /* For this overflow case, any format and message text are set
2399
- * up to create the warning for it. If overflows are to be
2400
- * rejected, the warning is simply created, and we break to the
2401
- * end of the switch() (where code common to all cases will
2402
- * finish the job). Otherwise it looks to see if either the
2403
- * perl-extended or plain super cases are supposed to handle
2404
- * things. If so, it jumps into the code of the most
2405
- * restrictive one so that that they will use this more dire
2406
- * warning. If neither handle it, the code just breaks; doing
2407
- * nothing. */
2408
- non_cp_format = MALFORMED_TEXT ": %s (overflows)" ;
2409
-
2410
- /* We can't exactly specify such a large code point, so can't
2411
- * output it */
2412
- cp_format = NULL ;
2413
-
2414
- /* In the unlikely case that the caller has asked to "allow"
2415
- * this malformation, we transfer to the next lower severity of
2416
- * code that handles the case; or just 'break' if none. */
2417
- if (UNLIKELY (flags & UTF8_ALLOW_OVERFLOW )) {
2418
- if (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2419
- |UTF8_WARN_PERL_EXTENDED ))
2420
- {
2421
- this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2422
- goto join_perl_extended ;
2423
- }
2424
- if (flags & (UTF8_DISALLOW_SUPER |UTF8_WARN_SUPER )) {
2425
- this_flag_bit = UTF8_GOT_SUPER ;
2426
- goto join_plain_supers ;
2427
- }
2379
+ COMMON_DEFAULT_REJECTS (ckWARN_d , WARN_NON_UNICODE );
2380
+ overflows = true;
2381
+ is_extended = true;
2382
+ goto super_common ;
2428
2383
2429
- break ;
2430
- }
2384
+ case UTF8_GOT_PERL_EXTENDED :
2385
+ COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_PERL_EXTENDED ,
2386
+ WARN_NON_UNICODE , ckWARN_d ,
2387
+ WARN_PORTABLE );
2388
+ overflows = orig_problems & UTF8_GOT_OVERFLOW ;
2389
+ is_extended = true;
2390
+ goto super_common ;
2431
2391
2432
- /* Here, overflow is disallowed; handle everything in this
2433
- * case: */
2434
-
2435
- /* Overflow is a hybrid. If the word size on this platform
2436
- * were wide enough for this to not overflow, a non-Unicode
2437
- * code point would have been generated. If the caller wanted
2438
- * warnings for such code points, the warning category would be
2439
- * WARN_NON_UNICODE, On the other hand, overflow is considered
2440
- * a malformation, which is serious, and the category would be
2441
- * just WARN_UTF8. We clearly should warn if either category
2442
- * is enabled, but which category to use? Historically, we've
2443
- * used 'utf8' if it is enabled; and that seems like the more
2444
- * severe category, more befitting a malformation. */
2445
- pack_warn = PACK_WARN (WARN_UTF8 , ckWARN_d , WARN_NON_UNICODE );
2446
- if (pack_warn ) {
2447
- message = Perl_form (aTHX_ non_cp_format ,
2448
- _byte_dump_string (s0 , curlen , 0 ));
2449
- }
2392
+ case UTF8_GOT_SUPER :
2393
+ COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_SUPER , WARN_NON_UNICODE ,,);
2394
+ overflows = orig_problems & UTF8_GOT_OVERFLOW ;
2395
+ is_extended = UTF8_IS_PERL_EXTENDED (s0 );
2450
2396
2451
- /* But the API says we flag all errors found that the calling
2452
- * flags indicate should be */
2453
- if (flags & ( UTF8_WARN_PERL_EXTENDED
2454
- |UTF8_DISALLOW_PERL_EXTENDED ))
2455
- {
2456
- error_flags_return |= UTF8_GOT_PERL_EXTENDED ;
2457
- }
2458
- if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2459
- error_flags_return |= UTF8_GOT_SUPER ;
2397
+ super_common :
2398
+ {
2399
+ /* To get here the COMMON macros above determined that a
2400
+ * warning message needs to be generated for this case.
2401
+ * (Otherwise they would have executed a 'continue' statement
2402
+ * to try the next case.). But they don't always catch if a
2403
+ * message has already been generated for the underlying
2404
+ * condition. Skip if so. */
2405
+ if (super_msgs_count ++ ) {
2406
+ continue ;
2460
2407
}
2461
2408
2462
- break ;
2463
-
2464
- case UTF8_GOT_PERL_EXTENDED :
2465
-
2466
- /* We get here when the input uses Perl extended UTF-8, and the
2467
- * caller has indicated that above-Unicode code points (of
2468
- * which these are a subset) are to be disallowed and/or warned
2469
- * about
2409
+ /* Now generate the message text. We can't include the code
2410
+ * point in it if there isn't a specific one, either because
2411
+ * this overflowed, or there weren't enough bytes to form a
2412
+ * complete character.
2470
2413
*
2471
- * Set up the formats. We can include the code point in the
2472
- * message if we have an exact one (input not too short) and
2473
- * it's not an overlong that reduces down to something too low.
2474
- * (Otherwise, the message could say something untrue like
2475
- * "Code point 0x41 is not Unicode ...". But this would be a
2476
- * lie; 0x41 is Unicode. It was expressed in a non-standard
2477
- * form of UTF-8 that Unicode doesn't approve of.) */
2478
- cp_format = ( (orig_problems & (UTF8_GOT_TOO_SHORT ))
2479
- || ! UNICODE_IS_PERL_EXTENDED (input_uv ))
2480
- ? NULL
2481
- : PL_extended_cp_format ;
2482
- non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2483
- " is a Perl extension, and so is not portable" ;
2484
-
2485
- /* We know here that the caller indicated at least one of the
2486
- * EXTENDED or SUPER flags. If it's not EXTENDED, use SUPER */
2487
- if (! (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2488
- |UTF8_WARN_PERL_EXTENDED )))
2489
- {
2490
- this_flag_bit = UTF8_GOT_SUPER ;
2414
+ * We also can't include it if the resultant message would be
2415
+ * misleading. This can happen when a sequence is an overlong,
2416
+ * using Perl extended UTF-8. That could evaluate to a
2417
+ * character in the Unicode range, say the letter "A"; we don't
2418
+ * want a message saying that "A" isn't Unicode, because this
2419
+ * would be a lie. "A" definitely is Unicode. It was just
2420
+ * expressed in a non-standard form of UTF-8 that we warn
2421
+ * about. If the sequence uses extended UTF-8 but the
2422
+ * resulting code point isn't for above Unicode, we know we
2423
+ * have this situation. */
2424
+
2425
+ if (overflows ) {
2426
+ message = Perl_form (aTHX_ "%s: %s (overflows)" ,
2427
+ malformed_text ,
2428
+ _byte_dump_string (s0 , curlen , 0 ));
2491
2429
}
2492
-
2493
- join_perl_extended :
2494
-
2495
- /* Here this level is to warn, reject, or both. The format has
2496
- * been set up to be for this level, or maybe the overflow
2497
- * case set up a more dire warning and jumped to the label just
2498
- * above (after determining that warning/rejecting here was
2499
- * enabled). We warn at this level if either it is supposed to
2500
- * warn, or plain supers are supposed to. In the latter case,
2501
- * we get this higher severity warning */
2502
- if (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER )) {
2503
- error_flags_return |= this_flag_bit ;
2504
-
2505
- /* These code points are non-portable, so warn if either
2506
- * category is enabled */
2507
- if (PACK_WARN (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE )) {
2508
- if (cp_format ) {
2509
- message = Perl_form (aTHX_ cp_format , input_uv );
2510
- }
2511
- else {
2512
- message = Perl_form (aTHX_
2513
- non_cp_format ,
2514
- _byte_dump_string (s0 , curlen , 0 ));
2515
- }
2430
+ else if ( (orig_problems & UTF8_GOT_TOO_SHORT )
2431
+ || ( UTF8_IS_PERL_EXTENDED (s0 )
2432
+ && ! UNICODE_IS_SUPER (input_uv )))
2433
+ {
2434
+ if (is_extended ) {
2435
+ message = Perl_form (aTHX_
2436
+ "Any UTF-8 sequence that starts with"
2437
+ " \"%s\" is a Perl extension, and so"
2438
+ " is not portable" ,
2439
+ _byte_dump_string (s0 , curlen , 0 ));
2440
+ }
2441
+ else {
2442
+ message = Perl_form (aTHX_
2443
+ "Any UTF-8 sequence that starts with"
2444
+ " \"%s\" is for a non-Unicode code"
2445
+ " point, may not be portable" ,
2446
+ _byte_dump_string (s0 , curlen , 0 ));
2516
2447
}
2517
2448
}
2518
-
2519
- /* Similarly if either of the two levels reject this, do it */
2520
- if (flags & (UTF8_DISALLOW_PERL_EXTENDED |UTF8_DISALLOW_SUPER )) {
2521
- error_flags_return |= this_flag_bit ;
2449
+ else if (is_extended ) {
2450
+ message = Perl_form (aTHX_ PL_extended_cp_format , input_uv );
2522
2451
}
2523
-
2524
- break ;
2525
-
2526
- case UTF8_GOT_SUPER :
2527
-
2528
- /* We get here when the input is for an above Unicode code
2529
- * point, but it does not use Perl extended UTF-8, and the
2530
- * caller has indicated that these are to be disallowed and/or
2531
- * warned about */
2532
-
2533
- non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2534
- " is for a non-Unicode code point, may not be"
2535
- " portable" ;
2536
-
2537
- /* We can include the code point in the message if we have an
2538
- * exact one (input not too short) */
2539
- cp_format = (orig_problems & (UTF8_GOT_TOO_SHORT ))
2540
- ? NULL
2541
- : super_cp_format ;
2542
-
2543
- join_plain_supers :
2544
-
2545
- /* Here this level is to warn, reject, or both. The format has
2546
- * been set up to be for this level, or maybe the overflow
2547
- * case set up a more dire warning and jumped to the label just
2548
- * above (after determining that warning/rejecting here was
2549
- * enabled). */
2550
- error_flags_return |= this_flag_bit ;
2551
- if (flags & UTF8_WARN_SUPER ) {
2552
- if (PACK_WARN (WARN_NON_UNICODE ,,)) {
2553
- if (cp_format ) {
2554
- message = Perl_form (aTHX_ cp_format , input_uv );
2555
- }
2556
- else {
2557
- message = Perl_form (aTHX_
2558
- non_cp_format ,
2559
- _byte_dump_string (s0 , curlen , 0 ));
2560
- }
2561
- }
2452
+ else {
2453
+ message = Perl_form (aTHX_ super_cp_format , input_uv );
2562
2454
}
2563
2455
2564
- break ;
2456
+ /* This message only needs to output once. Ww can potentially
2457
+ * save some loop iterations by turning off looking for
2458
+ * warnings for it. */
2459
+ flags &= ~(UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER );
2565
2460
2461
+ break ;
2462
+ }
2566
2463
} /* End of switch() on the possible problems */
2567
2464
2568
2465
/* Display or save the message (if any) for the problem being
0 commit comments