@@ -2399,14 +2399,16 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2399
2399
2400
2400
#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
2401
2401
2402
- /* There is some start-up/tear-down overhead with this, so no real gain
2402
+ /* Do a first pass through the string to see if it actually is translatable
2403
+ * into bytes. On long strings this is
2404
+ * done a word at a time, so is relatively quick. (There is some
2405
+ * start-up/tear-down overhead with the per-word algorithm, so no real gain
2403
2406
* unless the remaining portion of the string is long enough. The current
2404
- * value is just a guess. */
2407
+ * value is just a guess.) On EBCDIC, it's always per-byte. */
2405
2408
if ((send - s ) > (ptrdiff_t ) (5 * PERL_WORDSIZE )) {
2406
2409
2407
- /* First, go through the string a word at-a-time to verify that it is
2408
- * downgradable. If it contains any start byte besides C2 and C3, then
2409
- * it isn't. */
2410
+ /* If the string contains any start byte besides C2 and C3, then it
2411
+ * isn't translatable into bytes */
2410
2412
2411
2413
const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0 ;
2412
2414
const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2 ;
@@ -2490,9 +2492,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2490
2492
}
2491
2493
2492
2494
#endif
2493
-
2494
- /* Do the straggler bytes beyond the final word boundary (or all bytes
2495
- * in the case of EBCDIC) */
2495
+ /* Do the straggler bytes beyond what the per-word loop (if used) did */
2496
2496
while (s < send ) {
2497
2497
if (! UTF8_IS_INVARIANT (* s )) {
2498
2498
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE (s , send )) {
@@ -2504,19 +2504,18 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2504
2504
s ++ ;
2505
2505
}
2506
2506
2507
- /* Here, we passed the tests above. For the EBCDIC case, everything
2508
- * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC,
2509
- * it means only that all start bytes were C2 or C3, hence any
2510
- * well-formed sequences are downgradable. But we didn't test, for
2511
- * example, that there weren't two C2's in a row. That means that in
2512
- * the loop below, we have to be sure things are well-formed. Because
2513
- * this is very very likely, and we don't care about having speedy
2514
- * handling of malformed input, the loop proceeds as if well formed,
2515
- * and should a malformed one come along, it undoes what it already has
2516
- * done */
2517
-
2518
2507
U8 * d = s = first_variant ;
2519
2508
2509
+ /* For the cases where the per-word algorithm wasn't used, everything is
2510
+ * well-formed and can definitely be translated. When the per-word
2511
+ * algorithm was used, it found that all start bytes in the string were C2
2512
+ * or C3, hence any well-formed sequences are convertible to bytes. But we
2513
+ * didn't test, for example, that there weren't two C2's in a row. That
2514
+ * means that in the loop below, we have to be sure things are well-formed.
2515
+ * Because it is very very unlikely that we got this far for something
2516
+ * malformed, and because we prioritize speed in the normal case over the
2517
+ * malformed one, we go ahead and do the translation, and undo it if found
2518
+ * to be necessary. */
2520
2519
while (s < send ) {
2521
2520
U8 c = * s ++ ;
2522
2521
if (! UVCHR_IS_INVARIANT (c )) {
@@ -2548,12 +2547,11 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2548
2547
2549
2548
cant_convert : ;
2550
2549
2551
- /* Here, it is malformed. This shouldn't happen on EBCDIC, and on ASCII
2552
- * platforms, we know that the only start bytes in the text are C2 and C3,
2553
- * and the code above has made sure that it doesn't end with a start byte.
2554
- * That means the only malformations that are possible are a start byte
2555
- * without a continuation (either followed by another start byte or an
2556
- * invariant) or an unexpected continuation.
2550
+ /* Here, we found a malformation in the input. This won't happen except
2551
+ * when the per-word algorithm was used in the first pass, because that may
2552
+ * miss some malformations. It determined that the only start bytes in the
2553
+ * text are C2 and C3, but didn't examine it to make sure each of those was
2554
+ * followed by precisely one continuation, for example.
2557
2555
*
2558
2556
* We have to undo all we've done before, back down to the first UTF-8
2559
2557
* variant. Note that each 2-byte variant we've done so far (converted to
0 commit comments