@@ -2635,13 +2635,8 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
26352635 * cachegrind). The number isn't critical, as at these sizes, the total
26362636 * time spent isn't large either way */
26372637
2638- #ifndef EBCDIC
2639-
2640- if (e - s0 < 96 )
2641-
2642- #endif
2643-
2644- {
2638+ const U8 * const per_byte_end = WORTH_PER_WORD_LOOP (s0 , e , 12 );
2639+ if (! per_byte_end ) {
26452640 while (s < e ) { /* Count characters directly */
26462641
26472642 /* Take extra care to not exceed 'e' (which would be undefined
@@ -2670,30 +2665,23 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
26702665 return s - s0 ;
26712666 }
26722667
2673- #ifndef EBCDIC
2674-
26752668 /* Count continuations, word-at-a-time.
26762669 *
26772670 * We need to stop before the final start character in order to
26782671 * preserve the limited error checking that's always been done */
26792672 const U8 * e_limit = e - UTF8_MAXBYTES ;
26802673
2681- /* Points to the first byte >=s which is positioned at a word boundary. If
2682- * s is on a word boundary, it is s, otherwise it is to the next word. */
2683- const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR (s )
2684- - (PTR2nat (s ) & PERL_WORD_BOUNDARY_MASK );
2685-
26862674 /* Process up to a full word boundary. */
2687- while (s < partial_word_end ) {
2675+ while (s < per_byte_end ) {
26882676 const Size_t skip = UTF8SKIP (s );
26892677
26902678 continuations += skip - 1 ;
26912679 s += skip ;
26922680 }
26932681
26942682 /* Adjust back down any overshoot */
2695- continuations -= s - partial_word_end ;
2696- s = partial_word_end ;
2683+ continuations -= s - per_byte_end ;
2684+ s = per_byte_end ;
26972685
26982686 do { /* Process per-word */
26992687
@@ -2742,8 +2730,6 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
27422730 break ;
27432731 }
27442732
2745- # endif
2746-
27472733 if (LIKELY (e == s )) {
27482734 return s - s0 - continuations ;
27492735 }
@@ -3014,15 +3000,11 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30143000 Size_t invariant_length = first_variant - s0 ;
30153001 Size_t variant_count = 0 ;
30163002
3017- #ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
3018-
3019- /* Do a first pass through the string to see if it actually is translatable
3020- * into bytes, and if so, how big the result is. On long strings this is
3021- * done a word at a time, so is relatively quick. (There is some
3022- * start-up/tear-down overhead with the per-word algorithm, so no real gain
3003+ /* The per-word loop has start-up/tear-down overhead, so no real gain
30233004 * unless the remaining portion of the string is long enough. The current
3024- * value is just a guess.) On EBCDIC, it's always per-byte. */
3025- if ((send - s ) > (ptrdiff_t ) (5 * PERL_WORDSIZE )) {
3005+ * value is just a guess. */
3006+ U8 * const per_byte_end = WORTH_PER_WORD_LOOP (s , send , 5 );
3007+ if (per_byte_end ) {
30263008
30273009 /* If the string contains any start byte besides C2 and C3, then it
30283010 * isn't translatable into bytes */
@@ -3031,15 +3013,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30313013 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2 ;
30323014 const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE ;
30333015
3034- /* Points to the first byte >=s which is positioned at a word boundary.
3035- * If s is on a word boundary, it is s, otherwise it is the first byte
3036- * of the next word. */
3037- U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR (s )
3038- - (PTR2nat (s ) & PERL_WORD_BOUNDARY_MASK );
3039-
3040- /* Here there is at least a full word beyond the first word boundary.
3041- * Process up to that boundary. */
3042- while (s < partial_word_end ) {
3016+ while (s < per_byte_end ) {
30433017 if (! UTF8_IS_INVARIANT (* s )) {
30443018 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE (s , send )) {
30453019 return false;
@@ -3053,7 +3027,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30533027 }
30543028
30553029 /* Adjust back down any overshoot */
3056- s = partial_word_end ;
3030+ s = per_byte_end ;
30573031
30583032 /* Process per-word */
30593033 do {
@@ -3116,7 +3090,6 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
31163090 }
31173091 }
31183092
3119- #endif
31203093 /* Do the straggler bytes beyond what the loop above did */
31213094 while (s < send ) {
31223095 if (! UTF8_IS_INVARIANT (* s )) {
0 commit comments