Skip to content

Commit 2b72566

Browse files
committed
Use new WORTH_PER_WORD_LOOP()
This converts the places that could benefit from this new macro (and its kin) to use them.
1 parent 2d92a68 commit 2b72566

File tree

3 files changed

+27
-92
lines changed

3 files changed

+27
-92
lines changed

inline.h

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1573,25 +1573,13 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
15731573
len = strlen((const char *)s);
15741574
}
15751575

1576-
1577-
#ifndef EBCDIC
1578-
15791576
/* Do word-at-a-time processing iff there is at least one usable full word. That
15801577
* means that after advancing to a word boundary, there still is at least a
1581-
* full word left. The number of bytes needed to advance is 'wordsize -
1582-
* offset' unless offset is 0. */
1583-
if ((STRLEN) (send - x) >= PERL_WORDSIZE
1584-
1585-
/* This term is wordsize if subword; 0 if not */
1586-
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1587-
1588-
/* 'offset' */
1589-
- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1590-
{
1578+
* full word left. */
1579+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP(x, send, 1);
15911580

1592-
/* Process per-byte until reach word boundary. XXX This loop could be
1593-
* eliminated if we knew that this platform had fast unaligned reads */
1594-
while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1581+
if (per_byte_end) {
1582+
while (x < per_byte_end ) {
15951583
if (! UTF8_IS_INVARIANT(*x)) {
15961584
if (ep) {
15971585
*ep = x;
@@ -1633,8 +1621,6 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
16331621
} while (x + PERL_WORDSIZE <= send);
16341622
}
16351623

1636-
#endif /* End of ! EBCDIC */
1637-
16381624
/* Process per-byte. (Can't use libc functions like strpbrk() because
16391625
* input isn't necessarily a C string) */
16401626
while (x < send) {
@@ -2116,18 +2102,11 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e)
21162102
const U8* x = s;
21172103
Size_t count = 0;
21182104

2119-
# ifndef EBCDIC
2120-
21212105
/* Test if the string is long enough to use word-at-a-time. (Logic is the
21222106
* same as for is_utf8_invariant_string()) */
2123-
if ((STRLEN) (e - x) >= PERL_WORDSIZE
2124-
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
2125-
- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
2126-
{
2127-
2128-
/* Process per-byte until reach word boundary. XXX This loop could be
2129-
* eliminated if we knew that this platform had fast unaligned reads */
2130-
while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
2107+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP(x, e, 1);
2108+
if (per_byte_end) {
2109+
while (x < per_byte_end ) {
21312110
count += ! UTF8_IS_INVARIANT(*x++);
21322111
}
21332112

@@ -2143,8 +2122,6 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e)
21432122
} while (x + PERL_WORDSIZE <= e);
21442123
}
21452124

2146-
# endif
2147-
21482125
/* Process per-byte */
21492126
while (x < e) {
21502127
if (! UTF8_IS_INVARIANT(*x)) {

regexec.c

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -632,14 +632,9 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
632632
* 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
633633
* */
634634

635-
if ((STRLEN) (send - s) >= PERL_WORDSIZE
636-
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
637-
- (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
638-
{
639-
640-
/* Process per-byte until reach word boundary. XXX This loop could be
641-
* eliminated if we knew that this platform had fast unaligned reads */
642-
while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
635+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP_BINMODE(s, send, 1);
636+
if (per_byte_end) {
637+
while (s < per_byte_end ) {
643638
if (*s != span_byte) {
644639
return s;
645640
}
@@ -705,14 +700,9 @@ S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
705700
* returns 'send' if none found. It uses word-level operations instead of
706701
* byte to speed up the process */
707702

708-
#ifndef EBCDIC
709-
710-
if ((STRLEN) (send - s) >= PERL_WORDSIZE
711-
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
712-
- (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
713-
{
714-
715-
while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
703+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP(s, send, 1);
704+
if (per_byte_end) {
705+
while (s < per_byte_end ) {
716706
if (((*s) & mask) == byte) {
717707
return s;
718708
}
@@ -757,8 +747,6 @@ S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
757747
} while (s + PERL_WORDSIZE <= send);
758748
}
759749

760-
#endif
761-
762750
while (s < send) {
763751
if (((*s) & mask) == byte) {
764752
return s;
@@ -781,12 +769,9 @@ S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
781769
* function. Returns 'send' if none found. Works like find_span_end(),
782770
* except for the AND */
783771

784-
if ((STRLEN) (send - s) >= PERL_WORDSIZE
785-
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
786-
- (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
787-
{
788-
789-
while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
772+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP_BINMODE(s, send, 1);
773+
if (per_byte_end) {
774+
while (s < per_byte_end ) {
790775
if (((*s) & mask) != span_byte) {
791776
return s;
792777
}

utf8.c

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2635,13 +2635,8 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
26352635
* cachegrind). The number isn't critical, as at these sizes, the total
26362636
* time spent isn't large either way */
26372637

2638-
#ifndef EBCDIC
2639-
2640-
if (e - s0 < 96)
2641-
2642-
#endif
2643-
2644-
{
2638+
const U8 * const per_byte_end = WORTH_PER_WORD_LOOP(s0, e, 12);
2639+
if (! per_byte_end) {
26452640
while (s < e) { /* Count characters directly */
26462641

26472642
/* Take extra care to not exceed 'e' (which would be undefined
@@ -2670,30 +2665,23 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
26702665
return s - s0;
26712666
}
26722667

2673-
#ifndef EBCDIC
2674-
26752668
/* Count continuations, word-at-a-time.
26762669
*
26772670
* We need to stop before the final start character in order to
26782671
* preserve the limited error checking that's always been done */
26792672
const U8 * e_limit = e - UTF8_MAXBYTES;
26802673

2681-
/* Points to the first byte >=s which is positioned at a word boundary. If
2682-
* s is on a word boundary, it is s, otherwise it is to the next word. */
2683-
const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2684-
- (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2685-
26862674
/* Process up to a full word boundary. */
2687-
while (s < partial_word_end) {
2675+
while (s < per_byte_end ) {
26882676
const Size_t skip = UTF8SKIP(s);
26892677

26902678
continuations += skip - 1;
26912679
s += skip;
26922680
}
26932681

26942682
/* Adjust back down any overshoot */
2695-
continuations -= s - partial_word_end;
2696-
s = partial_word_end;
2683+
continuations -= s - per_byte_end;
2684+
s = per_byte_end;
26972685

26982686
do { /* Process per-word */
26992687

@@ -2742,8 +2730,6 @@ Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
27422730
break;
27432731
}
27442732

2745-
# endif
2746-
27472733
if (LIKELY(e == s)) {
27482734
return s - s0 - continuations;
27492735
}
@@ -3014,15 +3000,11 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30143000
Size_t invariant_length = first_variant - s0;
30153001
Size_t variant_count = 0;
30163002

3017-
#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
3018-
3019-
/* Do a first pass through the string to see if it actually is translatable
3020-
* into bytes, and if so, how big the result is. On long strings this is
3021-
* done a word at a time, so is relatively quick. (There is some
3022-
* start-up/tear-down overhead with the per-word algorithm, so no real gain
3003+
/* There is some start-up/tear-down overhead with the per-word algorithm, so no real gain
30233004
* unless the remaining portion of the string is long enough. The current
3024-
* value is just a guess.) On EBCDIC, it's always per-byte. */
3025-
if ((send - s) > (ptrdiff_t) (5 * PERL_WORDSIZE)) {
3005+
* value is just a guess. */
3006+
U8 * const per_byte_end = WORTH_PER_WORD_LOOP(s, send, 5);
3007+
if (per_byte_end) {
30263008

30273009
/* If the string contains any start byte besides C2 and C3, then it
30283010
* isn't translatable into bytes */
@@ -3031,15 +3013,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30313013
const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
30323014
const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
30333015

3034-
/* Points to the first byte >=s which is positioned at a word boundary.
3035-
* If s is on a word boundary, it is s, otherwise it is the first byte
3036-
* of the next word. */
3037-
U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
3038-
- (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
3039-
3040-
/* Here there is at least a full word beyond the first word boundary.
3041-
* Process up to that boundary. */
3042-
while (s < partial_word_end) {
3016+
while (s < per_byte_end ) {
30433017
if (! UTF8_IS_INVARIANT(*s)) {
30443018
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
30453019
return false;
@@ -3053,7 +3027,7 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
30533027
}
30543028

30553029
/* Adjust back down any overshoot */
3056-
s = partial_word_end;
3030+
s = per_byte_end;
30573031

30583032
/* Process per-word */
30593033
do {
@@ -3116,7 +3090,6 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp, void ** free_me,
31163090
}
31173091
}
31183092

3119-
#endif
31203093
/* Do the straggler bytes beyond what the loop above did */
31213094
while (s < send) {
31223095
if (! UTF8_IS_INVARIANT(*s)) {

0 commit comments

Comments
 (0)