Skip to content

Commit 94a5ebf

Browse files
committed
Remove some special EBCDIC code
The 'variant_byte_number' function was written to find the byte number, within a word, of the first byte whose meaning varies depending on whether the string it is part of is encoded in UTF-8 or not. On ASCII machines, that is simply the first byte whose upper bit is set. On EBCDIC machines, there is no similar pattern, so this function hasn't been compiled on those. A long time ago, I realized that this function could also handle binary data: coerce that data so that the upper bit of each byte is set or not depending on the pattern being looked for, and then call the function. But I hadn't actually realized until now that what was being worked on was binary data not tied to any character set. This commit rectifies that. A new alias is added for the function that emphasizes that it works on binary data, the function is now compiled for EBCDIC, and the EBCDIC-only code that avoided using it is now removed.
1 parent d734970 commit 94a5ebf

File tree

5 files changed

+26
-47
lines changed

5 files changed

+26
-47
lines changed

embed.fnc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4010,6 +4010,8 @@ CRTdip |UV |valid_utf8_to_uv \
40104010
CRTdmp |UV |valid_utf8_to_uvchr \
40114011
|NN const U8 *s \
40124012
|NULLOK STRLEN *retlen
4013+
CRTip |unsigned int|variant_byte_number \
4014+
|PERL_UINTMAX_T word
40134015
Adp |int |vcmp |NN SV *lhv \
40144016
|NN SV *rhv
40154017
Adpr |void |vcroak |NULLOK const char *pat \
@@ -4108,10 +4110,6 @@ TXp |void |set_padlist |NN CV *cv \
41084110
: Used in sv.c
41094111
p |void |dump_sv_child |NN SV *sv
41104112
#endif
4111-
#if !defined(EBCDIC)
4112-
CRTip |unsigned int|variant_byte_number \
4113-
|PERL_UINTMAX_T word
4114-
#endif
41154113
#if defined(F_FREESP) && !defined(HAS_CHSIZE) && !defined(HAS_TRUNCATE)
41164114
ARdp |I32 |my_chsize |int fd \
41174115
|Off_t length

embed.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,7 @@
848848
# define valid_identifier_sv(a) Perl_valid_identifier_sv(aTHX_ a)
849849
# define valid_utf8_to_uv Perl_valid_utf8_to_uv
850850
# define Perl_valid_utf8_to_uvchr valid_utf8_to_uvchr
851+
# define variant_byte_number Perl_variant_byte_number
851852
# define vcmp(a,b) Perl_vcmp(aTHX_ a,b)
852853
# define vcroak(a,b) Perl_vcroak(aTHX_ a,b)
853854
# define vdeb(a,b) Perl_vdeb(aTHX_ a,b)
@@ -874,9 +875,6 @@
874875
# define pad_setsv(a,b) Perl_pad_setsv(aTHX_ a,b)
875876
# define pad_sv(a) Perl_pad_sv(aTHX_ a)
876877
# endif
877-
# if !defined(EBCDIC)
878-
# define variant_byte_number Perl_variant_byte_number
879-
# endif
880878
# if defined(F_FREESP) && !defined(HAS_CHSIZE) && !defined(HAS_TRUNCATE)
881879
# define my_chsize(a,b) Perl_my_chsize(aTHX_ a,b)
882880
# endif

inline.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2008,13 +2008,20 @@ Perl_single_1bit_pos32(U32 word)
20082008

20092009
}
20102010

2011-
#ifndef EBCDIC
2011+
/* Returns the byte number of the lowest-numbered byte whose uppermost bit is
2012+
* set */
2013+
#define first_upper_bit_set_byte_number(word) Perl_variant_byte_number(word)
20122014

20132015
PERL_STATIC_INLINE unsigned int
20142016
Perl_variant_byte_number(PERL_UINTMAX_T word)
20152017
{
2016-
/* This returns the position in a word (0..7) of the first variant byte in
2017-
* it. This is a helper function. Note that there are no branches */
2018+
/* This returns the position in a word (0..7) of the first byte whose
2019+
* uppermost bit is set. On ASCII boxes, this is equivalent to the first
2020+
* byte whose representation is different in UTF-8 vs not, hence the name
2021+
* and text in the comments. It was only later that this was used for
2022+
* binary data, not tied to the character set.
2023+
*
2024+
* This is a helper function. Note that there are no branches */
20182025

20192026
/* Get just the msb bits of each byte */
20202027
word &= PERL_VARIANTS_WORD_MASK;
@@ -2023,7 +2030,7 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
20232030
* word */
20242031
assert(word);
20252032

2026-
# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
2033+
#if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
20272034

20282035
/* Bytes are stored like
20292036
* Byte8 ... Byte2 Byte1
@@ -2036,7 +2043,7 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
20362043
* to 0..7 */
20372044
return (unsigned int) ((word + 1) >> 3) - 1;
20382045

2039-
# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
2046+
#elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
20402047

20412048
/* Bytes are stored like
20422049
* Byte1 Byte2 ... Byte8
@@ -2069,11 +2076,10 @@ Perl_variant_byte_number(PERL_UINTMAX_T word)
20692076
/* If all else fails, it's better to return a definite value than a random one */
20702077
return 0;
20712078

2072-
# endif
2079+
#endif
20732080

20742081
}
20752082

2076-
#endif
20772083
#if defined(PERL_CORE) || defined(PERL_EXT)
20782084

20792085
/*

proto.h

Lines changed: 5 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

regexec.c

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -653,15 +653,9 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
653653
continue;
654654
}
655655

656-
/* Here, at least one byte in the word isn't 'span_byte'. */
657-
658-
#ifdef EBCDIC
659-
660-
break;
661-
662-
#else
663-
664-
/* This xor leaves 1 bits only in those non-matching bytes */
656+
/* Here, at least one byte in the word isn't 'span_byte'.
657+
*
658+
* This xor leaves 1 bits only in those non-matching bytes */
665659
span_word ^= * (PERL_UINTMAX_T *) s;
666660

667661
/* Make sure the upper bit of each non-matching byte is set. This
@@ -671,10 +665,7 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
671665
span_word |= span_word << 4;
672666

673667
/* That reduces the problem to what this function solves */
674-
return s + variant_byte_number(span_word);
675-
676-
#endif
677-
668+
return s + first_upper_bit_set_byte_number(span_word);
678669
} while (s + PERL_WORDSIZE <= send);
679670
}
680671

@@ -789,20 +780,11 @@ S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
789780
continue;
790781
}
791782

792-
#ifdef EBCDIC
793-
794-
break;
795-
796-
#else
797-
798783
masked ^= span_word;
799784
masked |= masked << 1;
800785
masked |= masked << 2;
801786
masked |= masked << 4;
802-
return s + variant_byte_number(masked);
803-
804-
#endif
805-
787+
return s + first_upper_bit_set_byte_number(masked);
806788
} while (s + PERL_WORDSIZE <= send);
807789
}
808790

0 commit comments

Comments
 (0)