diff --git a/parser.h b/parser.h index 931db82f4b95..357cd5694793 100644 --- a/parser.h +++ b/parser.h @@ -112,7 +112,7 @@ typedef struct yy_parser { U8 lex_fakeeof; /* precedence at which to fake EOF */ U8 lex_flags; COP *saved_curcop; /* the previous PL_curcop */ - char tokenbuf[256]; + char tokenbuf[ 256 * MAX_UNICODE_UTF8_BYTES ]; line_t herelines; /* number of lines in here-doc */ line_t preambling; /* line # when processing $ENV{PERL5DB} */ diff --git a/t/comp/parser.t b/t/comp/parser.t index dbd5ecc842bc..3b056f5221fe 100644 --- a/t/comp/parser.t +++ b/t/comp/parser.t @@ -301,48 +301,73 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' ); # RT #70934 # check both the specific case in the ticket, and a few other paths into # S_scan_ident() - # simplify long ids - my $x100 = "x" x 256; - my $xFE = "x" x 254; - my $xFD = "x" x 253; - my $xFC = "x" x 252; - my $xFB = "x" x 251; - - eval qq[ \$#$xFB ]; - is($@, "", "251 character \$# sigil ident ok"); - eval qq[ \$#$xFC ]; + # The upper length limit for a token is 255 characters. But some Unicode + # characters in UTF-8 take 4 (and even 5 on EBCDIC) bytes each. Create a + # close to maximum length identifier in $plus1. (The reason it is done + # this way is to use the 5-byte code points in EBCDIC, but no such character + # as of Unicode 17.0 is an identifier start character; when they start + # assigning U+40000 is when there would be a 5-byte EBCDIC IDStart + # character.) + my $continuation = "\x{E0100}"; + my $plus1 = "\x{104B0}" . ($continuation x (255 - 1)); + + # Here $plus1 has been populated with a 4-byte Identifier Start character, + # and 254 continuation characters, each containing the most possible bytes + # available on this platform. This leaves space for a 256th character + # containing that maximum number of bytes. Instead of using that, we fill + # it to the brim with single-byte characters that we can chop off for the + # tests below. We need to calculate in byte lengths. + my $continuation_as_bytes = $continuation; + utf8::encode($continuation_as_bytes); + my $continuation_length = length $continuation_as_bytes; + + my $plus1_as_bytes = $plus1; + utf8::encode($plus1_as_bytes); + my $plus1_length = length $plus1_as_bytes; + + my $capacity = 256 * $continuation_length; + my $fill = $capacity - $plus1_length; + + $plus1 .= 'x' x $fill; + my $minus1 = substr $plus1, 0, -2; + my $minus2 = substr $minus1, 0, -1; + my $minus3 = substr $minus2, 0, -1; + my $minus4 = substr $minus3, 0, -1; + + eval qq[ \$#$minus4 ]; + is($@, "", "minus4 character \$# sigil ident ok"); + eval qq[ \$#$minus3 ]; like($@, qr/Identifier too long/, "too long id in \$# sigil ctx"); - eval qq[ \$$xFB ]; - is($@, "", "251 character \$ sigil ident ok"); - eval qq[ \$$xFC ]; + eval qq[ \$$minus4 ]; + is($@, "", "minus4 character \$ sigil ident ok"); + eval qq[ \$$minus3 ]; like($@, qr/Identifier too long/, "too long id in \$ sigil ctx"); - eval qq[ %$xFB ]; - is($@, "", "251 character % sigil ident ok"); - eval qq[ %$xFC ]; + eval qq[ %$minus4 ]; + is($@, "", "minus4 character % sigil ident ok"); + eval qq[ %$minus3 ]; like($@, qr/Identifier too long/, "too long id in % sigil ctx"); - eval qq[ \\&$xFB ]; # take a ref since I don't want to call it - is($@, "", "251 character & sigil ident ok"); - eval qq[ \\&$xFC ]; + eval qq[ \\&$minus4 ]; # take a ref since I don't want to call it + is($@, "", "minus4 character & sigil ident ok"); + eval qq[ \\&$minus3 ]; like($@, qr/Identifier too long/, "too long id in & sigil ctx"); - eval qq[ *$xFC ]; - is($@, "", "252 character glob ident ok"); - eval qq[ *$xFD ]; + eval qq[ *$minus3 ]; + is($@, "", "minus3 character glob ident ok"); + eval qq[ *$minus2 ]; like($@, qr/Identifier too long/, "too long id in glob ctx"); - eval qq[ for $xFC ]; + eval qq[ for $minus3 ]; like($@, qr/^Missing \$ on loop variable /, - "252 char id ok, but a different error"); - eval qq[ for $xFD; ]; + "minus3 char id ok, but a different error"); + eval qq[ for $minus2; ]; like($@, qr/^Missing \$ on loop variable /, "too long id in for ctx"); # the specific case from the ticket # however the parsing code in yyl_foreach has now changed - my $x = "x" x 257; - eval qq[ for $x ]; + eval qq[ for $plus1 ]; like($@, qr/^Missing \$ on loop variable /, "too long id ticket case"); # as PL_tokenbuf is now PL_parser->tokenbuf, the "buffer overflow" that was @@ -354,8 +379,7 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' ); # the buggy change to the calculation of the variable `e` in scan_word() # instead. - my $x = "x" x 260; - eval qq[ for my $x \$foo ]; + eval qq[ for my $plus1 \$foo ]; like($@, qr/at \(eval \d+\) line 1[,.]/, "line number is reported correctly"); } @@ -509,12 +533,27 @@ BEGIN{ ${"_<".__FILE__} = \1 } is __FILE__, $file, 'no __FILE__ corruption when setting CopFILESV to a ref'; -eval 'Fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' - .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' - .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' - .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' - .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' - .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'; +eval 'Ffffooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' + .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'; + like $@, "^Identifier too long at ", 'ident buffer overflow'; eval 'for my a1b $i (1) {}'; diff --git a/utf8.h b/utf8.h index 0fe5036ae148..86d010c399da 100644 --- a/utf8.h +++ b/utf8.h @@ -574,6 +574,11 @@ regen/charset_translations.pl. */ + (pos) + ((UTF_CONTINUATION_BYTE_INFO_BITS - 1) - 1)) /* Step fcn */ \ / (UTF_CONTINUATION_BYTE_INFO_BITS - 1)) /* take floor of */ + +/* The maximum number of bytes required to represent any Unicode code point + * 0..0x10FFFF */ +#define MAX_UNICODE_UTF8_BYTES UNISKIP_BY_MSB_(20) + /* Compute the number of UTF-8 bytes required for representing the input uv, * which must be a Unicode, not native value. * @@ -728,7 +733,7 @@ uppercase/lowercase/titlecase/fold into. =cut */ #define UTF8_MAXBYTES_CASE \ - MAX(UTF8_MAXBYTES, UTF8_MAX_FOLD_CHAR_EXPAND * UNISKIP_BY_MSB_(20)) + MAX(UTF8_MAXBYTES, UTF8_MAX_FOLD_CHAR_EXPAND * MAX_UNICODE_UTF8_BYTES) /* Rest of these are attributes of Unicode and perl's internals rather than the * encoding, or happen to be the same in both ASCII and EBCDIC (at least at @@ -1090,7 +1095,7 @@ this macro matches || NATIVE_UTF8_TO_I8(s[1]) >= UTF_FIRST_CONT_BYTE_110000_)) #define UTF8_IS_SUPER(s, e) \ - ((((e) - (s)) >= UNISKIP_BY_MSB_(20) && UTF8_IS_SUPER_NO_CHECK_(s)) \ + ((((e) - (s)) >= MAX_UNICODE_UTF8_BYTES && UTF8_IS_SUPER_NO_CHECK_(s)) \ ? isUTF8_CHAR(s, e) \ : 0)