Skip to content

Commit 8785c11

Browse files
committed
parser.h Allow up to 256 characters in a token
This is already the claimed allowed length. But that is a lie, until this commit. Instead, the buffer has been 256 bytes long, which means, we can have 256 1-byte characters in an identifier; but only 128 2-byte ones, etc. Unicode can have 4-byte identifier characters, so our limit has really been just 64 for those. The direction perl is supposed to be going, according to perldiag, is to eliminate any identifier length limit. I don't feel the urge to do that now, but simply increasing the buffer size to accommodate any 256 Unicode identifier characters causes us to meet our claim. The trickiest part of this by far was to get parser.t to pass, which contrary to perldiag, tests very specifically about identifiers just shy of 256. One thing it does is to create a long string. I just replaced every character in it by 4 repeats, and then split into shorter lines.
1 parent 7e81f19 commit 8785c11

File tree

2 files changed

+75
-36
lines changed

2 files changed

+75
-36
lines changed

parser.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ typedef struct yy_parser {
112112
U8 lex_fakeeof; /* precedence at which to fake EOF */
113113
U8 lex_flags;
114114
COP *saved_curcop; /* the previous PL_curcop */
115-
char tokenbuf[256];
115+
char tokenbuf[ 256 * MAX_UNICODE_UTF8_BYTES ];
116116
line_t herelines; /* number of lines in here-doc */
117117
line_t preambling; /* line # when processing $ENV{PERL5DB} */
118118

t/comp/parser.t

Lines changed: 74 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -301,48 +301,73 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' );
301301
# RT #70934
302302
# check both the specific case in the ticket, and a few other paths into
303303
# S_scan_ident()
304-
# simplify long ids
305-
my $x100 = "x" x 256;
306-
my $xFE = "x" x 254;
307-
my $xFD = "x" x 253;
308-
my $xFC = "x" x 252;
309-
my $xFB = "x" x 251;
310-
311-
eval qq[ \$#$xFB ];
312-
is($@, "", "251 character \$# sigil ident ok");
313-
eval qq[ \$#$xFC ];
304+
# The upper length limit for a token is 255 characters. But some Unicode
305+
# characters in UTF-8 take 4 (and even 5 on EBCDIC) bytes each. Create a
306+
# close to maximum length identifier in $plus1. (The reason it is done
307+
# this way is to use the 5-byte code points in EBCDIC, but no such character
308+
# as of Unicode 17.0 is an identifier start character; when they start
309+
# assigning U+40000 is when there would be a 5-byte EBCDIC IDStart
310+
# character.)
311+
my $continuation = "\x{E0100}";
312+
my $plus1 = "\x{104B0}" . ($continuation x (255 - 1));
313+
314+
# Here $plus1 has been populated with a 4-byte Identifier Start character,
315+
# and 254 continuation characters, each containing the most possible bytes
316+
# available on this platform. This leaves space for a 256th character
317+
# containing that maximum number of bytes. Instead of using that, we fill
318+
# it to the brim with single-byte characters that we can chop off for the
319+
# tests below. We need to calculate in byte lengths.
320+
my $continuation_as_bytes = $continuation;
321+
utf8::encode($continuation_as_bytes);
322+
my $continuation_length = length $continuation_as_bytes;
323+
324+
my $plus1_as_bytes = $plus1;
325+
utf8::encode($plus1_as_bytes);
326+
my $plus1_length = length $plus1_as_bytes;
327+
328+
my $capacity = 256 * $continuation_length;
329+
my $fill = $capacity - $plus1_length;
330+
331+
$plus1 .= 'x' x $fill;
332+
my $minus1 = substr $plus1, 0, -2;
333+
my $minus2 = substr $minus1, 0, -1;
334+
my $minus3 = substr $minus2, 0, -1;
335+
my $minus4 = substr $minus3, 0, -1;
336+
337+
eval qq[ \$#$minus4 ];
338+
is($@, "", "minus4 character \$# sigil ident ok");
339+
eval qq[ \$#$minus3 ];
314340
like($@, qr/Identifier too long/, "too long id in \$# sigil ctx");
315341

316-
eval qq[ \$$xFB ];
317-
is($@, "", "251 character \$ sigil ident ok");
318-
eval qq[ \$$xFC ];
342+
eval qq[ \$$minus4 ];
343+
is($@, "", "minus4 character \$ sigil ident ok");
344+
eval qq[ \$$minus3 ];
319345
like($@, qr/Identifier too long/, "too long id in \$ sigil ctx");
320346

321-
eval qq[ %$xFB ];
322-
is($@, "", "251 character % sigil ident ok");
323-
eval qq[ %$xFC ];
347+
eval qq[ %$minus4 ];
348+
is($@, "", "minus4 character % sigil ident ok");
349+
eval qq[ %$minus3 ];
324350
like($@, qr/Identifier too long/, "too long id in % sigil ctx");
325351

326-
eval qq[ \\&$xFB ]; # take a ref since I don't want to call it
327-
is($@, "", "251 character & sigil ident ok");
328-
eval qq[ \\&$xFC ];
352+
eval qq[ \\&$minus4 ]; # take a ref since I don't want to call it
353+
is($@, "", "minus4 character & sigil ident ok");
354+
eval qq[ \\&$minus3 ];
329355
like($@, qr/Identifier too long/, "too long id in & sigil ctx");
330356

331-
eval qq[ *$xFC ];
332-
is($@, "", "252 character glob ident ok");
333-
eval qq[ *$xFD ];
357+
eval qq[ *$minus3 ];
358+
is($@, "", "minus3 character glob ident ok");
359+
eval qq[ *$minus2 ];
334360
like($@, qr/Identifier too long/, "too long id in glob ctx");
335361

336-
eval qq[ for $xFC ];
362+
eval qq[ for $minus3 ];
337363
like($@, qr/^Missing \$ on loop variable /,
338-
"252 char id ok, but a different error");
339-
eval qq[ for $xFD; ];
364+
"minus3 char id ok, but a different error");
365+
eval qq[ for $minus2; ];
340366
like($@, qr/^Missing \$ on loop variable /, "too long id in for ctx");
341367

342368
# the specific case from the ticket
343369
# however the parsing code in yyl_foreach has now changed
344-
my $x = "x" x 257;
345-
eval qq[ for $x ];
370+
eval qq[ for $plus1 ];
346371
like($@, qr/^Missing \$ on loop variable /, "too long id ticket case");
347372

348373
# as PL_tokenbuf is now PL_parser->tokenbuf, the "buffer overflow" that was
@@ -354,8 +379,7 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' );
354379
# the buggy change to the calculation of the variable `e` in scan_word()
355380
# instead.
356381

357-
my $x = "x" x 260;
358-
eval qq[ for my $x \$foo ];
382+
eval qq[ for my $plus1 \$foo ];
359383
like($@, qr/at \(eval \d+\) line 1[,.]/, "line number is reported correctly");
360384
}
361385

@@ -509,12 +533,27 @@ BEGIN{ ${"_<".__FILE__} = \1 }
509533
is __FILE__, $file,
510534
'no __FILE__ corruption when setting CopFILESV to a ref';
511535
512-
eval 'Fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
513-
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
514-
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
515-
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
516-
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
517-
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo';
536+
eval 'Ffffooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
537+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
538+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
539+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
540+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
541+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
542+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
543+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
544+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
545+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
546+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
547+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
548+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
549+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
550+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
551+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
552+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
553+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
554+
.'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
555+
.'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo';
556+
518557
like $@, "^Identifier too long at ", 'ident buffer overflow';
519558
520559
eval 'for my a1b $i (1) {}';

0 commit comments

Comments
 (0)