@@ -301,48 +301,73 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' );
301301 # RT #70934
302302 # check both the specific case in the ticket, and a few other paths into
303303 # S_scan_ident()
304- # simplify long ids
305- my $x100 = " x" x 256 ;
306- my $xFE = " x" x 254 ;
307- my $xFD = " x" x 253 ;
308- my $xFC = " x" x 252 ;
309- my $xFB = " x" x 251 ;
310-
311- eval qq[ \$ #$xFB ] ;
312- is($@ , " " , " 251 character \$ # sigil ident ok" );
313- eval qq[ \$ #$xFC ] ;
304+ # The upper length limit for a token is 255 characters. But some Unicode
305+ # characters in UTF-8 take 4 (and even 5 on EBCDIC) bytes each. Create a
306+ # close to maximum length identifier in $plus1. (The reason it is done
307+ # this way is to use the 5-byte code points in EBCDIC, but no such character
308+ # as of Unicode 17.0 is an identifier start character; when they start
309+ # assigning U+40000 is when there would be a 5-byte EBCDIC IDStart
310+ # character.)
311+ my $continuation = " \x{E0100} " ;
312+ my $plus1 = " \x{104B0} " . ($continuation x (255 - 1));
313+
314+ # Here $plus1 has been populated with a 4-byte Identifier Start character,
315+ # and 254 continuation characters, each containing the most possible bytes
316+ # available on this platform. This leaves space for a 256th character
317+ # containing that maximum number of bytes. Instead of using that, we fill
318+ # it to the brim with single-byte characters that we can chop off for the
319+ # tests below. We need to calculate in byte lengths.
320+ my $continuation_as_bytes = $continuation ;
321+ utf8::encode($continuation_as_bytes );
322+ my $continuation_length = length $continuation_as_bytes ;
323+
324+ my $plus1_as_bytes = $plus1 ;
325+ utf8::encode($plus1_as_bytes );
326+ my $plus1_length = length $plus1_as_bytes ;
327+
328+ my $capacity = 256 * $continuation_length ;
329+ my $fill = $capacity - $plus1_length ;
330+
331+ $plus1 .= ' x' x $fill ;
332+ my $minus1 = substr $plus1 , 0, -2;
333+ my $minus2 = substr $minus1 , 0, -1;
334+ my $minus3 = substr $minus2 , 0, -1;
335+ my $minus4 = substr $minus3 , 0, -1;
336+
337+ eval qq[ \$ #$minus4 ] ;
338+ is($@ , " " , " minus4 character \$ # sigil ident ok" );
339+ eval qq[ \$ #$minus3 ] ;
314340 like($@ , qr / Identifier too long/ , " too long id in \$ # sigil ctx" );
315341
316- eval qq[ \$ $xFB ] ;
317- is($@ , " " , " 251 character \$ sigil ident ok" );
318- eval qq[ \$ $xFC ] ;
342+ eval qq[ \$ $minus4 ] ;
343+ is($@ , " " , " minus4 character \$ sigil ident ok" );
344+ eval qq[ \$ $minus3 ] ;
319345 like($@ , qr / Identifier too long/ , " too long id in \$ sigil ctx" );
320346
321- eval qq[ %$xFB ] ;
322- is($@ , " " , " 251 character % sigil ident ok" );
323- eval qq[ %$xFC ] ;
347+ eval qq[ %$minus4 ] ;
348+ is($@ , " " , " minus4 character % sigil ident ok" );
349+ eval qq[ %$minus3 ] ;
324350 like($@ , qr / Identifier too long/ , " too long id in % sigil ctx" );
325351
326- eval qq[ \\ &$xFB ] ; # take a ref since I don't want to call it
327- is($@ , " " , " 251 character & sigil ident ok" );
328- eval qq[ \\ &$xFC ] ;
352+ eval qq[ \\ &$minus4 ] ; # take a ref since I don't want to call it
353+ is($@ , " " , " minus4 character & sigil ident ok" );
354+ eval qq[ \\ &$minus3 ] ;
329355 like($@ , qr / Identifier too long/ , " too long id in & sigil ctx" );
330356
331- eval qq[ *$xFC ] ;
332- is($@ , " " , " 252 character glob ident ok" );
333- eval qq[ *$xFD ] ;
357+ eval qq[ *$minus3 ] ;
358+ is($@ , " " , " minus3 character glob ident ok" );
359+ eval qq[ *$minus2 ] ;
334360 like($@ , qr / Identifier too long/ , " too long id in glob ctx" );
335361
336- eval qq[ for $xFC ] ;
362+ eval qq[ for $minus3 ] ;
337363 like($@ , qr / ^Missing \$ on loop variable / ,
338- " 252 char id ok, but a different error" );
339- eval qq[ for $xFD ; ] ;
364+ " minus3 char id ok, but a different error" );
365+ eval qq[ for $minus2 ; ] ;
340366 like($@ , qr / ^Missing \$ on loop variable / , " too long id in for ctx" );
341367
342368 # the specific case from the ticket
343369 # however the parsing code in yyl_foreach has now changed
344- my $x = " x" x 257 ;
345- eval qq[ for $x ] ;
370+ eval qq[ for $plus1 ] ;
346371 like($@ , qr / ^Missing \$ on loop variable / , " too long id ticket case" );
347372
348373 # as PL_tokenbuf is now PL_parser->tokenbuf, the "buffer overflow" that was
@@ -354,8 +379,7 @@ like($@, qr/BEGIN failed--compilation aborted/, 'BEGIN 7' );
354379 # the buggy change to the calculation of the variable `e` in scan_word()
355380 # instead.
356381
357- my $x = " x" x 260 ;
358- eval qq[ for my $x \$ foo ] ;
382+ eval qq[ for my $plus1 \$ foo ] ;
359383 like($@ , qr / at \( eval \d +\) line 1[,.]/ , " line number is reported correctly" );
360384}
361385
@@ -509,12 +533,27 @@ BEGIN{ ${"_<".__FILE__} = \1 }
509533is __FILE__, $file ,
510534 'no __FILE__ corruption when setting CopFILESV to a ref';
511535
512- eval 'Fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
513- .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
514- .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
515- .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
516- .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
517- .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo';
536+ eval 'Ffffooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
537+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
538+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
539+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
540+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
541+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
542+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
543+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
544+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
545+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
546+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
547+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
548+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
549+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
550+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
551+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
552+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
553+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
554+ .'oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo'
555+ .'ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo';
556+
518557like $@ , "^Identifier too long at ", 'ident buffer overflow';
519558
520559eval 'for my a1b $i (1) {}';
0 commit comments