From 472c3d75600725a2571a2f4df4b8ecb712c5e8c3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 10:34:04 -0600 Subject: [PATCH 01/17] regen/unicode_constants: Create one for SHY The soft hypen is treated specially in toke.c --- regen/unicode_constants.pl | 1 + unicode_constants.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 538a397edce5..8afc21703464 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -1039,6 +1039,7 @@ END U+10FFFF string MAX_UNICODE +SHY native NBSP native NBSP string diff --git a/unicode_constants.h b/unicode_constants.h index 78b401683265..574678d9e10f 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -76,6 +76,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xF4\x8F\xBF\xBF" /* U+10FFFF */ +# define SHY_NATIVE 0xAD /* U+00AD */ # define NBSP_NATIVE 0xA0 /* U+00A0 */ # define NBSP_UTF8 "\xC2\xA0" /* U+00A0 */ @@ -142,6 +143,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xEE\x42\x73\x73\x73" /* U+10FFFF */ +# define SHY_NATIVE 0xCA /* U+00AD */ # define NBSP_NATIVE 0x41 /* U+00A0 */ # define NBSP_UTF8 "\x80\x41" /* U+00A0 */ @@ -208,6 +210,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xEE\x42\x72\x72\x72" /* U+10FFFF */ +# define SHY_NATIVE 0xCA /* U+00AD */ # define NBSP_NATIVE 0x41 /* U+00A0 */ # define NBSP_UTF8 "\x78\x41" /* U+00A0 */ From a8bb10b24417462da4ee64efabbb967aa3565f25 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 11 Oct 2025 05:17:24 -0600 Subject: [PATCH 02/17] S_scan_ident: Convert to flags parameter This is in preparation for passing other options to this function --- embed.fnc | 2 +- proto.h | 2 +- toke.c | 21 ++++++++++++--------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/embed.fnc b/embed.fnc index 3a16e2ffbe43..e15a894a60b3 100644 --- a/embed.fnc +++ b/embed.fnc @@ -6194,7 +6194,7 @@ RS |char * |scan_heredoc |NN char *s S |char * |scan_ident |NN char *s \ |SPTR char *dest \ |EPTR char *dest_end \ - |bool chk_unary + |U32 flags RS |char * |scan_inputsymbol \ |NN char *start RS |char * |scan_pat |NN char *start \ diff --git a/proto.h b/proto.h index c2908b823fc1..80b0c279021d 100644 --- a/proto.h +++ b/proto.h @@ -9500,7 +9500,7 @@ S_scan_heredoc(pTHX_ char *s) assert(s) STATIC char * -S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary); +S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags); # define PERL_ARGS_ASSERT_SCAN_IDENT \ assert(s); assert(dest); assert(dest_end); assert(dest < dest_end) diff --git a/toke.c b/toke.c index f9a20bdd9250..b2a50981acdf 100644 --- a/toke.c +++ b/toke.c @@ -180,6 +180,7 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th #define IDFIRST_ONLY (1 << 3) #define STOP_AT_FIRST_NON_DIGIT (1 << 4) #define CHECK_ONLY (1 << 5) +#define CHECK_UNARY (1 << 6) #ifdef DEBUGGING static const char* const lex_state_names[] = { @@ -4746,7 +4747,7 @@ S_intuit_more(pTHX_ char *s, char *e, * * khw: If what follows can't be an identifier, say it is too * long or is $001, then it must be a charclass */ - scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), FALSE); + scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), 0); len = strlen(tmpbuf); /* khw: This only looks at global variables; lexicals came @@ -5606,8 +5607,7 @@ yyl_dollar(pTHX_ char *s) || memCHRs("{$:+-@", s[2]))) { PL_tokenbuf[0] = '@'; - s = scan_ident(s + 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), - FALSE); + s = scan_ident(s + 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Array length", s, POP_OLDBUFPTR); if (!PL_tokenbuf[1]) PREREF(DOLSHARP); @@ -5617,7 +5617,7 @@ yyl_dollar(pTHX_ char *s) } PL_tokenbuf[0] = '$'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Scalar", s, POP_OLDBUFPTR); if (!PL_tokenbuf[1]) { if (s == PL_bufend) @@ -6282,7 +6282,7 @@ yyl_star(pTHX_ char *s) POSTDEREF(PERLY_STAR); if (PL_expect != XOPERATOR) { - s = scan_ident(s, PL_tokenbuf, C_ARRAY_END(PL_tokenbuf), TRUE); + s = scan_ident(s, PL_tokenbuf, C_ARRAY_END(PL_tokenbuf), CHECK_UNARY); PL_expect = XOPERATOR; force_ident(PL_tokenbuf, PERLY_STAR); if (!*PL_tokenbuf) @@ -6330,7 +6330,7 @@ yyl_percent(pTHX_ char *s) POSTDEREF(PERLY_PERCENT_SIGN); PL_tokenbuf[0] = '%'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); pl_yylval.ival = 0; if (!PL_tokenbuf[1]) { PREREF(PERLY_PERCENT_SIGN); @@ -6867,7 +6867,8 @@ yyl_ampersand(pTHX_ char *s) } PL_tokenbuf[0] = '&'; - s = scan_ident(s - 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), TRUE); + s = scan_ident(s - 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), + CHECK_UNARY); pl_yylval.ival = (OPpENTERSUB_AMPER<<8); if (PL_tokenbuf[1]) @@ -6952,7 +6953,7 @@ yyl_snail(pTHX_ char *s) if (PL_expect == XPOSTDEREF) POSTDEREF(PERLY_SNAIL); PL_tokenbuf[0] = '@'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Array", s, POP_OLDBUFPTR); pl_yylval.ival = 0; if (!PL_tokenbuf[1]) { @@ -10757,7 +10758,7 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR * specific variable name. */ STATIC char * -S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) +S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) { PERL_ARGS_ASSERT_SCAN_IDENT; @@ -10768,6 +10769,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) char * const e = dest_end - 3; /* two-character token, ending NUL */ bool is_utf8 = cBOOL(UTF); line_t orig_copline = 0, tmp_copline = 0; + const bool chk_unary = (flags & CHECK_UNARY); + if (isSPACE(*s) || !*s) s = skipspace(s); From 3fe3a42362a7db9e383a3230cc13a4b736e3b235 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 03:34:46 -0600 Subject: [PATCH 03/17] S_scan_ident: Add some comments, white space, braces This function is complicated, without enough documentation for me to understand the subtleties; I only studied it enough to change things I needed to, or which became obvious to me in the process. Other things remain undocumented by this commit. Some of the white space gives improper indentation which will fit a future commit. This commit also remove redundant parentheses in one statement --- toke.c | 124 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 86 insertions(+), 38 deletions(-) diff --git a/toke.c b/toke.c index b2a50981acdf..8d213b065793 100644 --- a/toke.c +++ b/toke.c @@ -10750,12 +10750,22 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR return s; } -/* scan s and extract an identifier ($var) from it if possible - * into dest. +/* scan 's' and extract an identifier ($var) from it into 'dest' if possible. + * + * Unlike S_parse_ident which looks for the more usual types of identifiers + * (and which this calls if needed), this looks for every possible identifier + * type, such as punctuation ones. + * + * It returns a pointer into the input buffer pointing to just after all the + * bytes this function consumed; or croaks if an invalid identifier is found. + * * XXX: This function has subtle implications on parsing, and * changing how it behaves can cause a variable to change from * being a run time rv2sv call or a compile time binding to a * specific variable name. + * + * Use the CHECK_UNARY flag to cause this to look for ambiguities with unary + * operators. */ STATIC char * S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) @@ -10781,6 +10791,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) d = dest; if (*d) { + /* Here parse_ident() found a digit variable or an identifier (anything valid as a bareword), so job done and return. */ if (PL_lex_state != LEX_NORMAL) @@ -10788,7 +10799,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) return s; } - /* Here, it is not a run-of-the-mill identifier name */ + /* Here, it is not a run-of-the-mill identifier name; maybe not an + * identifier at all. Note *d is a NUL */ if (*s == '$' && s[1] && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8) @@ -10803,9 +10815,12 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) return s; } - /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */ + /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} + * Skip to the first non-space past the brace */ if (*s == '{') { + /* 'bracket' becomes the offset from the beginning of this chunk */ bracket = s - SvPVX(PL_linestr); + s++; orig_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { @@ -10813,11 +10828,12 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) } } - /* Extract the first character of the variable name from 's' and - * copy it, null terminated into 'd'. Note that this does not - * involve checking for just IDFIRST characters, as it allows the - * '^' for ${^FOO} type variable names, and it allows all the - * characters that are legal in a single character variable name. + /* Here, 's' points to the next "interesting" character. + * Extract the first character of the potential variable name from 's' and + * copy it, NUL terminated, into 'd'. Note that this does not involve + * checking for just IDFIRST characters, as it allows the '^' for ${^FOO} + * type variable names, and it allows all the characters that are legal in + * a single character variable name. * * The legal ones are any of: * a) all ASCII characters except: @@ -10841,11 +10857,11 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD)))))) { if (is_utf8) { - const STRLEN skip = UTF8SKIP(s); - STRLEN i; - d[skip] = '\0'; - for ( i = 0; i < skip; i++ ) - d[i] = *s++; + const STRLEN skip = UTF8SKIP(s); + STRLEN i; + d[skip] = '\0'; + for ( i = 0; i < skip; i++ ) + d[i] = *s++; } else { *d = *s++; @@ -10853,7 +10869,13 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) } } - /* special case to handle ${10}, ${11} the same way we handle $1 etc */ + /* 'd' has not been advanced, but if 's' pointed to a legal identifier + * character, it has been advanced to the next character, and the + * character it previously pointed to has been copied to where 'd' + * continues to point to. + * + * If that copied character is a digit, it means we have something like + * ${10}, ${1547}, etc. Handle those the same way we handle $1, etc */ if (isDIGIT(*d)) { s = parse_ident(s - 1, PL_bufend, &d, e, is_utf8, STOP_AT_FIRST_NON_DIGIT); @@ -10861,39 +10883,60 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) /* The code below is expecting d to point to the final digit */ d--; } - - /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */ - else if (*d == '^' && *s && isCONTROLVAR(*s)) { + else /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */ + if (*d == '^' && *s && isCONTROLVAR(*s)) { *d = toCTRL(*s); s++; } - /* Warn about ambiguous code after unary operators if {...} notation isn't - used. There's no difference in ambiguity; it's merely a heuristic - about when not to warn. */ - else if (chk_unary && bracket == -1) + else /* Warn about ambiguous code after unary operators if {...} notation + isn't used. There's no difference in ambiguity; it's merely a + heuristic about when not to warn. */ + if (chk_unary && bracket == -1) { check_unary(); + } + + /* Here, 's' points to the next "interesting" character to be parsed. And + * *d points to the first byte of the final so-far parsed and copied + * character. This is one of four things: + * 1) The only byte of the final character of an all-digit numeric + * variable inside braces. e.g. if the input is ${ 123 }, '123' has + * been copied to 'dest', and 'd' points to the '3'. We don't know + * yet if there is a closing brace. + * 2) A control character + * 3) The first (or only) byte of some other identifier + * 4) *d is NUL for anything else. + */ - if (bracket != -1) { + if (bracket != -1) { /* Found a '{' */ bool skip; char *s2; - /* If we were processing {...} notation then... */ + + /* Handle the interior of braces. First look to see if the character + * pointed to by 'd' is legal as the start of an identifier. + * If it isn't a normal identifier, it could be a control-character + * one. Those have to be followed by a \w character. Prefer a normal + * identifier, as UTF-8 strings could erroneously be conflated with a + * control character identifier. */ if ( isIDFIRST_lazy_if_safe(d, e, is_utf8) || ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ && isWORDCHAR(*s)) ) { - /* note we have to check for a normal identifier first, - * as it handles utf8 symbols, and only after that has - * been ruled out can we look at the caret words */ Size_t advance; if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) { - /* if it starts as a valid identifier, assume that it is one. - (the later check for } being at the expected point will trap - cases where this doesn't pan out.) */ + + /* Now parse the normal identifier. + * + * khw: The code below is buggy because we already have parsed + * and copied the first character of it. The next character + * could be any IDCONT one, not just an IDFIRST */ d += advance; s = parse_ident(s, PL_bufend, &d, e, is_utf8, (ALLOW_PACKAGE | CHECK_DOLLAR)); } else { /* caret word: ${^Foo} ${^CAPTURE[0]} */ + + /* Now parse the control character identifier. Again, we have + * already copied the first character. */ d++; while (isWORDCHAR(*s) && d < e) { *d++ = *s++; @@ -10902,12 +10945,15 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) croak("%s", ident_too_long); *d = '\0'; } + tmp_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { s = skipspace(s); } - if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) { + + if (*s == '[' || (*s == '{' && strNE(dest, "sub"))) { /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */ + if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) { const char * const brack = (const char *) @@ -10929,6 +10975,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) if ( !tmp_copline ) tmp_copline = CopLINE(PL_curcop); + if ((skip = s < PL_bufend && isSPACE(*s))) { /* Avoid incrementing line numbers or resetting PL_linestart, in case we have to back up. */ @@ -10939,10 +10986,9 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) else s2 = s; - /* Expect to find a closing } after consuming any trailing whitespace. - */ - if (*s2 == '}') { - /* Now increment line numbers if applicable. */ + /* Expect to find a closing '}' after consuming any trailing + * whitespace. */ + if (*s2 == '}') { /* Now increment line numbers if applicable. */ if (skip) s = skipspace(s); s++; @@ -10971,9 +11017,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) } } else { - /* Didn't find the closing } at the point we expected, so restore - state such that the next thing to process is the opening { and */ - s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */ + /* Didn't find the closing '}' at the point we expected, so + * restore the state such that the next thing to process is the + * opening '{" and let the parser handle it */ + s = SvPVX(PL_linestr) + bracket; CopLINE_set(PL_curcop, orig_copline); PL_parser->herelines = herelines; *dest = '\0'; @@ -10984,6 +11031,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) && !PL_lex_brackets && !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) PL_lex_state = LEX_INTERPEND; + return s; } From 00a64da4df746b81b9022631f9e6465f57c9028f Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 10:33:40 -0600 Subject: [PATCH 04/17] S_scan_ident: Use mnemonic for soft hyphen code point --- toke.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/toke.c b/toke.c index 8d213b065793..c2d8d9fb9377 100644 --- a/toke.c +++ b/toke.c @@ -10851,10 +10851,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) if ( (s <= PL_bufend - ((is_utf8) ? UTF8SKIP(s) : 1)) && ( isGRAPH_A(*s) - || (is_utf8 - ? isIDFIRST_utf8_safe(s, PL_bufend) - : ( isGRAPH_L1(*s) - && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD)))))) + || (is_utf8 ? isIDFIRST_utf8_safe(s, PL_bufend) + : (isGRAPH_L1(*s) && LIKELY((U8) *s != SHY_NATIVE))))) { if (is_utf8) { const STRLEN skip = UTF8SKIP(s); From 439847413805340d2d0cb950c577b5517e9d1810 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 05:17:29 -0600 Subject: [PATCH 05/17] S_scan_ident: Add a mnemonic instead of using -1 This makes things clearer. --- toke.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/toke.c b/toke.c index c2d8d9fb9377..4d9c1b4f2065 100644 --- a/toke.c +++ b/toke.c @@ -10773,7 +10773,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) PERL_ARGS_ASSERT_SCAN_IDENT; I32 herelines = PL_parser->herelines; - SSize_t bracket = -1; + +#define NO_BRACE -1 + SSize_t bracket = NO_BRACE; + char funny = *s++; char *d = dest; char * const e = dest_end - 3; /* two-character token, ending NUL */ @@ -10905,7 +10908,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * 4) *d is NUL for anything else. */ - if (bracket != -1) { /* Found a '{' */ + if (bracket != NO_BRACE) { /* Found a '{' */ bool skip; char *s2; From 62b8c79486a5b7f388d10cab2907ab8c08d7a594 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 05:18:05 -0600 Subject: [PATCH 06/17] S_scan_ident: Add an assertion That this had to be true was not obvious to me without studying closely the code before it. Adding an assertion will result in others deciding they don't have to figure it out. --- toke.c | 1 + 1 file changed, 1 insertion(+) diff --git a/toke.c b/toke.c index 4d9c1b4f2065..1881a44b8244 100644 --- a/toke.c +++ b/toke.c @@ -10878,6 +10878,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * If that copied character is a digit, it means we have something like * ${10}, ${1547}, etc. Handle those the same way we handle $1, etc */ if (isDIGIT(*d)) { + assert(bracket != NO_BRACE); s = parse_ident(s - 1, PL_bufend, &d, e, is_utf8, STOP_AT_FIRST_NON_DIGIT); From 74c1c8edfa1b431e7404d4ad59a5a529507ad84e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 05:23:17 -0600 Subject: [PATCH 07/17] S_scan_ident: Swap conditionals order It's clearer to handle the short case first, and put the much longer case afterwards. --- toke.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/toke.c b/toke.c index 1881a44b8244..d1787b0ed681 100644 --- a/toke.c +++ b/toke.c @@ -10909,7 +10909,13 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * 4) *d is NUL for anything else. */ - if (bracket != NO_BRACE) { /* Found a '{' */ + if (bracket == NO_BRACE) { + if ( PL_lex_state == LEX_INTERPNORMAL + && ! PL_lex_brackets + && ! intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) + PL_lex_state = LEX_INTERPEND; + } + else { /* Found a '{' */ bool skip; char *s2; @@ -11029,10 +11035,6 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) PL_parser->sub_no_recover = TRUE; } } - else if ( PL_lex_state == LEX_INTERPNORMAL - && !PL_lex_brackets - && !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) - PL_lex_state = LEX_INTERPEND; return s; } From 6577cc4e7ffc2c9f6230417b6f2a0a0300564839 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 05:56:59 -0600 Subject: [PATCH 08/17] S_scan_ident: Swap another set of conditionals order It's clearer to handle the short case first, and put the much longer case afterwards. --- toke.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/toke.c b/toke.c index d1787b0ed681..dabaf6d23c1d 100644 --- a/toke.c +++ b/toke.c @@ -10996,7 +10996,17 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) /* Expect to find a closing '}' after consuming any trailing * whitespace. */ - if (*s2 == '}') { /* Now increment line numbers if applicable. */ + if (*s2 != '}') { + /* Didn't find the closing '}' at the point we expected, so + * restore the state such that the next thing to process is the + * opening '{' and let the parser handle it */ + s = SvPVX(PL_linestr) + bracket; + CopLINE_set(PL_curcop, orig_copline); + PL_parser->herelines = herelines; + *dest = '\0'; + PL_parser->sub_no_recover = TRUE; + } + else { /* Now increment line numbers if applicable. */ if (skip) s = skipspace(s); s++; @@ -11024,16 +11034,6 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) } } } - else { - /* Didn't find the closing '}' at the point we expected, so - * restore the state such that the next thing to process is the - * opening '{" and let the parser handle it */ - s = SvPVX(PL_linestr) + bracket; - CopLINE_set(PL_curcop, orig_copline); - PL_parser->herelines = herelines; - *dest = '\0'; - PL_parser->sub_no_recover = TRUE; - } } return s; From 9a733c9b255b347c7a803fdfbef07f43b7164c7d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 05:27:03 -0600 Subject: [PATCH 09/17] S_scan_ident: Move declaractions close to first use These were declared far above, due to C89 that is no longer a constraint. --- toke.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toke.c b/toke.c index dabaf6d23c1d..c68d8b9d30b5 100644 --- a/toke.c +++ b/toke.c @@ -10916,8 +10916,6 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) PL_lex_state = LEX_INTERPEND; } else { /* Found a '{' */ - bool skip; - char *s2; /* Handle the interior of braces. First look to see if the character * pointed to by 'd' is legal as the start of an identifier. @@ -10984,6 +10982,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) if ( !tmp_copline ) tmp_copline = CopLINE(PL_curcop); + char *s2; + bool skip; if ((skip = s < PL_bufend && isSPACE(*s))) { /* Avoid incrementing line numbers or resetting PL_linestart, in case we have to back up. */ From 6063f3dfcdf56aa4e8f436bf9282fbbd24a5d24b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 07:42:44 -0600 Subject: [PATCH 10/17] S_scan_ident: Remove unnecessary complexity This check that the code just below won't look beyond the end of the buffer, is rendered redundant by the "_safe" macro which does the check itself. --- toke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toke.c b/toke.c index c68d8b9d30b5..bf4166aad49f 100644 --- a/toke.c +++ b/toke.c @@ -10852,7 +10852,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and * '{' without knowing if is UTF-8 or not. */ - if ( (s <= PL_bufend - ((is_utf8) ? UTF8SKIP(s) : 1)) + if ( s < PL_bufend && ( isGRAPH_A(*s) || (is_utf8 ? isIDFIRST_utf8_safe(s, PL_bufend) : (isGRAPH_L1(*s) && LIKELY((U8) *s != SHY_NATIVE))))) From bca943021afa06ee0d1e6ad4003e6905090a3cf3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 08:14:58 -0600 Subject: [PATCH 11/17] S_scan_ident: Collapse two loops By setting a variable in advance, we can merge two loops into one. --- toke.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/toke.c b/toke.c index bf4166aad49f..11f350e5ad01 100644 --- a/toke.c +++ b/toke.c @@ -10852,22 +10852,16 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and * '{' without knowing if is UTF-8 or not. */ + STRLEN advance = 1; if ( s < PL_bufend && ( isGRAPH_A(*s) - || (is_utf8 ? isIDFIRST_utf8_safe(s, PL_bufend) + || (is_utf8 ? (advance = isIDFIRST_utf8_safe(s, PL_bufend)) : (isGRAPH_L1(*s) && LIKELY((U8) *s != SHY_NATIVE))))) { - if (is_utf8) { - const STRLEN skip = UTF8SKIP(s); STRLEN i; - d[skip] = '\0'; - for ( i = 0; i < skip; i++ ) + d[advance] = '\0'; + for ( i = 0; i < advance; i++ ) d[i] = *s++; - } - else { - *d = *s++; - d[1] = '\0'; - } } /* 'd' has not been advanced, but if 's' pointed to a legal identifier From 76a13ece0a0798069447a17078a76523f8086561 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 10:34:37 -0600 Subject: [PATCH 12/17] S_scan_ident: Avoid a recalculation Save the value from the first time into a variable --- toke.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/toke.c b/toke.c index 11f350e5ad01..aba45f1e0382 100644 --- a/toke.c +++ b/toke.c @@ -10912,17 +10912,17 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) else { /* Found a '{' */ /* Handle the interior of braces. First look to see if the character - * pointed to by 'd' is legal as the start of an identifier. - * If it isn't a normal identifier, it could be a control-character - * one. Those have to be followed by a \w character. Prefer a normal - * identifier, as UTF-8 strings could erroneously be conflated with a - * control character identifier. */ - if ( isIDFIRST_lazy_if_safe(d, e, is_utf8) - || ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ - && isWORDCHAR(*s)) - ) { - Size_t advance; - if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) { + * pointed to by 'd' is legal as the start of an identifier. */ + Size_t advance = isIDFIRST_lazy_if_safe(d, e, is_utf8); + + /* If it isn't a normal identifier, it could be a control-character + * one. Those have to be followed by a \w character. */ + if (advance || ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ + && isWORDCHAR(*s))) + { + /* Prefer a normal identifier, as UTF-8 strings could erroneously + * be conflated with a control character identifier. */ + if (advance) { /* Now parse the normal identifier. * From a940e9959b08ce4733e8a59f96e194975b158494 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 10:53:59 -0600 Subject: [PATCH 13/17] S_parse_ident: Swap order of conditionals I don't know what I was thinking when I recently thought these needed to be in a different order. The conjuctions are all &&, so might as well do the simpler things first --- toke.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toke.c b/toke.c index aba45f1e0382..41f2b082d3fc 100644 --- a/toke.c +++ b/toke.c @@ -10604,8 +10604,8 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end, * Unicode definition only when UTF-8 is in effect. We have to check * for the subset before checking for the superset. */ Size_t advance; - if ( (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8)) - && (is_utf8 || idfirst_only)) + if ( (is_utf8 || idfirst_only) + && (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8))) { const char *this_start = s; s += advance; From 2c42ac57f87ed510cd31389ef8c6e436e109d4a4 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 16:09:00 -0600 Subject: [PATCH 14/17] S_parse_ident: Add ability to start parse in middle S_scan_ident would like to call this function, already having looked at the first character of an identifier, and deciding it is legal. It wants this function to finish the scan. This commit adds a flag to S_parse_ident to accommodate this. --- toke.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/toke.c b/toke.c index 41f2b082d3fc..8f905da05ec5 100644 --- a/toke.c +++ b/toke.c @@ -181,6 +181,7 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th #define STOP_AT_FIRST_NON_DIGIT (1 << 4) #define CHECK_ONLY (1 << 5) #define CHECK_UNARY (1 << 6) +#define IDCONT_first_OK (1 << 7) #ifdef DEBUGGING static const char* const lex_state_names[] = { @@ -10597,6 +10598,11 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end, * in things like Foo::$bar */ const bool check_dollar = flags & CHECK_DOLLAR; + /* There is a use case for calling this function in the middle of having + * parsed a portion of an identifier. Therefore it should be able to + * accept the first character being an IDCont, and not necessarily an + * IDFIRST. The 'IDCONT_first_OK' flag is used to indicate this */ + while (s < s_end) { /* For non-UTF8, variables that match ASCII \w are a superset of @@ -10605,7 +10611,10 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end, * for the subset before checking for the superset. */ Size_t advance; if ( (is_utf8 || idfirst_only) - && (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8))) + && (advance = (flags & IDCONT_first_OK) + ? isIDCONT_lazy_if_safe((U8 *) s, (U8 *) s_end, + is_utf8) + : isIDFIRST_lazy_if_safe(s, s_end, is_utf8))) { const char *this_start = s; s += advance; From 9b263a232e2c5bfd440a226d5ea309e4d991e247 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 16:11:52 -0600 Subject: [PATCH 15/17] S_scan_ident: Char in middle needt not be IDStart This fixes a bug in this function, in which it required the second character in an identifier to be IDStart, instead of IDCont. This hasn't been caught because most identifiers are ASCII, and generally for the purposes of this function in the ASCII range, all \w characters can be IDStart. --- t/comp/parser.t | 14 +++++++++++++- toke.c | 26 +++++++++++++------------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/t/comp/parser.t b/t/comp/parser.t index 44d8fef2ced1..658811374f65 100644 --- a/t/comp/parser.t +++ b/t/comp/parser.t @@ -8,7 +8,7 @@ BEGIN { chdir 't' if -d 't'; } -print "1..192\n"; +print "1..193\n"; sub failed { my ($got, $expected, $name) = @_; @@ -673,6 +673,18 @@ is $@, "", 'substr keys assignment'; is ($@, "", "Handles all numeric package component after ::"); } +{ + my $expected = "this is the way the identifier ends; not with a bang"; + my $result; + eval "use utf8; my \$e\x{1df8}claire = '$expected'; \$result = \${e\x{1df8}claire}"; + if ($@) { + failed($@, "no error", "Didn't crash"); + } + else { + is ($result, $expected, "Parser can handle a continuation as 2nd char"); + } +} + # Add new tests HERE (above this line) # bug #74022: Loop on characters in \p{OtherIDContinue} diff --git a/toke.c b/toke.c index 8f905da05ec5..5ce1871d77af 100644 --- a/toke.c +++ b/toke.c @@ -10933,26 +10933,26 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * be conflated with a control character identifier. */ if (advance) { - /* Now parse the normal identifier. - * - * khw: The code below is buggy because we already have parsed - * and copied the first character of it. The next character - * could be any IDCONT one, not just an IDFIRST */ + /* Now parse the normal identifier. But note, we already have + * parsed and copied the first character of it. That means we + * are jumping into the middle; so tell that to parse_ident. + * */ d += advance; s = parse_ident(s, PL_bufend, &d, e, is_utf8, - (ALLOW_PACKAGE | CHECK_DOLLAR)); + (ALLOW_PACKAGE|CHECK_DOLLAR)|IDCONT_first_OK); } else { /* caret word: ${^Foo} ${^CAPTURE[0]} */ /* Now parse the control character identifier. Again, we have - * already copied the first character. */ + * already copied the first character. This routine is + * sufficiently chummy with parse_ident to know that when we + * say the string isn't UTF-8, it will do the right thing in + * looking only for ASCII \w characters as identifier + * continuations */ d++; - while (isWORDCHAR(*s) && d < e) { - *d++ = *s++; - } - if (d >= e) - croak("%s", ident_too_long); - *d = '\0'; + s = parse_ident(s, PL_bufend, &d, e, + false, /* Don't allow UTF-8 */ + IDCONT_first_OK); } tmp_copline = CopLINE(PL_curcop); From a4546543bde5fc791f137c558f1fc81959c7fe5c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 16:21:00 -0600 Subject: [PATCH 16/17] S_scan_ident: Add a check-only option There is a bug here in which this function is called from S_intuit_more just to see if there is an identifier in the string it is looking at. But that call can have "subtle implications on parsing" (according to the long-standing comments in it). We need a way to call scan_ident without side-effects. This commit adds that capability. The next will use it. --- toke.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/toke.c b/toke.c index 5ce1871d77af..cc91bc824f21 100644 --- a/toke.c +++ b/toke.c @@ -10768,13 +10768,17 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR * It returns a pointer into the input buffer pointing to just after all the * bytes this function consumed; or croaks if an invalid identifier is found. * - * XXX: This function has subtle implications on parsing, and - * changing how it behaves can cause a variable to change from - * being a run time rv2sv call or a compile time binding to a - * specific variable name. + * XXX: This function normally has subtle implications on parsing, and + * changing how it behaves can cause a variable to change from being a run + * time rv2sv call or a compile time binding to a specific variable name. * - * Use the CHECK_UNARY flag to cause this to look for ambiguities with unary - * operators. + * However, it can be called with the CHECK_ONLY flag which keeps it from + * making any changes besides populating the memory 'dest' points to. If the + * identifier is illegal, it returns NULL instead of croaking. + * + * And use the CHECK_UNARY flag to cause this to look for ambiguities with + * unary operators. This is silently overriden if CHECK_ONLY is also + * specified. */ STATIC char * S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) @@ -10791,22 +10795,29 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) char * const e = dest_end - 3; /* two-character token, ending NUL */ bool is_utf8 = cBOOL(UTF); line_t orig_copline = 0, tmp_copline = 0; - const bool chk_unary = (flags & CHECK_UNARY); + /* Leave the flag in its position, so can pass this on without needing to + * anything extra */ + const U32 check_only = flags & CHECK_ONLY; + + const bool chk_unary = ! check_only && (flags & CHECK_UNARY); if (isSPACE(*s) || !*s) s = skipspace(s); /* See if it is a "normal" identifier */ s = parse_ident(s, PL_bufend, &d, e, is_utf8, - (ALLOW_PACKAGE | STOP_AT_FIRST_NON_DIGIT)); - d = dest; + (ALLOW_PACKAGE | STOP_AT_FIRST_NON_DIGIT | check_only)); + if (s == NULL) { + return NULL; + } + d = dest; if (*d) { /* Here parse_ident() found a digit variable or an identifier (anything valid as a bareword), so job done and return. */ - if (PL_lex_state != LEX_NORMAL) + if (! check_only && PL_lex_state != LEX_NORMAL) PL_lex_state = LEX_INTERPENDMAYBE; return s; } @@ -10860,7 +10871,6 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * Because all ASCII characters have the same representation whether * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and * '{' without knowing if is UTF-8 or not. */ - STRLEN advance = 1; if ( s < PL_bufend && ( isGRAPH_A(*s) @@ -10883,7 +10893,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) if (isDIGIT(*d)) { assert(bracket != NO_BRACE); s = parse_ident(s - 1, PL_bufend, &d, e, is_utf8, - STOP_AT_FIRST_NON_DIGIT); + STOP_AT_FIRST_NON_DIGIT | check_only); + if (s == NULL) { + return NULL; + } /* The code below is expecting d to point to the final digit */ d--; @@ -10913,7 +10926,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) */ if (bracket == NO_BRACE) { - if ( PL_lex_state == LEX_INTERPNORMAL + if ( ! check_only + && PL_lex_state == LEX_INTERPNORMAL && ! PL_lex_brackets && ! intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) PL_lex_state = LEX_INTERPEND; @@ -10939,7 +10953,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * */ d += advance; s = parse_ident(s, PL_bufend, &d, e, is_utf8, - (ALLOW_PACKAGE|CHECK_DOLLAR)|IDCONT_first_OK); + ( ALLOW_PACKAGE + | CHECK_DOLLAR + | IDCONT_first_OK + | check_only)); } else { /* caret word: ${^Foo} ${^CAPTURE[0]} */ @@ -10955,12 +10972,23 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) IDCONT_first_OK); } + if (s == NULL) { /* Can't be NULL unless is check_only */ + return NULL; + } + tmp_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { s = skipspace(s); } if (*s == '[' || (*s == '{' && strNE(dest, "sub"))) { + + /* In this branch, 's' is not changed further. If only + * checking validity, return now before any state changes */ + if (check_only) { + return s; + } + /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */ if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) { @@ -11004,6 +11032,12 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) * restore the state such that the next thing to process is the * opening '{' and let the parser handle it */ s = SvPVX(PL_linestr) + bracket; + + /* The final change to 's' has just been made. If only validity + * checking, return before making any state changes */ + if (check_only) { + return s; + } CopLINE_set(PL_curcop, orig_copline); PL_parser->herelines = herelines; *dest = '\0'; @@ -11013,6 +11047,13 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) if (skip) s = skipspace(s); s++; + + /* The final change to 's' has just been made. If only validity + * checking, return before making any state changes */ + if (check_only) { + return s; + } + if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) { PL_lex_state = LEX_INTERPEND; PL_expect = XREF; From 7b4e630352ec8dbc78d9c6d1a5bf789dc6858c84 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 19 Oct 2025 16:24:33 -0600 Subject: [PATCH 17/17] S_intuit_more: Call scan_ident in check-only mode This fixes the bug that examining the parse buffer had side-effects. I don't know what the implications of that were. --- toke.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/toke.c b/toke.c index cc91bc824f21..aef854a9e641 100644 --- a/toke.c +++ b/toke.c @@ -4743,12 +4743,13 @@ S_intuit_more(pTHX_ char *s, char *e, * changed since the code was first added */ char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; - /* khw: scan_ident shouldn't be used as-is. It has side - * effects and can end up calling this function recursively. - * - * khw: If what follows can't be an identifier, say it is too - * long or is $001, then it must be a charclass */ - scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), 0); + if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) + { + /* An illegal identifier means this can't be a subscript; + * it's an error or it could be a charclass */ + return false; + } + len = strlen(tmpbuf); /* khw: This only looks at global variables; lexicals came