diff --git a/embed.fnc b/embed.fnc index fd722fe8177e..f4368bf745b3 100644 --- a/embed.fnc +++ b/embed.fnc @@ -6153,7 +6153,15 @@ S |int |intuit_method |NN char *start \ |NULLOK SV *ioname \ |NULLOK NOCHECK CV *cv S |int |intuit_more |NN char *s \ - |NN char *e + |NN char *e \ + |U8 caller_context \ + |NULLOK char *caller_s \ + |Size_t caller_length +S |bool |is_existing_identifier \ + |NN char *s \ + |Size_t len \ + |char sigil \ + |bool is_utf8 S |I32 |lop |enum yytokentype t \ |I32 f \ |U8 x \ diff --git a/embed.h b/embed.h index 0d485caf3d0a..e23ead84884f 100644 --- a/embed.h +++ b/embed.h @@ -1684,7 +1684,8 @@ # define get_and_check_backslash_N_name_wrapper(a,b) S_get_and_check_backslash_N_name_wrapper(aTHX_ a,b) # define incline(a,b) S_incline(aTHX_ a,b) # define intuit_method(a,b,c) S_intuit_method(aTHX_ a,b,c) -# define intuit_more(a,b) S_intuit_more(aTHX_ a,b) +# define intuit_more(a,b,c,d,e) S_intuit_more(aTHX_ a,b,c,d,e) +# define is_existing_identifier(a,b,c,d) S_is_existing_identifier(aTHX_ a,b,c,d) # define lop(a,b,c,d) S_lop(aTHX_ a,b,c,d) # define missingterm(a,b) S_missingterm(aTHX_ a,b) # define parse_ident(a,b,c,d,e,f) S_parse_ident(aTHX_ a,b,c,d,e,f) diff --git a/proto.h b/proto.h index 7a7c783c67de..b486ee8e7303 100644 --- a/proto.h +++ b/proto.h @@ -9442,10 +9442,15 @@ S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv); assert(start) STATIC int -S_intuit_more(pTHX_ char *s, char *e); +S_intuit_more(pTHX_ char *s, char *e, U8 caller_context, char *caller_s, Size_t caller_length); # define PERL_ARGS_ASSERT_INTUIT_MORE \ assert(s); assert(e) +STATIC bool +S_is_existing_identifier(pTHX_ char *s, Size_t len, char sigil, bool is_utf8); +# define PERL_ARGS_ASSERT_IS_EXISTING_IDENTIFIER \ + assert(s) + STATIC I32 S_lop(pTHX_ enum yytokentype t, I32 f, U8 x, char *s); # define PERL_ARGS_ASSERT_LOP \ diff --git a/toke.c b/toke.c index 0d7d9dc45923..d055abb94690 100644 --- a/toke.c +++ b/toke.c @@ -101,6 +101,12 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th #define XFAKEEOF 0x40 #define XFAKEBRACK 0x80 +#define FROM_DOLLAR 1 +#define FROM_SNAIL 2 +#define FROM_PERCENT 3 +#define FROM_IDENT 4 +#define FROM_INTERDEPENDMAYBE 5 + #ifdef USE_UTF8_SCRIPTS # define UTF cBOOL(!IN_BYTES) #else @@ -4464,6 +4470,33 @@ S_scan_const(pTHX_ char *start) return s; } +STATIC bool +S_is_existing_identifier(pTHX_ char *s, Size_t len, char sigil, bool is_utf8) +{ + PERL_ARGS_ASSERT_IS_EXISTING_IDENTIFIER; + + /* This returns a boolean indicating if a string represents an identifier + * known to the program. 'sigil' is the character indicating the type of + * the identifier to look for. (though '%' is currently not specially + * handled.) The string from 's + 1' to (s + len) is looked at. s[0] is + * ignored, but must exist; the function overwrites it temporarily, + * restoring it before returning */ + + char save_sigil = s[0]; + s[0] = sigil; + PADOFFSET slot = pad_findmy_pv(s, 0); + s[0] = save_sigil; + + return slot != NOT_IN_PAD + || gv_fetchpvn_flags(s + 1, len - 1, + (is_utf8) ? SVf_UTF8 : 0, + (sigil == '@') + ? SVt_PVAV + : (sigil == '&') + ? SVt_PVCV + : SVt_PV); +} + /* S_intuit_more * Returns TRUE if there's more to the expression (e.g., a subscript), * FALSE otherwise. @@ -4487,7 +4520,11 @@ S_scan_const(pTHX_ char *start) /* This is the one truly awful dwimmer necessary to conflate C and sed. */ STATIC int -S_intuit_more(pTHX_ char *s, char *e) +S_intuit_more(pTHX_ char *s, char *e, + U8 caller_context, /* Who's calling us? basically an enum */ + char * caller_s, /* If non-NULL, the name of the identifier + that resulted in this call */ + Size_t caller_length) /* And the length of that name */ { PERL_ARGS_ASSERT_INTUIT_MORE; @@ -4558,28 +4595,51 @@ S_intuit_more(pTHX_ char *s, char *e) if (s[0] == ']' || s[0] == '^') return FALSE; - /* khw: If the context of this call is $foo[...], we may be able to avoid - * the heuristics below. The possibilities are: - * strict @foo $foo - * vars? exists exists - * y n n This is an error; return false now - * y n y must be a a charclass - * y y n must be a a subscript - * y y y ambiguous; do heuristics below - * n n n ambiguous; do heuristics below - * n n y ambiguous; do heuristics below, but I - * wonder if the initial bias should be a - * little towards charclass - * n y n ambiguous; do heuristics below, but I - * wonder if the initial bias should be a - * little towards subscript - * n y y ambiguous; do heuristics below - */ + bool under_strict_vars = PL_hints & HINT_STRICT_VARS; + + /* If the input is of the form '$foo[...', and there is a $foo scalar and + * no @foo array, then '...' is more likely to be a character class. + * (Under 'strict vars', we know at compile time all the accessible + * variables, so in that case it MUST be a character class.) If the + * situation is reversed, it is more likely to be (or must be) a + * subscript. */ + if (caller_context == FROM_DOLLAR) { + assert (caller_s); + + /* See if there is a known scalar for the input identifier */ + bool has_scalar = is_existing_identifier(caller_s, caller_length, + '$', UTF); + + /* Repeat to see if there is a known array of the given name */ + bool has_array = is_existing_identifier(caller_s, caller_length, + '@', UTF); + + unsigned int count = has_scalar + has_array; - /* Find matching ']'. khw: Actually it finds the next ']' and assumes it - * matches the '['. In order to account for the possibility of the ']' - * being inside the scope of \Q or preceded by an even number of - * backslashes, this should be rewritten */ + /* Under strict, we need some variable to be declared. */ + if (under_strict_vars) { + + /* If none are, is an error. Return false to stop useless further + * parsing. */ + if (count == 0) { + return false; + } + + /* When just one variable is declared, the construct has to match + * what the variable is. If it is an array, this must be a + * subscript which needs further processing; otherwise it is a + * character class needing nothing further. */ + if (count == 1) { + return has_array; + } + + /* Here have both an array and a scalar with the same name. Drop + * down to use the heuristics to try to intuit which is meant */ + } + } + + /* Find matching ']'. khw: This means any s[1] below is guaranteed to + * exist */ const char * const send = (char *) memchr(s, ']', e - s); if (! send) /* has to be an expression */ return TRUE; @@ -5564,7 +5624,9 @@ yyl_dollar(pTHX_ char *s) s = skipspace(s); if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) - && intuit_more(s, PL_bufend)) { + && intuit_more(s, PL_bufend, FROM_DOLLAR, + PL_tokenbuf, strlen(PL_tokenbuf))) + { if (*s == '[') { PL_tokenbuf[0] = '@'; if (ckWARN(WARN_SYNTAX)) { @@ -6268,7 +6330,9 @@ yyl_percent(pTHX_ char *s) PREREF(PERLY_PERCENT_SIGN); } if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) - && intuit_more(s, PL_bufend)) { + && intuit_more(s, PL_bufend, FROM_PERCENT, + PL_tokenbuf, strlen(PL_tokenbuf))) + { if (*s == '[') PL_tokenbuf[0] = '@'; } @@ -6891,7 +6955,8 @@ yyl_snail(pTHX_ char *s) if (PL_lex_state == LEX_NORMAL || PL_lex_brackets) s = skipspace(s); if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) - && intuit_more(s, PL_bufend)) + && intuit_more(s, PL_bufend, FROM_SNAIL, + PL_tokenbuf, strlen(PL_tokenbuf))) { if (*s == '{') PL_tokenbuf[0] = '%'; @@ -9977,7 +10042,8 @@ Perl_yylex(pTHX) return yylex(); case LEX_INTERPENDMAYBE: - if (intuit_more(PL_bufptr, PL_bufend)) { + if (intuit_more(PL_bufptr, PL_bufend, FROM_INTERDEPENDMAYBE, NULL, 0)) + { PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */ break; } @@ -10470,6 +10536,7 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8, bool check_dollar) { PERL_ARGS_ASSERT_PARSE_IDENT; + assert(*s <= PL_bufend); while (*s < PL_bufend) { if (*d >= e) @@ -10790,7 +10857,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) } else if ( PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets - && !intuit_more(s, PL_bufend)) + && !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) PL_lex_state = LEX_INTERPEND; return s; }