Skip to content

Commit ed3c048

Browse files
committed
S_intuit_more: 'use strict' allows much better handling
Most code these days runs under 'use strict'. That allows us to resolve ambiguity without resorting to heuristics in far more cases than before. This commit adds a parameter to intuit_more() that tells it the context it is being called from. When the call is to resolve what $foo[...] is supposed to mean, and 'strict vars' is in effect, we can look up foo to see whether it is an array or a scalar. If only @foo exists, the "..." must be a subscript; if only $foo exists, it must be a character class. Only if both a $foo and an @foo exist is there ambiguity, in which case we fall back to the heuristics.
1 parent 966a79d commit ed3c048

File tree

4 files changed

+63
-26
lines changed

4 files changed

+63
-26
lines changed

embed.fnc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6082,7 +6082,8 @@ S |int |intuit_method |NN char *s \
60826082
|NULLOK SV *ioname \
60836083
|NULLOK NOCHECK CV *cv
60846084
S |int |intuit_more |NN char *s \
6085-
|NN char *e
6085+
|NN char *e \
6086+
|U8 caller_context
60866087
S |Size_t |is_existing_identifier \
60876088
|NN char *s \
60886089
|NN char *e \

embed.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1684,7 +1684,7 @@
16841684
# define get_and_check_backslash_N_name_wrapper(a,b) S_get_and_check_backslash_N_name_wrapper(aTHX_ a,b)
16851685
# define incline(a,b) S_incline(aTHX_ a,b)
16861686
# define intuit_method(a,b,c) S_intuit_method(aTHX_ a,b,c)
1687-
# define intuit_more(a,b) S_intuit_more(aTHX_ a,b)
1687+
# define intuit_more(a,b,c) S_intuit_more(aTHX_ a,b,c)
16881688
# define is_existing_identifier(a,b,c,d) S_is_existing_identifier(aTHX_ a,b,c,d)
16891689
# define lop(a,b,c,d) S_lop(aTHX_ a,b,c,d)
16901690
# define missingterm(a,b) S_missingterm(aTHX_ a,b)

proto.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

toke.c

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,12 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th
101101
#define XFAKEEOF 0x40
102102
#define XFAKEBRACK 0x80
103103

104+
#define FROM_DOLLAR 1
105+
#define FROM_SNAIL 2
106+
#define FROM_PERCENT 3
107+
#define FROM_IDENT 4
108+
#define FROM_INTERDEPENDMAYBE 5
109+
104110
#ifdef USE_UTF8_SCRIPTS
105111
# define UTF cBOOL(!IN_BYTES)
106112
#else
@@ -4547,7 +4553,7 @@ S_is_existing_identifier(pTHX_ char *s, char *e, char sigil, bool is_utf8)
45474553
/* This is the one truly awful dwimmer necessary to conflate C and sed. */
45484554

45494555
STATIC int
4550-
S_intuit_more(pTHX_ char *s, char *e)
4556+
S_intuit_more(pTHX_ char *s, char *e, U8 caller_context)
45514557
{
45524558
PERL_ARGS_ASSERT_INTUIT_MORE;
45534559

@@ -4608,23 +4614,53 @@ S_intuit_more(pTHX_ char *s, char *e)
46084614
if (s[0] == ']' || s[0] == '^')
46094615
return FALSE;
46104616

4611-
/* khw: If the context of this call is $foo[...], we may be able to avoid
4612-
* the heuristics below. The possibilities are:
4613-
* strict @foo $foo
4614-
* vars? exists exists
4615-
* y n n This is an error; return false now
4616-
* y n y must be a charclass
4617-
* y y n must be a subscript
4618-
* y y y ambiguous; do heuristics below
4619-
* n n n ambiguous; do heuristics below
4620-
* n n y ambiguous; do heuristics below, but I
4621-
* wonder if the initial bias should be a
4622-
* little towards charclass
4623-
* n y n ambiguous; do heuristics below, but I
4624-
* wonder if the initial bias should be a
4625-
* little towards subscript
4626-
* n y y ambiguous; do heuristics below
4627-
*/
4617+
4618+
/* If the input is of the form '$foo[...', and there is a $foo scalar and
4619+
* no @foo array, then '...' is more likely to be a character class.
4620+
* (Under 'strict vars', we know at compile time all the accessible
4621+
* variables, so in that case it MUST be a character class.) If the
4622+
* situation is reversed, it is more likely to be (or, under 'strict vars', must be) a subscript. */
4623+
if ( caller_context == FROM_DOLLAR
4624+
|| (caller_context == FROM_INTERDEPENDMAYBE && PL_tokenbuf[0] == '@'))
4625+
{
4626+
char * e = PL_tokenbuf + sizeof(PL_tokenbuf) + 1;
4627+
4628+
/* See if there is a known scalar for what our caller is asking about.
4629+
* */
4630+
bool has_scalar = is_existing_identifier(PL_tokenbuf, e, '$', UTF);
4631+
4632+
/* Repeat to see if there is a known array of the given name */
4633+
bool has_array = is_existing_identifier(PL_tokenbuf, e, '@', UTF);
4634+
4635+
unsigned int count = has_scalar + has_array;
4636+
4637+
/* Under strict, we need some variable to be declared. */
4638+
if (PL_hints & HINT_STRICT_VARS) {
4639+
4640+
/* If none is declared, that is an error; return false to stop useless
4641+
* further parsing. */
4642+
if (count == 0) {
4643+
return false;
4644+
}
4645+
4646+
/* When just one variable is declared, the construct has to match
4647+
* what the variable is. If it is an array, this must be a
4648+
* subscript which needs further processing; otherwise it is a
4649+
* character class needing nothing further. */
4650+
if (count == 1) {
4651+
return has_array;
4652+
}
4653+
4654+
/* Here have both an array and a scalar with the same name. Drop
4655+
* down to use the heuristics to try to intuit which is meant */
4656+
}
4657+
else {
4658+
/* Here, there could be undeclared variables. But khw believes if
4659+
* one is known to exist but not the other, it is more likely that
4660+
* the other doesn't exist, so this could be factored into the
4661+
* heuristics below. (Nothing is done with that information here yet.) */
4662+
}
4663+
}
46284664

46294665
/* Find matching ']'. khw: Actually it finds the next ']' and assumes it
46304666
* matches the '['. In order to account for the possibility of the ']'
@@ -5585,7 +5621,7 @@ yyl_dollar(pTHX_ char *s)
55855621
s = skipspace(s);
55865622

55875623
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
5588-
&& intuit_more(s, PL_bufend)) {
5624+
&& intuit_more(s, PL_bufend, FROM_DOLLAR)) {
55895625
if (*s == '[') {
55905626
PL_tokenbuf[0] = '@';
55915627
if (ckWARN(WARN_SYNTAX)) {
@@ -6288,7 +6324,7 @@ yyl_percent(pTHX_ char *s)
62886324
PREREF(PERLY_PERCENT_SIGN);
62896325
}
62906326
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6291-
&& intuit_more(s, PL_bufend)) {
6327+
&& intuit_more(s, PL_bufend, FROM_PERCENT)) {
62926328
if (*s == '[')
62936329
PL_tokenbuf[0] = '@';
62946330
}
@@ -6910,7 +6946,7 @@ yyl_snail(pTHX_ char *s)
69106946
if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
69116947
s = skipspace(s);
69126948
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6913-
&& intuit_more(s, PL_bufend))
6949+
&& intuit_more(s, PL_bufend, FROM_SNAIL))
69146950
{
69156951
if (*s == '{')
69166952
PL_tokenbuf[0] = '%';
@@ -9993,7 +10029,7 @@ Perl_yylex(pTHX)
999310029
return yylex();
999410030

999510031
case LEX_INTERPENDMAYBE:
9996-
if (intuit_more(PL_bufptr, PL_bufend)) {
10032+
if (intuit_more(PL_bufptr, PL_bufend, FROM_INTERDEPENDMAYBE)) {
999710033
PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
999810034
break;
999910035
}
@@ -10807,7 +10843,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
1080710843
}
1080810844
else if ( PL_lex_state == LEX_INTERPNORMAL
1080910845
&& !PL_lex_brackets
10810-
&& !intuit_more(s, PL_bufend))
10846+
&& !intuit_more(s, PL_bufend, FROM_IDENT))
1081110847
PL_lex_state = LEX_INTERPEND;
1081210848
return s;
1081310849
}

0 commit comments

Comments
 (0)