Skip to content

Commit 3450d19

Browse files
committed
intuit_more: 'use strict' allows much better handling
Most code these days runs under 'use strict'. That allows us to resolve ambiguity without resorting to heuristics in far more cases than before. This commit adds a parameter to intuit_more() that gives the context it is being called from. And when that call is to resolve what $foo[...] is supposed to mean, we can look up foo to see if it is an array or a scalar. If the former, the "..." must be a subscript; if a scalar, it must be a charclass. Only if there is both a $foo and an @foo is there ambiguity. If so, we drop down to using the heuristics
1 parent aa93969 commit 3450d19

File tree

4 files changed

+72
-30
lines changed

4 files changed

+72
-30
lines changed

embed.fnc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6153,7 +6153,10 @@ S |int |intuit_method |NN char *start \
61536153
|NULLOK SV *ioname \
61546154
|NULLOK NOCHECK CV *cv
61556155
S |int |intuit_more |NN char *s \
6156-
|NN char *e
6156+
|NN char *e \
6157+
|U8 caller_context \
6158+
|NULLOK char *caller_s \
6159+
|Size_t caller_length
61576160
S |bool |is_existing_identifier \
61586161
|NN char *s \
61596162
|Size_t len \

embed.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1684,7 +1684,7 @@
16841684
# define get_and_check_backslash_N_name_wrapper(a,b) S_get_and_check_backslash_N_name_wrapper(aTHX_ a,b)
16851685
# define incline(a,b) S_incline(aTHX_ a,b)
16861686
# define intuit_method(a,b,c) S_intuit_method(aTHX_ a,b,c)
1687-
# define intuit_more(a,b) S_intuit_more(aTHX_ a,b)
1687+
# define intuit_more(a,b,c,d,e) S_intuit_more(aTHX_ a,b,c,d,e)
16881688
# define is_existing_identifier(a,b,c,d) S_is_existing_identifier(aTHX_ a,b,c,d)
16891689
# define lop(a,b,c,d) S_lop(aTHX_ a,b,c,d)
16901690
# define missingterm(a,b) S_missingterm(aTHX_ a,b)

proto.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

toke.c

Lines changed: 66 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,12 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th
101101
#define XFAKEEOF 0x40
102102
#define XFAKEBRACK 0x80
103103

104+
#define FROM_DOLLAR 1
105+
#define FROM_SNAIL 2
106+
#define FROM_PERCENT 3
107+
#define FROM_IDENT 4
108+
#define FROM_INTERDEPENDMAYBE 5
109+
104110
#ifdef USE_UTF8_SCRIPTS
105111
# define UTF cBOOL(!IN_BYTES)
106112
#else
@@ -4514,7 +4520,11 @@ S_is_existing_identifier(pTHX_ char *s, Size_t len, char sigil, bool is_utf8)
45144520
/* This is the one truly awful dwimmer necessary to conflate C and sed. */
45154521

45164522
STATIC int
4517-
S_intuit_more(pTHX_ char *s, char *e)
4523+
S_intuit_more(pTHX_ char *s, char *e,
4524+
U8 caller_context, /* Who's calling us? basically an enum */
4525+
char * caller_s, /* If non-NULL, the name of the identifier
4526+
that resulted in this call */
4527+
Size_t caller_length) /* And the length of that name */
45184528
{
45194529
PERL_ARGS_ASSERT_INTUIT_MORE;
45204530

@@ -4585,28 +4595,51 @@ S_intuit_more(pTHX_ char *s, char *e)
45854595
if (s[0] == ']' || s[0] == '^')
45864596
return FALSE;
45874597

4588-
/* khw: If the context of this call is $foo[...], we may be able to avoid
4589-
* the heuristics below. The possibilities are:
4590-
* strict @foo $foo
4591-
* vars? exists exists
4592-
* y n n This is an error; return false now
4593-
* y n y must be a a charclass
4594-
* y y n must be a a subscript
4595-
* y y y ambiguous; do heuristics below
4596-
* n n n ambiguous; do heuristics below
4597-
* n n y ambiguous; do heuristics below, but I
4598-
* wonder if the initial bias should be a
4599-
* little towards charclass
4600-
* n y n ambiguous; do heuristics below, but I
4601-
* wonder if the initial bias should be a
4602-
* little towards subscript
4603-
* n y y ambiguous; do heuristics below
4604-
*/
4598+
bool under_strict_vars = PL_hints & HINT_STRICT_VARS;
4599+
4600+
/* If the input is of the form '$foo[...', and there is a $foo scalar and
4601+
* no @foo array, then '...' is more likely to be a character class.
4602+
* (Under 'strict vars', we know at compile time all the accessible
4603+
* variables, so in that case it MUST be a character class.) If the
4604+
* situation is reversed, it is more likely to be (or must be) a
4605+
* subscript. */
4606+
if (caller_context == FROM_DOLLAR) {
4607+
assert (caller_s);
4608+
4609+
/* See if there is a known scalar for the input identifier */
4610+
bool has_scalar = is_existing_identifier(caller_s, caller_length,
4611+
'$', UTF);
4612+
4613+
/* Repeat to see if there is a known array of the given name */
4614+
bool has_array = is_existing_identifier(caller_s, caller_length,
4615+
'@', UTF);
4616+
4617+
unsigned int count = has_scalar + has_array;
4618+
4619+
/* Under strict, we need some variable to be declared. */
4620+
if (under_strict_vars) {
4621+
4622+
/* If none are, is an error. Return false to stop useless further
4623+
* parsing. */
4624+
if (count == 0) {
4625+
return false;
4626+
}
4627+
4628+
/* When just one variable is declared, the construct has to match
4629+
* what the variable is. If it is an array, this must be a
4630+
* subscript which needs further processing; otherwise it is a
4631+
* character class needing nothing further. */
4632+
if (count == 1) {
4633+
return has_array;
4634+
}
46054635

4606-
/* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4607-
* matches the '['. In order to account for the possibility of the ']'
4608-
* being inside the scope of \Q or preceded by an even number of
4609-
* backslashes, this should be rewritten */
4636+
/* Here have both an array and a scalar with the same name. Drop
4637+
* down to use the heuristics to try to intuit which is meant */
4638+
}
4639+
}
4640+
4641+
/* Find matching ']'. khw: This means any s[1] below is guaranteed to
4642+
* exist */
46104643
const char * const send = (char *) memchr(s, ']', e - s);
46114644
if (! send) /* has to be an expression */
46124645
return TRUE;
@@ -5591,7 +5624,9 @@ yyl_dollar(pTHX_ char *s)
55915624
s = skipspace(s);
55925625

55935626
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
5594-
&& intuit_more(s, PL_bufend)) {
5627+
&& intuit_more(s, PL_bufend, FROM_DOLLAR,
5628+
PL_tokenbuf, strlen(PL_tokenbuf)))
5629+
{
55955630
if (*s == '[') {
55965631
PL_tokenbuf[0] = '@';
55975632
if (ckWARN(WARN_SYNTAX)) {
@@ -6295,7 +6330,9 @@ yyl_percent(pTHX_ char *s)
62956330
PREREF(PERLY_PERCENT_SIGN);
62966331
}
62976332
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6298-
&& intuit_more(s, PL_bufend)) {
6333+
&& intuit_more(s, PL_bufend, FROM_PERCENT,
6334+
PL_tokenbuf, strlen(PL_tokenbuf)))
6335+
{
62996336
if (*s == '[')
63006337
PL_tokenbuf[0] = '@';
63016338
}
@@ -6918,7 +6955,8 @@ yyl_snail(pTHX_ char *s)
69186955
if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
69196956
s = skipspace(s);
69206957
if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6921-
&& intuit_more(s, PL_bufend))
6958+
&& intuit_more(s, PL_bufend, FROM_SNAIL,
6959+
PL_tokenbuf, strlen(PL_tokenbuf)))
69226960
{
69236961
if (*s == '{')
69246962
PL_tokenbuf[0] = '%';
@@ -10004,7 +10042,8 @@ Perl_yylex(pTHX)
1000410042
return yylex();
1000510043

1000610044
case LEX_INTERPENDMAYBE:
10007-
if (intuit_more(PL_bufptr, PL_bufend)) {
10045+
if (intuit_more(PL_bufptr, PL_bufend, FROM_INTERDEPENDMAYBE, NULL, 0))
10046+
{
1000810047
PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
1000910048
break;
1001010049
}
@@ -10818,7 +10857,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary)
1081810857
}
1081910858
else if ( PL_lex_state == LEX_INTERPNORMAL
1082010859
&& !PL_lex_brackets
10821-
&& !intuit_more(s, PL_bufend))
10860+
&& !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0))
1082210861
PL_lex_state = LEX_INTERPEND;
1082310862
return s;
1082410863
}

0 commit comments

Comments
 (0)