Skip to content

Commit 5e9f082

Browse files
committed
intuit_more: Handle numeric identifiers
This function was totally unaware of the possibility of these.
1 parent ac4d84e commit 5e9f082

File tree

1 file changed

+64
-8
lines changed

1 file changed

+64
-8
lines changed

toke.c

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4758,13 +4758,70 @@ S_intuit_more(pTHX_ char *s, char *e,
47584758
return false;
47594759
}
47604760

4761-
/* khw: This only looks at global variables; lexicals came
4762-
* later, and this hasn't been updated. Ouch!! */
4763-
if ( len > 1
4764-
&& gv_fetchpvn_flags(tmpbuf + 1,
4765-
len,
4766-
UTF ? SVf_UTF8 : 0,
4767-
SVt_PV))
4761+
/* If there is extra stuff in the source, like braces, it means
4762+
* this is almost definitely intended to be an identifier */
4763+
bool decorated;
4764+
decorated = (Size_t) (s_after_ident - s) > len;
4765+
4766+
if (isDIGIT_A(tmpbuf[1])) {
4767+
4768+
/* &41 and &4b are illegal subroutine names, so this is an error or
4769+
* a charclass */
4770+
if (s[0] == '&') {
4771+
return false;
4772+
}
4773+
4774+
/* Here, matches [$@]\d+. If the next input character is a
4775+
* \w, we would have something like $456x, which is an illegal
4776+
* identifier, so this is an error or a charclass */
4777+
if ( ! decorated
4778+
&& isWORDCHAR_lazy_if_safe(s_after_ident,
4779+
PL_bufend, UTF))
4780+
{
4781+
return false;
4782+
}
4783+
4784+
/* We don't get here if this potential identifier starts with
4785+
* leading zeros, due to the logic in scan_ident. */
4786+
assert(len == 1 || tmpbuf[0] != '0');
4787+
4788+
/* The chances are vanishingly small that someone is going to
4789+
* want [$0] to expand to the program's name in a character
4790+
* class. But, what would the program's name be doing as part
4791+
* of a subscript either? The only likely scenario is that
4792+
* this is meant to be a charclass matching either '$' or '0'.
4793+
* */
4794+
if (tmpbuf[1] == '0') {
4795+
return false;
4796+
}
4797+
4798+
/* Here it is either something like $1 which is supposed to
4799+
* match either dollar or 1, or it is supposed to expand to
4800+
* what is in $1 left over from a capturing group from the
4801+
* previous pattern match. In the latter case, it could be
4802+
* either a part of wanting to calculate a subscript, or to
4803+
* use as part of the contents of the character class.
4804+
* Larger (undecorated) numbers are much less likely to have
4805+
* had capturing groups, so they lean more towards a
4806+
* charclass. 100 is what this function has traditionally
4807+
* used for len>1; khw thinks there is no bias one way or the
4808+
* other for length 1 ones, but has chosen 100 for decorated
4809+
* identifiers.
4810+
*
4811+
* XXX long enough identifiers could probably return false
4812+
* immediately here, rather than using weights. */
4813+
if (decorated || len > 1) {
4814+
weight -= 100;
4815+
}
4816+
}
4817+
else if ( len > 1
4818+
/* khw: This only looks at global variables; lexicals
4819+
* came later, and this hasn't been updated. Ouch!!
4820+
* */
4821+
&& gv_fetchpvn_flags(tmpbuf + 1,
4822+
len,
4823+
UTF ? SVf_UTF8 : 0,
4824+
SVt_PV))
47684825
{
47694826
weight -= 100;
47704827

@@ -4817,7 +4874,6 @@ S_intuit_more(pTHX_ char *s, char *e,
48174874
* \? must be subscript for things like \d, but not \a.
48184875
*/
48194876

4820-
48214877
case '\\':
48224878
if (s[1] == '\0') {
48234879
/* \ followed by NUL strongly indicates character class */

0 commit comments

Comments
 (0)