diff --git a/toke.c b/toke.c index aef854a9e641..d5d06d3748cd 100644 --- a/toke.c +++ b/toke.c @@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e, * written, and regcurly never required a comma, as in {0}. Probably it is * ok as-is */ if (s[0] == '{') { - if (regcurly(s, e, NULL)) { - return FALSE; - } - return TRUE; + return ! regcurly(s, e, NULL); } /* Here is '[': maybe we have a character class. Examine the guts */ @@ -4735,72 +4732,147 @@ S_intuit_more(pTHX_ char *s, char *e, * looks for. * */ - if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { - Size_t len; - /* khw: where did the magic number 4 come from?. This buffer - * was 4 times as large as tokenbuf in 1997, and had not - * changed since the code was first added */ - char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; + /* (Reserve tmpbuf[0] for future commits, hence +1) */ + char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ]; + char * s_after_ident; + + /* scan_ident returns NULL if the input looks like an identifier + * that is illegal, e.g., it is too long or is like $001. */ + s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), + CHECK_ONLY); + if (s_after_ident == NULL) { + + /* An illegal identifier means this can't be a subscript; + * it's an error or it could be a charclass */ + return false; + } + + Size_t len; /* (C++ forbids joining these 2 lines) */ + len = strlen(tmpbuf + 1); + + /* If it doesn't look like an identifier at all, scan_ident will + * set tmpbuf[1] to NUL. This is either an error or a character + * class. */ + if (len == 0) { + return false; + } + + /* If there is extra stuff in the source, like braces, it means + * this is almost definitely intended to be an identifier */ + bool decorated; + decorated = (Size_t) (s_after_ident - s) > len; + + if (isDIGIT_A(tmpbuf[1])) { + + /* &41 and &4b are illegal subroutine names so is an error or + * a charclass */ + if (s[0] == '&') { + return false; + } - if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) + /* Here, matches [$@]\d+. If the next input character is a + * \w, we would have something like $456x, which is an illegal + * identifer, so is an error or a charclass */ + if ( ! decorated + && isWORDCHAR_lazy_if_safe(s_after_ident, + PL_bufend, UTF)) { - /* An illegal identifier means this can't be a subscript; - * it's an error or it could be a charclass */ return false; } - len = strlen(tmpbuf); + /* We don't get here if this potential identifier starts with + * leading zeros, due to the logic in scan_ident. */ + assert(len == 1 || tmpbuf[0] != '0'); - /* khw: This only looks at global variables; lexicals came - * later, and this hasn't been updated. Ouch!! */ - if ( len > 1 - && gv_fetchpvn_flags(tmpbuf, - len, - UTF ? SVf_UTF8 : 0, - SVt_PV)) + /* The chances are vanishingly small that someone is going to + * want [$0] to expand to the program's name in a character + * class. But, what would the program's name be doing as part + * of a subscript either? The only likely scenario is that + * this is meant to be a charclass matching either '$' or '0'. + * */ + if (tmpbuf[1] == '0') { + return false; + } + + /* Here it is either something like $1 which is supposed to + * match either dollar or 1, or it is supposed to expand to + * what is in $1 left over from a capturing group from the + * previous pattern match. In the latter case, it could be + * either a part of wanting to calculate a subscript, or to + * use as the contents of as part of the character class. + * Larger (undecorated) numbers are much less likely to have + * had capturing groups, so they lean more towards a + * charclass. 100 is what this function has traditionally + * used for len>1; khw thinks there is no bias one way or the + * other for length 1 ones. But has chosen 100 for decorated + * identifiers + * + * XXX long enough identifiers could probably return false + * immediately here, rather than using weights. */ + if (decorated || len > 1) { + weight -= 100; + } + } + else if (len > 1) { + /* See if there is a known identifier of the given kind. For + * arrays, this might also be a reference to one of its + * elements. XXX Maybe the latter should require a following + * '[' */ + if ( is_existing_identifier(tmpbuf, len, s[0], UTF) + || ( s[0] == '$' + && is_existing_identifier(tmpbuf, len, '@', UTF))) { weight -= 100; - /* khw: Below we keep track of repeated characters; People - * rarely say qr/[aba]/, as the second a is pointless. - * (Some do it though as a mnemonic that is meaningful to - * them.) But generally, repeated characters make things - * more likely to be a charclass. But here, this an - * identifier so likely a subscript. Its spelling should - * be irrelevant to the repeated characters test. So, we - * should advance past it. Suppose it is a hash element, - * like $subscripts{$which}. We should advance past the - * braces and key */ + /* khw: Below we keep track of repeated characters; + * People rarely say qr/[aba]/, as the second a is + * pointless. (Some do it though as a mnemonic that is + * meaningful to them.) But generally, repeated characters + * make things more likely to be a charclass. But here, + * this an identifier so likely a subscript. Its spelling + * should be irrelevant to the repeated characters test. + * So, we should advance past it. Suppose it is a hash + * element, like $subscripts{$which}. We should advance + * past the braces and key */ } - else { - /* Not a multi-char identifier already known in the - * program; is somewhat likely to be a subscript. - * - * khw: Our test suite contains several constructs like - * [$A-Z]. Excluding length 1 identifiers in the - * conditional above means such are much less likely to be - * mistaken for subscripts. I would argue that if the next - * character is a '-' followed by an alpha, that would make - * it much more likely to be a charclass. It would only - * make sense to be an expression if that alpha string is a - * bareword with meaning; something like [$A-ord] */ + else { /* Isn't a known identifier */ + /* Under strict, this means an error. */ + if (under_strict_vars) { + return false; + } + + /* Otherwise still somewhat likely to be a subscript */ weight -= 10; } } - else if ( s[0] == '$' - && s[1] - && memCHRs("[#!%*<>()-=", s[1])) + else if ( len == 1 + && s[0] == '$' + && memCHRs("[#!%*<>()-=", tmpbuf[1])) { /* Here we have what could be a punctuation variable. If the * next character after it is a closing bracket, it makes it * quite likely to be that, and hence a subscript. If it is * something else, more mildly a subscript */ - if (/*{*/ memCHRs("])} =", s[2])) + if (/*{*/ memCHRs("])} =", tmpbuf[2])) weight -= 10; else weight -= 1; } + else { + /* Not a multi-char identifier already known in the program; + * is somewhat likely to be a subscript. + * + * khw: Our test suite contains several constructs like + * [$A-Z]. Excluding length 1 identifiers in the conditional + * above means such are much less likely to be mistaken for + * subscripts. I would argue that if the next character is a + * '-' followed by an alpha, that would make it much more + * likely to be a charclass. It would only make sense to be + * an expression if that alpha string is a bareword with + * meaning; something like [$A-ord] */ + weight -= 10; + } break; /* khw: [:blank:] strongly indicates a charclass */ @@ -4812,66 +4884,64 @@ S_intuit_more(pTHX_ char *s, char *e, * \? must be subscript for things like \d, but not \a. */ - case '\\': - if (s[1]) { - if (memCHRs("wds]", s[1])) { - weight += 100; /* \w \d \s => strongly charclass */ - /* khw: \] can't happen, as any ']' is beyond our search. - * Why not \W \D \S \h \v, etc as well? Should they have - * the same weights as \w \d \s or should all or some be - * in the 'abcfnrtvx' below? */ - } else if (seen[(U8)'\''] || seen[(U8)'"']) { - weight += 1; - /* khw: This is problematic. Enough so, that I misread - * it, and added a wrong comment about what it does in - * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it - * doesn't look at the current character. What it - * actually does is: if any quote has been seen in the - * parse, don't do the rest of the else's below, but for - * every subsequent backslashed character encountered - * (except \0 \w \s \d), increment the weight to lean a - * bit more towards being a charclass. That means that - * every backslash sequence following the first occurrence - * of a quote increments the weight regardless of what the - * sequence is. Again, \0 \w \d and \s are not controlled - * by this else, so they change the weight by a lot more. - * But what makes them so special that they aren't subject - * to this. Any why does having a quote change the - * behavior from then on. And why only backslashed - * sequences get this treatment? This code has been - * unchanged since this function was added in 1993. I - * don't get it. Instead, it does seem to me that it is - * especially unlikely to repeat a quote in a charclass, - * but that having just a single quote is indicative of a - * charclass, and having pairs of quotes is indicative of - * a subscript. Similarly for things that could indicate - * nesting of braces or parens. */ - } - else if (memCHRs("abcfnrtvx", s[1])) - weight += 40; /* \n, etc => charclass */ - /* khw: Why not \e etc as well? */ - else if (isDIGIT(s[1])) { - weight += 40; /* \123 => charclass */ - while (s[1] && isDIGIT(s[1])) - s++; - } - - /* khw: There are lots more possible escape sequences. Some, - * like \A,\z have no special meaning to charclasses, so might - * indicate a subscript, but I don't know what they would be - * doing there either. Some have been added to the language - * after this code was written, but no one thought to, or - * could wade through this function, to add them. Things like - * \p{} for properties, \N and \N{}, for example. - * - * It's problematic that \a is treated as plain 'a' for - * purposes of the 'seen' array. Whatever is matched by these - * backslashed sequences should not be added to 'seen'. That - * includes the backslash. */ - } - else /* \ followed by NUL strongly indicates character class */ + if (s[1] == '\0') { + /* \ followed by NUL strongly indicates character class */ weight += 100; + } + else if (memCHRs("wds]", s[1])) { + weight += 100; /* \w \d \s => strongly charclass */ + /* khw: \] can't happen, as any ']' is beyond our search. Why + * not \W \D \S \h \v, etc as well? Should they have the same + * weights as \w \d \s or should all or some be in the + * 'abcfnrtvx' below? */ + } + else if (seen[(U8)'\''] || seen[(U8)'"']) { + weight += 1; + /* khw: This is problematic. Enough so, that I misread it, + * and added a wrong comment about what it does in + * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it + * doesn't look at the current character. What it actually + * does is: if any quote has been seen in the parse, don't do + * the rest of the else's below, but for every subsequent + * backslashed character encountered (except \0 \w \s \d), + * increment the weight to lean a bit more towards being a + * charclass. That means that every backslash sequence + * following the first occurrence of a quote increments the + * weight regardless of what the sequence is. Again, \0 \w \d + * and \s are not controlled by this else, so they change the + * weight by a lot more. But what makes them so special that + * they aren't subject to this. Any why does having a quote + * change the behavior from then on. And why only backslashed + * sequences get this treatment? This code has been unchanged + * since this function was added in 1993. I don't get it. + * Instead, it does seem to me that it is especially unlikely + * to repeat a quote in a charclass, but that having just a + * single quote is indicative of a charclass, and having pairs + * of quotes is indicative of a subscript. Similarly for + * things that could indicate nesting of braces or parens. */ + } + else if (memCHRs("abcfnrtvx", s[1])) + weight += 40; /* \n, etc => charclass */ + /* khw: Why not \e etc as well? */ + else if (isDIGIT(s[1])) { + weight += 40; /* \123 => charclass */ + while (s[1] && isDIGIT(s[1])) + s++; + } + + /* khw: There are lots more possible escape sequences. Some, like + * \A,\z have no special meaning to charclasses, so might indicate + * a subscript, but I don't know what they would be doing there + * either. Some have been added to the language after this code + * was written, but no one thought to, or could wade through this + * function, to add them. Things like \p{} for properties, \N and + * \N{}, for example. + * + * It's problematic that \a is treated as plain 'a' for purposes + * of the 'seen' array. Whatever is matched by these backslashed + * sequences should not be added to 'seen'. That includes the + * backslash. */ break; case '-':