Perl · khwilliamson · Aug 17, 2025 · Sep 5, 2025 · Sep 5, 2025 · Oct 27, 2025
diff --git a/toke.c b/toke.c
@@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e,
      * written, and regcurly never required a comma, as in {0}.  Probably it is
      * ok as-is */
     if (s[0] == '{') {
-        if (regcurly(s, e, NULL)) {
-            return FALSE;
-        }
-        return TRUE;
+        return ! regcurly(s, e, NULL);
     }
 
     /* Here is '[': maybe we have a character class.  Examine the guts */
@@ -4735,27 +4732,28 @@ S_intuit_more(pTHX_ char *s, char *e,
              * looks for.
              *
              */
-            if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
-                Size_t len;
 
                 /* khw: where did the magic number 4 come from?.  This buffer
                  * was 4 times as large as tokenbuf in 1997, and had not
                  * changed since the code was first added */
                 char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
 
-                if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY))
+                /* (Reserve tmpbuf[0] for future commits) */
+                if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
+                                 CHECK_ONLY))
                 {
                     /* An illegal identifier means this can't be a subscript;
                      * it's an error or it could be a charclass */
                     return false;
                 }
 
-                len = strlen(tmpbuf);
+                Size_t len; /* (C++ forbids joining these 2 lines) */
+                len = strlen(tmpbuf + 1);
 
                 /* khw: This only looks at global variables; lexicals came
                  * later, and this hasn't been updated.  Ouch!! */
                 if (   len > 1
-                    && gv_fetchpvn_flags(tmpbuf,
+                    && gv_fetchpvn_flags(tmpbuf + 1,
                                          len,
                                          UTF ? SVf_UTF8 : 0,
                                          SVt_PV))
@@ -4773,6 +4771,20 @@ S_intuit_more(pTHX_ char *s, char *e,
                      * like $subscripts{$which}.  We should advance past the
                      * braces and key */
                 }
+                else if (len == 1) {
+                 if (   s[0] == '$'
+                     && s[1]
+                     && memCHRs("[#!%*<>()-=", tmpbuf[1]))
+            {
+                /* Here we have what could be a punctuation variable.  If the
+                 * next character after it is a closing bracket, it makes it
+                 * quite likely to be that, and hence a subscript.  If it is
+                 * something else, more mildly a subscript */
+                if (/*{*/ memCHRs("])} =", tmpbuf[2]))
+                    weight -= 10;
+                else
+                    weight -= 1;
+            }
                 else {
                     /* Not a multi-char identifier already known in the
                      * program; is somewhat likely to be a subscript.
@@ -4788,19 +4800,6 @@ S_intuit_more(pTHX_ char *s, char *e,
                     weight -= 10;
                 }
             }
-            else if (   s[0] == '$'
-                     && s[1]
-                     && memCHRs("[#!%*<>()-=", s[1]))
-            {
-                /* Here we have what could be a punctuation variable.  If the
-                 * next character after it is a closing bracket, it makes it
-                 * quite likely to be that, and hence a subscript.  If it is
-                 * something else, more mildly a subscript */
-                if (/*{*/ memCHRs("])} =", s[2]))
-                    weight -= 10;
-                else
-                    weight -= 1;
-            }
             break;
 
           /* khw:  [:blank:] strongly indicates a charclass */
@@ -4814,64 +4813,63 @@ S_intuit_more(pTHX_ char *s, char *e,
 
 
           case '\\':
-            if (s[1]) {
-                if (memCHRs("wds]", s[1])) {
-                    weight += 100;  /* \w \d \s => strongly charclass */
-                    /* khw: \] can't happen, as any ']' is beyond our search.
-                     * Why not \W \D \S \h \v, etc as well?  Should they have
-                     * the same weights as \w \d \s or should all or some be
-                     * in the 'abcfnrtvx' below? */
-                } else if (seen[(U8)'\''] || seen[(U8)'"']) {
-                    weight += 1;
-                    /* khw: This is problematic.  Enough so, that I misread
-                     * it, and added a wrong comment about what it does in
-                     * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
-                     * doesn't look at the current character.  What it
-                     * actually does is: if any quote has been seen in the
-                     * parse, don't do the rest of the else's below, but for
-                     * every subsequent backslashed character encountered
-                     * (except \0 \w \s \d), increment the weight to lean a
-                     * bit more towards being a charclass.  That means that
-                     * every backslash sequence following the first occurrence
-                     * of a quote increments the weight regardless of what the
-                     * sequence is.  Again, \0 \w \d and \s are not controlled
-                     * by this else, so they change the weight by a lot more.
-                     * But what makes them so special that they aren't subject
-                     * to this.  Any why does having a quote change the
-                     * behavior from then on.  And why only backslashed
-                     * sequences get this treatment?  This code has been
-                     * unchanged since this function was added in 1993.  I
-                     * don't get it.  Instead, it does seem to me that it is
-                     * especially unlikely to repeat a quote in a charclass,
-                     * but that having just a single quote is indicative of a
-                     * charclass, and having pairs of quotes is indicative of
-                     * a subscript.  Similarly for things that could indicate
-                     * nesting of braces or parens. */
-                }
-                else if (memCHRs("abcfnrtvx", s[1]))
-                    weight += 40;   /* \n, etc => charclass */
-                    /* khw: Why not \e etc as well? */
-                else if (isDIGIT(s[1])) {
-                    weight += 40;   /* \123 => charclass */
-                    while (s[1] && isDIGIT(s[1]))
-                        s++;
-                }
-
-                /* khw: There are lots more possible escape sequences.  Some,
-                 * like \A,\z have no special meaning to charclasses, so might
-                 * indicate a subscript, but I don't know what they would be
-                 * doing there either.  Some have been added to the language
-                 * after this code was written, but no one thought to, or
-                 * could wade through this function, to add them.  Things like
-                 * \p{} for properties, \N and \N{}, for example.
-                 *
-                 * It's problematic that \a is treated as plain 'a' for
-                 * purposes of the 'seen' array.  Whatever is matched by these
-                 * backslashed sequences should not be added to 'seen'.  That
-                 * includes the backslash. */
-            }
-            else /* \ followed by NUL strongly indicates character class */
+            if (s[1] == '\0') {
+                /* \ followed by NUL strongly indicates character class */
                 weight += 100;
+            }
+            else if (memCHRs("wds]", s[1])) {
+                weight += 100;  /* \w \d \s => strongly charclass */
+                /* khw: \] can't happen, as any ']' is beyond our search.  Why
+                 * not \W \D \S \h \v, etc as well?  Should they have the same
+                 * weights as \w \d \s or should all or some be in the
+                 * 'abcfnrtvx' below? */
+            }
+            else if (seen[(U8)'\''] || seen[(U8)'"']) {
+                weight += 1;
+                /* khw: This is problematic.  Enough so, that I misread it,
+                 * and added a wrong comment about what it does in
+                 * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
+                 * doesn't look at the current character.  What it actually
+                 * does is: if any quote has been seen in the parse, don't do
+                 * the rest of the else's below, but for every subsequent
+                 * backslashed character encountered (except \0 \w \s \d),
+                 * increment the weight to lean a bit more towards being a
+                 * charclass.  That means that every backslash sequence
+                 * following the first occurrence of a quote increments the
+                 * weight regardless of what the sequence is.  Again, \0 \w \d
+                 * and \s are not controlled by this else, so they change the
+                 * weight by a lot more.  But what makes them so special that
+                 * they aren't subject to this?  And why does having a quote
+                 * change the behavior from then on?  And why only backslashed
+                 * sequences get this treatment?  This code has been unchanged
+                 * since this function was added in 1993.  I don't get it.
+                 * Instead, it does seem to me that it is especially unlikely
+                 * to repeat a quote in a charclass, but that having just a
+                 * single quote is indicative of a charclass, and having pairs
+                 * of quotes is indicative of a subscript.  Similarly for
+                 * things that could indicate nesting of braces or parens. */
+            }
+            else if (memCHRs("abcfnrtvx", s[1]))
+                weight += 40;   /* \n, etc => charclass */
+                /* khw: Why not \e etc as well? */
+            else if (isDIGIT(s[1])) {
+                weight += 40;   /* \123 => charclass */
+                while (s[1] && isDIGIT(s[1]))
+                    s++;
+            }
+
+            /* khw: There are lots more possible escape sequences.  Some, like
+             * \A,\z have no special meaning to charclasses, so might indicate
+             * a subscript, but I don't know what they would be doing there
+             * either.  Some have been added to the language after this code
+             * was written, but no one thought to, or could wade through this
+             * function, to add them.  Things like \p{} for properties, \N and
+             * \N{}, for example.
+             *
+             * It's problematic that \a is treated as plain 'a' for purposes
+             * of the 'seen' array.  Whatever is matched by these backslashed
+             * sequences should not be added to 'seen'.  That includes the
+             * backslash. */
             break;
 
           case '-':