S_scan_ident: Char in middle need not be IDStart

khwilliamson · khwilliamson · commit c0ca4c8d4969 · 2025-10-26T13:58:09.000-06:00
This fixes a bug in this function, in which it required the second
character in an identifier to be IDStart, instead of IDCont.  This
hasn't been caught because most identifiers are ASCII, and generally for
the purposes of this function in the ASCII range, all \w characters can
be IDStart.
diff --git a/t/comp/parser.t b/t/comp/parser.t
@@ -8,7 +8,7 @@ BEGIN {
     chdir 't' if -d 't';
 }
 
-print "1..192\n";
+print "1..193\n";
 
 sub failed {
     my ($got, $expected, $name) = @_;
@@ -673,6 +673,18 @@ is $@, "", 'substr keys assignment';
     is ($@, "", "Handles all numeric package component after ::");
 }
 
+{  # Regression test: \x{1df8} as the 2nd character of an identifier inside ${...}
+    my $expected = "this is the way the identifier ends; not with a bang";
+    my $result;
+    eval "use utf8; my \$e\x{1df8}claire = '$expected'; \$result = \${e\x{1df8}claire}";
+    if ($@) {  # the eval'd code failed to compile or run
+        failed($@, "no error", "Didn't crash");
+    }
+    else {  # it ran; check the value read back through the ${...} form
+        is ($result, $expected, "Parser can handle a continuation as 2nd char");
+    }
+}
+
 # Add new tests HERE (above this line)
 
 # bug #74022: Loop on characters in \p{OtherIDContinue}
diff --git a/toke.c b/toke.c
@@ -10933,26 +10933,26 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
              * be conflated with a control character identifier. */
             if (advance) {
 
-                /* Now parse the normal identifier.
-                 *
-                 * khw: The code below is buggy because we already have parsed
-                 * and copied the first character of it.  The next character
-                 * could be any IDCONT one, not just an IDFIRST */
+                /* Now parse the normal identifier.  But note, we already have
+                 * parsed and copied the first character of it.  That means we
+                 * are jumping into the middle; so tell that to parse_ident.
+                 * */
                 d += advance;
                 s = parse_ident(s, PL_bufend, &d, e, is_utf8,
-                                (ALLOW_PACKAGE | CHECK_DOLLAR));
+                                (ALLOW_PACKAGE|CHECK_DOLLAR)|IDCONT_first_OK);
             }
             else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
 
                 /* Now parse the control character identifier.  Again, we have
-                 * already copied the first character. */
+                 * already copied the first character.  This routine is
+                 * sufficiently chummy with parse_ident to know that when we
+                 * say the string isn't UTF-8, it will do the right thing in
+                 * looking only for ASCII \w characters as identifier
+                 * continuations */
                 d++;
-                while (isWORDCHAR(*s) && d < e) {
-                    *d++ = *s++;
-                }
-                if (d >= e)
-                    croak("%s", ident_too_long);
-                *d = '\0';
+                s = parse_ident(s, PL_bufend, &d, e,
+                                false,  /* Don't allow UTF-8 */
+                                IDCONT_first_OK);
             }
 
             tmp_copline = CopLINE(PL_curcop);