Skip to content

Commit 55164d8

Browse files
committed
S_parse_ident: Add ability to parse only IDFIRST strings
The identifiers parsed by this function include the ones most people would expect, but also ones that begin with a digit followed by ASCII \w characters. This commit adds a flag, IDFIRST_ONLY, so that the function doesn't recognize the latter type as an identifier.
1 parent fff0b52 commit 55164d8

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

toke.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th
171171
#define CHECK_KEYWORD (1 << 0)
172172
#define ALLOW_PACKAGE (1 << 1)
173173
#define CHECK_DOLLAR (1 << 2)
174+
#define IDFIRST_ONLY (1 << 3)
174175

175176
#ifdef DEBUGGING
176177
static const char* const lex_state_names[] = {
@@ -10313,9 +10314,12 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1031310314
* 1) A normal identifier whose first character matches IDFIRST followed
1031410315
* by any number of characters which match IDCONT.
1031510316
* 2) An identifier that begins with an ASCII digit followed by any number
10316-
* of ASCII \w characters
10317-
*
10318-
* The function copies the identifier into the destination starting at *d
10317+
* of ASCII \w characters. This type can be prohibited, so that
10318+
* anything that doesn't match type 1) is not considered an identifier.
10319+
*/
10320+
const bool idfirst_only = flags & IDFIRST_ONLY;
10321+
10322+
/* The function copies the identifier into the destination starting at *d
1031910323
* (whose upper bound is 'e') and advances *d to point to just beyond the
1032010324
* end of the identifier, setting **d to a NUL character.
1032110325
*
@@ -10344,15 +10348,18 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1034410348
* Unicode definition only when UTF-8 is in effect. We have to check
1034510349
* for the subset before checking for the superset. */
1034610350
Size_t advance;
10347-
if (is_utf8 && (advance = isIDFIRST_utf8_safe(s, s_end))) {
10351+
if ( (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8))
10352+
&& (is_utf8 || idfirst_only))
10353+
{
1034810354
const char *this_start = s;
1034910355
s += advance;
1035010356

1035110357
/* Find the end of the identifier by accumulating characters until
1035210358
* find a non-identifier character */
1035310359
while (s < s_end) {
10354-
advance = isIDCONT_utf8_safe((const U8*) s,
10355-
(const U8*) s_end);
10360+
advance = isIDCONT_lazy_if_safe((const U8*) s,
10361+
(const U8*) s_end,
10362+
is_utf8);
1035610363
if (advance == 0) { /* Not an identifier character */
1035710364
break;
1035810365
}
@@ -10371,7 +10378,7 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1037110378
Copy(this_start, *d, this_length, char);
1037210379
*d += this_length;
1037310380
}
10374-
else if ( isWORDCHAR_A(*s) ) {
10381+
else if (! idfirst_only && isWORDCHAR_A(*s) ) {
1037510382

1037610383
/* This is the superset; it accepts \w+, including an initial
1037710384
* digit */

0 commit comments

Comments
 (0)