Skip to content
Open
156 changes: 77 additions & 79 deletions toke.c
Original file line number Diff line number Diff line change
Expand Up @@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e,
* written, and regcurly never required a comma, as in {0}. Probably it is
* ok as-is */
if (s[0] == '{') {
if (regcurly(s, e, NULL)) {
return FALSE;
}
return TRUE;
return ! regcurly(s, e, NULL);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great change. Former pattern appears to be used in other places, marking such cleanup as my next TODO once my current PR is merged.

}

/* Here is '[': maybe we have a character class. Examine the guts */
Expand Down Expand Up @@ -4735,27 +4732,28 @@ S_intuit_more(pTHX_ char *s, char *e,
* looks for.
*
*/
if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
Size_t len;

/* khw: where did the magic number 4 come from?. This buffer
* was 4 times as large as tokenbuf in 1997, and had not
* changed since the code was first added */
char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];

if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY))
/* (Reserve tmpbuf[0] for future commits) */
if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
CHECK_ONLY))
{
/* An illegal identifier means this can't be a subscript;
* it's an error or it could be a charclass */
return false;
}

len = strlen(tmpbuf);
Size_t len; /* (C++ forbids joining these 2 lines) */
len = strlen(tmpbuf + 1);

/* khw: This only looks at global variables; lexicals came
* later, and this hasn't been updated. Ouch!! */
if ( len > 1
&& gv_fetchpvn_flags(tmpbuf,
&& gv_fetchpvn_flags(tmpbuf + 1,
len,
UTF ? SVf_UTF8 : 0,
SVt_PV))
Expand All @@ -4773,6 +4771,20 @@ S_intuit_more(pTHX_ char *s, char *e,
* like $subscripts{$which}. We should advance past the
* braces and key */
}
else if (len == 1) {
if ( s[0] == '$'
&& s[1]
&& memCHRs("[#!%*<>()-=", tmpbuf[1]))
{
/* Here we have what could be a punctuation variable. If the
* next character after it is a closing bracket, it makes it
* quite likely to be that, and hence a subscript. If it is
* something else, more mildly a subscript */
if (/*{*/ memCHRs("])} =", tmpbuf[2]))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accidentally deleted previous comment :(

little bit confused: this happens in branch when strlen of tmpbuf is already checked to be 1, so tmpbuf[2] should \0

weight -= 10;
else
weight -= 1;
}
else {
/* Not a multi-char identifier already known in the
* program; is somewhat likely to be a subscript.
Expand All @@ -4788,19 +4800,6 @@ S_intuit_more(pTHX_ char *s, char *e,
weight -= 10;
}
}
else if ( s[0] == '$'
&& s[1]
&& memCHRs("[#!%*<>()-=", s[1]))
{
/* Here we have what could be a punctuation variable. If the
* next character after it is a closing bracket, it makes it
* quite likely to be that, and hence a subscript. If it is
* something else, more mildly a subscript */
if (/*{*/ memCHRs("])} =", s[2]))
weight -= 10;
else
weight -= 1;
}
break;

/* khw: [:blank:] strongly indicates a charclass */
Expand All @@ -4814,64 +4813,63 @@ S_intuit_more(pTHX_ char *s, char *e,


case '\\':
if (s[1]) {
if (memCHRs("wds]", s[1])) {
weight += 100; /* \w \d \s => strongly charclass */
/* khw: \] can't happen, as any ']' is beyond our search.
* Why not \W \D \S \h \v, etc as well? Should they have
* the same weights as \w \d \s or should all or some be
* in the 'abcfnrtvx' below? */
} else if (seen[(U8)'\''] || seen[(U8)'"']) {
weight += 1;
/* khw: This is problematic. Enough so, that I misread
* it, and added a wrong comment about what it does in
* 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
* doesn't look at the current character. What it
* actually does is: if any quote has been seen in the
* parse, don't do the rest of the else's below, but for
* every subsequent backslashed character encountered
* (except \0 \w \s \d), increment the weight to lean a
* bit more towards being a charclass. That means that
* every backslash sequence following the first occurrence
* of a quote increments the weight regardless of what the
* sequence is. Again, \0 \w \d and \s are not controlled
* by this else, so they change the weight by a lot more.
* But what makes them so special that they aren't subject
* to this. Any why does having a quote change the
* behavior from then on. And why only backslashed
* sequences get this treatment? This code has been
* unchanged since this function was added in 1993. I
* don't get it. Instead, it does seem to me that it is
* especially unlikely to repeat a quote in a charclass,
* but that having just a single quote is indicative of a
* charclass, and having pairs of quotes is indicative of
* a subscript. Similarly for things that could indicate
* nesting of braces or parens. */
}
else if (memCHRs("abcfnrtvx", s[1]))
weight += 40; /* \n, etc => charclass */
/* khw: Why not \e etc as well? */
else if (isDIGIT(s[1])) {
weight += 40; /* \123 => charclass */
while (s[1] && isDIGIT(s[1]))
s++;
}

/* khw: There are lots more possible escape sequences. Some,
* like \A,\z have no special meaning to charclasses, so might
* indicate a subscript, but I don't know what they would be
* doing there either. Some have been added to the language
* after this code was written, but no one thought to, or
* could wade through this function, to add them. Things like
* \p{} for properties, \N and \N{}, for example.
*
* It's problematic that \a is treated as plain 'a' for
* purposes of the 'seen' array. Whatever is matched by these
* backslashed sequences should not be added to 'seen'. That
* includes the backslash. */
}
else /* \ followed by NUL strongly indicates character class */
if (s[1] == '\0') {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally: great change, maybe even add break into then branches and get rid of else if branches

Curious question: common pattern is condition if (s[1]), so why not use if (! s[1] as well

/* \ followed by NUL strongly indicates character class */
weight += 100;
}
else if (memCHRs("wds]", s[1])) {
weight += 100; /* \w \d \s => strongly charclass */
/* khw: \] can't happen, as any ']' is beyond our search. Why
* not \W \D \S \h \v, etc as well? Should they have the same
* weights as \w \d \s or should all or some be in the
* 'abcfnrtvx' below? */
}
else if (seen[(U8)'\''] || seen[(U8)'"']) {
weight += 1;
/* khw: This is problematic. Enough so, that I misread it,
* and added a wrong comment about what it does in
* 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
* doesn't look at the current character. What it actually
* does is: if any quote has been seen in the parse, don't do
* the rest of the else's below, but for every subsequent
* backslashed character encountered (except \0 \w \s \d),
* increment the weight to lean a bit more towards being a
* charclass. That means that every backslash sequence
* following the first occurrence of a quote increments the
* weight regardless of what the sequence is. Again, \0 \w \d
* and \s are not controlled by this else, so they change the
* weight by a lot more. But what makes them so special that
* they aren't subject to this. Any why does having a quote
* change the behavior from then on. And why only backslashed
* sequences get this treatment? This code has been unchanged
* since this function was added in 1993. I don't get it.
* Instead, it does seem to me that it is especially unlikely
* to repeat a quote in a charclass, but that having just a
* single quote is indicative of a charclass, and having pairs
* of quotes is indicative of a subscript. Similarly for
* things that could indicate nesting of braces or parens. */
}
else if (memCHRs("abcfnrtvx", s[1]))
weight += 40; /* \n, etc => charclass */
/* khw: Why not \e etc as well? */
else if (isDIGIT(s[1])) {
weight += 40; /* \123 => charclass */
while (s[1] && isDIGIT(s[1]))
s++;
}

/* khw: There are lots more possible escape sequences. Some, like
* \A,\z have no special meaning to charclasses, so might indicate
* a subscript, but I don't know what they would be doing there
* either. Some have been added to the language after this code
* was written, but no one thought to, or could wade through this
* function, to add them. Things like \p{} for properties, \N and
* \N{}, for example.
*
* It's problematic that \a is treated as plain 'a' for purposes
* of the 'seen' array. Whatever is matched by these backslashed
* sequences should not be added to 'seen'. That includes the
* backslash. */
break;

case '-':
Expand Down