-
Notifications
You must be signed in to change notification settings - Fork 601
toke.c: S_intuit_more: More effectively use S_scan_ident #23880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: blead
Are you sure you want to change the base?
Changes from 6 commits
3b86b9c
b302a19
eb6d467
5eeec5e
2aeccb9
5d79145
26848f0
7eb08b7
b61e5b8
ac4d84e
5e9f082
9f31304
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * written, and regcurly never required a comma, as in {0}. Probably it is | ||
| * ok as-is */ | ||
| if (s[0] == '{') { | ||
| if (regcurly(s, e, NULL)) { | ||
| return FALSE; | ||
| } | ||
| return TRUE; | ||
| return ! regcurly(s, e, NULL); | ||
| } | ||
|
|
||
| /* Here is '[': maybe we have a character class. Examine the guts */ | ||
|
|
@@ -4735,27 +4732,28 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * looks for. | ||
| * | ||
| */ | ||
| if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { | ||
| Size_t len; | ||
|
|
||
| /* khw: where did the magic number 4 come from?. This buffer | ||
| * was 4 times as large as tokenbuf in 1997, and had not | ||
| * changed since the code was first added */ | ||
| char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; | ||
|
|
||
| if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) | ||
| /* (Reserve tmpbuf[0] for future commits) */ | ||
| if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), | ||
| CHECK_ONLY)) | ||
| { | ||
| /* An illegal identifier means this can't be a subscript; | ||
| * it's an error or it could be a charclass */ | ||
| return false; | ||
| } | ||
|
|
||
| len = strlen(tmpbuf); | ||
| Size_t len; /* (C++ forbids joining these 2 lines) */ | ||
| len = strlen(tmpbuf + 1); | ||
|
|
||
| /* khw: This only looks at global variables; lexicals came | ||
| * later, and this hasn't been updated. Ouch!! */ | ||
| if ( len > 1 | ||
| && gv_fetchpvn_flags(tmpbuf, | ||
| && gv_fetchpvn_flags(tmpbuf + 1, | ||
| len, | ||
| UTF ? SVf_UTF8 : 0, | ||
| SVt_PV)) | ||
|
|
@@ -4773,6 +4771,20 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * like $subscripts{$which}. We should advance past the | ||
| * braces and key */ | ||
| } | ||
| else if (len == 1) { | ||
| if ( s[0] == '$' | ||
| && s[1] | ||
| && memCHRs("[#!%*<>()-=", tmpbuf[1])) | ||
| { | ||
| /* Here we have what could be a punctuation variable. If the | ||
| * next character after it is a closing bracket, it makes it | ||
| * quite likely to be that, and hence a subscript. If it is | ||
| * something else, more mildly a subscript */ | ||
| if (/*{*/ memCHRs("])} =", tmpbuf[2])) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. accidentally deleted previous comment :( little bit confused: this happens in branch when strlen of tmpbuf is already checked to be 1, so tmpbuf[2] should \0 |
||
| weight -= 10; | ||
| else | ||
| weight -= 1; | ||
| } | ||
| else { | ||
| /* Not a multi-char identifier already known in the | ||
| * program; is somewhat likely to be a subscript. | ||
|
|
@@ -4788,19 +4800,6 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| weight -= 10; | ||
| } | ||
| } | ||
| else if ( s[0] == '$' | ||
| && s[1] | ||
| && memCHRs("[#!%*<>()-=", s[1])) | ||
| { | ||
| /* Here we have what could be a punctuation variable. If the | ||
| * next character after it is a closing bracket, it makes it | ||
| * quite likely to be that, and hence a subscript. If it is | ||
| * something else, more mildly a subscript */ | ||
| if (/*{*/ memCHRs("])} =", s[2])) | ||
| weight -= 10; | ||
| else | ||
| weight -= 1; | ||
| } | ||
| break; | ||
|
|
||
| /* khw: [:blank:] strongly indicates a charclass */ | ||
|
|
@@ -4814,64 +4813,63 @@ S_intuit_more(pTHX_ char *s, char *e, | |
|
|
||
|
|
||
| case '\\': | ||
| if (s[1]) { | ||
| if (memCHRs("wds]", s[1])) { | ||
| weight += 100; /* \w \d \s => strongly charclass */ | ||
| /* khw: \] can't happen, as any ']' is beyond our search. | ||
| * Why not \W \D \S \h \v, etc as well? Should they have | ||
| * the same weights as \w \d \s or should all or some be | ||
| * in the 'abcfnrtvx' below? */ | ||
| } else if (seen[(U8)'\''] || seen[(U8)'"']) { | ||
| weight += 1; | ||
| /* khw: This is problematic. Enough so, that I misread | ||
| * it, and added a wrong comment about what it does in | ||
| * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it | ||
| * doesn't look at the current character. What it | ||
| * actually does is: if any quote has been seen in the | ||
| * parse, don't do the rest of the else's below, but for | ||
| * every subsequent backslashed character encountered | ||
| * (except \0 \w \s \d), increment the weight to lean a | ||
| * bit more towards being a charclass. That means that | ||
| * every backslash sequence following the first occurrence | ||
| * of a quote increments the weight regardless of what the | ||
| * sequence is. Again, \0 \w \d and \s are not controlled | ||
| * by this else, so they change the weight by a lot more. | ||
| * But what makes them so special that they aren't subject | ||
| * to this. Any why does having a quote change the | ||
| * behavior from then on. And why only backslashed | ||
| * sequences get this treatment? This code has been | ||
| * unchanged since this function was added in 1993. I | ||
| * don't get it. Instead, it does seem to me that it is | ||
| * especially unlikely to repeat a quote in a charclass, | ||
| * but that having just a single quote is indicative of a | ||
| * charclass, and having pairs of quotes is indicative of | ||
| * a subscript. Similarly for things that could indicate | ||
| * nesting of braces or parens. */ | ||
| } | ||
| else if (memCHRs("abcfnrtvx", s[1])) | ||
| weight += 40; /* \n, etc => charclass */ | ||
| /* khw: Why not \e etc as well? */ | ||
| else if (isDIGIT(s[1])) { | ||
| weight += 40; /* \123 => charclass */ | ||
| while (s[1] && isDIGIT(s[1])) | ||
| s++; | ||
| } | ||
|
|
||
| /* khw: There are lots more possible escape sequences. Some, | ||
| * like \A,\z have no special meaning to charclasses, so might | ||
| * indicate a subscript, but I don't know what they would be | ||
| * doing there either. Some have been added to the language | ||
| * after this code was written, but no one thought to, or | ||
| * could wade through this function, to add them. Things like | ||
| * \p{} for properties, \N and \N{}, for example. | ||
| * | ||
| * It's problematic that \a is treated as plain 'a' for | ||
| * purposes of the 'seen' array. Whatever is matched by these | ||
| * backslashed sequences should not be added to 'seen'. That | ||
| * includes the backslash. */ | ||
| } | ||
| else /* \ followed by NUL strongly indicates character class */ | ||
| if (s[1] == '\0') { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally: great change, maybe even add Curious question: common pattern is condition |
||
| /* \ followed by NUL strongly indicates character class */ | ||
| weight += 100; | ||
| } | ||
| else if (memCHRs("wds]", s[1])) { | ||
| weight += 100; /* \w \d \s => strongly charclass */ | ||
| /* khw: \] can't happen, as any ']' is beyond our search. Why | ||
| * not \W \D \S \h \v, etc as well? Should they have the same | ||
| * weights as \w \d \s or should all or some be in the | ||
| * 'abcfnrtvx' below? */ | ||
| } | ||
| else if (seen[(U8)'\''] || seen[(U8)'"']) { | ||
| weight += 1; | ||
| /* khw: This is problematic. Enough so, that I misread it, | ||
| * and added a wrong comment about what it does in | ||
| * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it | ||
| * doesn't look at the current character. What it actually | ||
| * does is: if any quote has been seen in the parse, don't do | ||
| * the rest of the else's below, but for every subsequent | ||
| * backslashed character encountered (except \0 \w \s \d), | ||
| * increment the weight to lean a bit more towards being a | ||
| * charclass. That means that every backslash sequence | ||
| * following the first occurrence of a quote increments the | ||
| * weight regardless of what the sequence is. Again, \0 \w \d | ||
| * and \s are not controlled by this else, so they change the | ||
| * weight by a lot more. But what makes them so special that | ||
| * they aren't subject to this. Any why does having a quote | ||
| * change the behavior from then on. And why only backslashed | ||
| * sequences get this treatment? This code has been unchanged | ||
| * since this function was added in 1993. I don't get it. | ||
| * Instead, it does seem to me that it is especially unlikely | ||
| * to repeat a quote in a charclass, but that having just a | ||
| * single quote is indicative of a charclass, and having pairs | ||
| * of quotes is indicative of a subscript. Similarly for | ||
| * things that could indicate nesting of braces or parens. */ | ||
| } | ||
| else if (memCHRs("abcfnrtvx", s[1])) | ||
| weight += 40; /* \n, etc => charclass */ | ||
| /* khw: Why not \e etc as well? */ | ||
| else if (isDIGIT(s[1])) { | ||
| weight += 40; /* \123 => charclass */ | ||
| while (s[1] && isDIGIT(s[1])) | ||
| s++; | ||
| } | ||
|
|
||
| /* khw: There are lots more possible escape sequences. Some, like | ||
| * \A,\z have no special meaning to charclasses, so might indicate | ||
| * a subscript, but I don't know what they would be doing there | ||
| * either. Some have been added to the language after this code | ||
| * was written, but no one thought to, or could wade through this | ||
| * function, to add them. Things like \p{} for properties, \N and | ||
| * \N{}, for example. | ||
| * | ||
| * It's problematic that \a is treated as plain 'a' for purposes | ||
| * of the 'seen' array. Whatever is matched by these backslashed | ||
| * sequences should not be added to 'seen'. That includes the | ||
| * backslash. */ | ||
| break; | ||
|
|
||
| case '-': | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great change. Former pattern appears to be used in other places, marking such cleanup as my next TODO once my current PR is merged.