@@ -4545,19 +4545,42 @@ S_intuit_more(pTHX_ char *s, char *e)
45454545 /* Here is '[': maybe we have a character class. Examine the guts */
45464546 s ++ ;
45474547
4548+ /* khw: If the context of this call is $foo[...], we may be able to avoid
4549+ * the heuristics below. The possibilities are:
4550+ * strict @foo $foo
4551+ * vars? exists exists
4552+ * y n n This is an error; return false now
4553+ * y n y must be a a charclass
4554+ * y y n must be a a subscript
4555+ * y y y ambiguous; do heuristics below
4556+ * n n n ambiguous; do heuristics below
4557+ * n n y ambiguous; do heuristics below, but I
4558+ * wonder if the initial bias should be a
4559+ * little towards charclass
4560+ * n y n ambiguous; do heuristics below, but I
4561+ * wonder if the initial bias should be a
4562+ * little towards subscript
4563+ * n y y ambiguous; do heuristics below
4564+ */
4565+
45484566 /* '^' implies a character class; An empty '[]' isn't legal, but it does
45494567 * mean there isn't more to come */
45504568 if (s [0 ] == ']' || s [0 ] == '^' )
45514569 return FALSE;
45524570
4553- /* Find matching ']'. khw: This means any s[1] below is guaranteed to
4554- * exist */
4571+ /* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4572+ * matches the '['. In order to account for the possibility of the ']'
4573+ * being inside the scope of \Q or preceded by an even number of
4574+ * backslashes, this should be rewritten */
45554575 const char * const send = (char * ) memchr (s , ']' , e - s );
45564576 if (! send ) /* has to be an expression */
45574577 return TRUE;
45584578
45594579 /* If the construct consists entirely of one or two digits, call it a
4560- * subscript. */
4580+ * subscript.
4581+ *
4582+ * khw: No one writes 03 to mean 3. Any string of digits beginning with
4583+ * '0' is likely to be a charclass, including length 2 ones. */
45614584 if (isDIGIT (s [0 ]) && send - s <= 2 && (send - s == 1 || (isDIGIT (s [1 ])))) {
45624585 return TRUE;
45634586 }
@@ -4613,15 +4636,42 @@ S_intuit_more(pTHX_ char *s, char *e)
46134636 char tmpbuf [sizeof PL_tokenbuf * 4 ];
46144637 scan_ident (s , tmpbuf , sizeof tmpbuf , FALSE);
46154638 len = (int )strlen (tmpbuf );
4639+
4640+ /* khw: This only looks at global variables; lexicals came
4641+ * later, and this hasn't been updated. Ouch!! */
46164642 if ( len > 1
46174643 && gv_fetchpvn_flags (tmpbuf ,
46184644 len ,
46194645 UTF ? SVf_UTF8 : 0 ,
46204646 SVt_PV ))
4647+ {
46214648 weight -= 100 ;
4622- else /* Not a multi-char identifier already known in the
4623- program; is somewhat likely to be a subscript */
4649+
4650+ /* khw: Below we keep track of repeated characters; People
4651+ * rarely say qr/[aba]/, as the second a is pointless.
4652+ * (Some do it though as a mnemonic that is meaningful to
4653+ * them.) But generally, repeated characters makes things
4654+ * more likely to be a charclass. But we have here that
4655+ * this an identifier so likely a subscript. Its spelling
4656+ * should be irrelevant to the repeated characters test.
4657+ * So, we should advance past it. Suppose it is a hash
4658+ * element, like $subscripts{$which}. We should advance
4659+ * past the braces and key */
4660+ }
4661+ else {
4662+ /* Not a multi-char identifier already known in the
4663+ * program; is somewhat likely to be a subscript.
4664+ *
4665+ * khw: Our test suite contains several constructs like
4666+ * [$A-Z]. Excluding length 1 identifiers in the
4667+ * conditional above means such are much less likely to be
4668+ * mistaken for subscripts. I would argue that if the next
4669+ * character is a '-' followed by an alpha, that would make
4670+ * it much more likely to be a charclass. It would only
4671+ * make sense to be an expression if that alpha string is a
4672+ * bareword with meaning; something like [$A-ord] */
46244673 weight -= 10 ;
4674+ }
46254675 }
46264676 else if ( s [0 ] == '$'
46274677 && s [1 ]
@@ -4638,13 +4688,43 @@ S_intuit_more(pTHX_ char *s, char *e)
46384688 }
46394689 break ;
46404690
4691+ /* khw: [:blank:] strongly indicates a charclass */
4692+
46414693 case '\\' :
46424694 if (s [1 ]) {
4643- if (memCHRs ("wds]" , s [1 ]))
4695+ if (memCHRs ("wds]" , s [1 ])) {
46444696 weight += 100 ; /* \w \d \s => strongly charclass */
4645- /* khw: Why not \W \D \S \h \v, etc as well? */
4646- else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ])
4647- weight += 1 ; /* \' => mildly charclass */
4697+ /* khw: \] can't happen, as any ']' is beyond our search.
4698+ * Why not \W \D \S \h \v, etc as well? Should they have
4699+ * the same weights as \w \d \s or should all or some be
4700+ * in the 'abcfnrtvx' below? */
4701+ } else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ]) {
4702+ weight += 1 ;
4703+ /* khw: This is problematic. Enough so, that I misread
4704+ * it, and added a wrong comment about what it does in
4705+ * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
4706+ * doesn't look at the current character. What it
4707+ * actually does is: if any quote has been seen in the
4708+ * parse, don't do the rest of the else's below, but for
4709+ * every subsequent backslashed character encountered
4710+ * (except \0 \w \s \d), increment the weight to lean a
4711+ * bit more towards being a charclass. That means that
4712+ * every backslash sequence following the first occurrence
4713+ * of a quote increments the weight regardless of what the
4714+ * sequence is. Again, \0 \w \d and \s are not controlled
4715+ * by this else, so they change the weight by a lot more.
4716+ * But what makes them so special that they aren't subject
4717+ * to this. Any why does having a quote change the
4718+ * behavior from then on. And why only backslashed
4719+ * sequences get this treatment? This code has been
4720+ * unchanged since this function was added in 1993. I
4721+ * don't get it. Instead, it does seem to me that it is
4722+ * especially unlikely to repeat a quote in a charclass,
4723+ * but that having just a single quote is indicative of a
4724+ * charclass, and having pairs of quotes is indicative of
4725+ * a subscript. Similarly for things that could indicate
4726+ * nesting of braces or parens. */
4727+ }
46484728 else if (memCHRs ("abcfnrtvx" , s [1 ]))
46494729 weight += 40 ; /* \n, etc => charclass */
46504730 /* khw: Why not \e etc as well? */
@@ -4653,6 +4733,19 @@ S_intuit_more(pTHX_ char *s, char *e)
46534733 while (s [1 ] && isDIGIT (s [1 ]))
46544734 s ++ ;
46554735 }
4736+
4737+ /* khw: There are lots more possible escape sequences. Some,
4738+ * like \A,\z have no special meaning to charclasses, so might
4739+ * indicate a subscript, but I don't know what they would be
4740+ * doing there either. Some have been added to the language
4741+ * after this code was written, but no one thought to, or
4742+ * could wade through this function, to add them. Things like
4743+ * \p{} for properties, \N and \N{}, for example.
4744+ *
4745+ * It's problematic that \a is treated as plain 'a' for
4746+ * purposes of the 'seen' array. Whatever is matched by these
4747+ * backslashed sequences should not be added to 'seen'. That
4748+ * includes the backslash. */
46564749 }
46574750 else /* \ followed by NUL strongly indicates character class */
46584751 weight += 100 ;
@@ -4698,7 +4791,25 @@ S_intuit_more(pTHX_ char *s, char *e)
46984791 && isALPHA (s [1 ]))
46994792 {
47004793 /* Here it's \W (that isn't [$@&] ) followed immediately by two
4701- * alphas in a row. Accumulate all the consecutive alphas */
4794+ * alphas in a row. Accumulate all the consecutive alphas.
4795+ *
4796+ * khw: The code below was changed in 2015 by
4797+ * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a
4798+ * buffer overrun. Prior to that commit, the code copied all
4799+ * the consecutive alphas to a temporary. The problem was
4800+ * that temporary's size could be exceeded, and the temporary
4801+ * wasn't even needed (at least by 2015). The called
4802+ * keyword() function doesn't need a copy. It takes a pointer
4803+ * to the first character and a length, hence it can operate
4804+ * on the original source text. It is intended to catch cases
4805+ * like $a[ord]. If it does match a keyword, we don't want
4806+ * the spelling of that keyword to affect the seen[] array.
4807+ * But if it isn't a keyword we do want to fall back to the
4808+ * normal behavior. And the 2015 commit removed that. It
4809+ * absorbs every bareword regardless, defeating the intent of
4810+ * the algorithm implementing the heuristics. That not many
4811+ * bugs have surfaced since indicates this whole thing doesn't
4812+ * get applied very much */
47024813 char * d = s ;
47034814 while (isALPHA (s [0 ]))
47044815 s ++ ;
@@ -4708,7 +4819,10 @@ S_intuit_more(pTHX_ char *s, char *e)
47084819 if (keyword (d , s - d , 0 ))
47094820 weight -= 150 ;
47104821
4711- /* khw: Should those alphas be marked as seen? */
4822+ /* khw: Barewords could also be subroutine calls, and these
4823+ * would also indicate a subscript. Like [green] where
4824+ * 'green' has been declared, for example, in 'use constant'
4825+ * */
47124826 }
47134827
47144828 /* Consecutive chars like [...12...] and [...ab...] are presumed
@@ -4723,20 +4837,28 @@ S_intuit_more(pTHX_ char *s, char *e)
47234837 /* But repeating a character inside a character class does nothing,
47244838 * like [aba], so less likely that someone makes such a class, more
47254839 * likely that it is a subscript; the more repeats, the less
4726- * likely. */
4840+ * likely.
4841+ *
4842+ * khw: I think this changes the weight too rapidly. Each time
4843+ * through the loop compounds the previous times. Instead, it
4844+ * would be better to have a separate loop after all the rest that
4845+ * changes the weight once based on how many times each character
4846+ * gets repeated */
47274847 weight -= seen [un_char ];
47284848 break ;
47294849 } /* End of switch */
47304850
47314851 /* khw: 'seen' is declared as a char. This ++ can cause it to wrap.
47324852 * This gives different results with compilers for which a plain 'char'
4733- * is actually unsigned, versus those where it is signed. I believe it
4734- * is undefined behavior to wrap a 'signed'. I think it should be
4735- * instead declared an unsigned int to make the chances of wrapping
4736- * essentially zero.
4853+ * is actually unsigned, versus those where it is signed. The C99
4854+ * standard allows a compiler to raise a signal when a 'signed' char
4855+ * is incremented outside its permissible range. I think 'seen'
4856+ * should be instead declared an unsigned, and a conditional added
4857+ * to prevent wrapping.
47374858 *
47384859 * And I believe that extra backslashes are different from other
4739- * repeated characters. */
4860+ * repeated characters. There may be others, like I have mentioned
4861+ * quotes and paired delimiters */
47404862 seen [un_char ]++ ;
47414863 } /* End of loop through each character of the construct */
47424864
0 commit comments