@@ -4552,19 +4552,47 @@ S_intuit_more(pTHX_ char *s, char *e)
45524552 /* Here is '[': maybe we have a character class. Examine the guts */
45534553 s ++ ;
45544554
4555- /* '^' implies a character class; An empty '[]' isn't legal, but it does
4556- * mean there isn't more to come */
4555+ /* '^' implies a character class; An empty '[]' isn't legal, and it means
4556+ * there isn't more to come */
45574557 if (s [0 ] == ']' || s [0 ] == '^' )
45584558 return FALSE;
45594559
4560- /* Find matching ']'. khw: This means any s[1] below is guaranteed to
4561- * exist */
4560+ /* khw: If the context of this call is $foo[...], we may be able to avoid
4561+ * the heuristics below. The possibilities are:
4562+ * strict @foo $foo
4563+ * vars? exists exists
4564+ * y n n This is an error; return false now
4565+ * y n y must be a a charclass
4566+ * y y n must be a a subscript
4567+ * y y y ambiguous; do heuristics below
4568+ * n n n ambiguous; do heuristics below
4569+ * n n y ambiguous; do heuristics below, but I
4570+ * wonder if the initial bias should be a
4571+ * little towards charclass
4572+ * n y n ambiguous; do heuristics below, but I
4573+ * wonder if the initial bias should be a
4574+ * little towards subscript
4575+ * n y y ambiguous; do heuristics below
4576+ */
4577+
4578+ /* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4579+ * matches the '['. In order to account for the possibility of the ']'
4580+ * being inside the scope of \Q or preceded by an even number of
4581+ * backslashes, this should be rewritten */
45624582 const char * const send = (char * ) memchr (s , ']' , e - s );
45634583 if (! send ) /* has to be an expression */
45644584 return TRUE;
45654585
4586+ /* Below here, the heuristics start. One idea from alh is, given 'use
4587+ * 5.43.x', that for all digits, that if we have to resort to heuristics,
4588+ * we instead raise an error with an explanation of how to make it
4589+ * unambiguous: ${foo}[123] */
4590+
45664591 /* If the construct consists entirely of one or two digits, call it a
4567- * subscript. */
4592+ * subscript.
4593+ *
4594+ * khw: No one writes 03 to mean 3. Any string of digits beginning with
4595+ * '0' is likely to be a charclass, including length 2 ones. */
45684596 if (isDIGIT (s [0 ]) && send - s <= 2 && (send - s == 1 || (isDIGIT (s [1 ])))) {
45694597 return TRUE;
45704598 }
@@ -4599,6 +4627,13 @@ S_intuit_more(pTHX_ char *s, char *e)
45994627 Zero (seen , 256 , char );
46004628
46014629 /* Examine each character in the construct */
4630+ /* That this knows nothing of UTF-8 can lead to opposite results if the
4631+ * text is encoded in UTF-8 or not; another relic of the Unicode Bug.
4632+ * Suppose a string consistis of various un-repeated code points between
4633+ * 0x128 and 0x255. When encoded in UTF-8 their start bytes will all be
4634+ * \xC2 or \xC3. The heuristics below will count those as repeated bytes,
4635+ * and thus lean more towards this being a character class than when not
4636+ * in UTF-8. */
46024637 bool first_time = true;
46034638 for (; s < send ; s ++ , first_time = false) {
46044639 unsigned char prev_un_char = un_char ;
@@ -4614,21 +4649,61 @@ S_intuit_more(pTHX_ char *s, char *e)
46144649
46154650 /* Following one of these characters, we look to see if there is an
46164651 * identifier already found in the program by that name. If so,
4617- * strongly suspect this isn't a character class */
4652+ * strongly suspect this isn't a character class
4653+ *
4654+ * khw: This misses the possibility of lots of other syntaxes of
4655+ * variables, like $::foo or ${foo}, that scan_ident looks for */
46184656 if (isWORDCHAR_lazy_if_safe (s + 1 , PL_bufend , UTF )) {
46194657 Size_t len ;
4658+
4659+ /* khw: where did the magic number 4 come from?. This buffer
4660+ * was 4 times as large as tokenbuf in 1997, and had not
4661+ * changed since the code was first added */
46204662 char tmpbuf [ C_ARRAY_LENGTH (PL_tokenbuf ) * 4 ];
4663+
4664+ /* khw: scan_ident shouldn't be used as-is. It has side
4665+ * effects and can end up calling this function recursively.
4666+ *
4667+ * khw: If what follows can't be an identifier, say it is too
4668+ * long or is $001, then it must be a charclass */
46214669 scan_ident (s , tmpbuf , C_ARRAY_END (tmpbuf ), FALSE);
46224670 len = strlen (tmpbuf );
4671+
4672+ /* khw: This only looks at global variables; lexicals came
4673+ * later, and this hasn't been updated. Ouch!! */
46234674 if ( len > 1
46244675 && gv_fetchpvn_flags (tmpbuf ,
46254676 len ,
46264677 UTF ? SVf_UTF8 : 0 ,
46274678 SVt_PV ))
4679+ {
46284680 weight -= 100 ;
4629- else /* Not a multi-char identifier already known in the
4630- program; is somewhat likely to be a subscript */
4681+
4682+ /* khw: Below we keep track of repeated characters; People
4683+ * rarely say qr/[aba]/, as the second a is pointless.
4684+ * (Some do it though as a mnemonic that is meaningful to
4685+ * them.) But generally, repeated characters makes things
4686+ * more likely to be a charclass. But we have here that
4687+ * this an identifier so likely a subscript. Its spelling
4688+ * should be irrelevant to the repeated characters test.
4689+ * So, we should advance past it. Suppose it is a hash
4690+ * element, like $subscripts{$which}. We should advance
4691+ * past the braces and key */
4692+ }
4693+ else {
4694+ /* Not a multi-char identifier already known in the
4695+ * program; is somewhat likely to be a subscript.
4696+ *
4697+ * khw: Our test suite contains several constructs like
4698+ * [$A-Z]. Excluding length 1 identifiers in the
4699+ * conditional above means such are much less likely to be
4700+ * mistaken for subscripts. I would argue that if the next
4701+ * character is a '-' followed by an alpha, that would make
4702+ * it much more likely to be a charclass. It would only
4703+ * make sense to be an expression if that alpha string is a
4704+ * bareword with meaning; something like [$A-ord] */
46314705 weight -= 10 ;
4706+ }
46324707 }
46334708 else if ( s [0 ] == '$'
46344709 && s [1 ]
@@ -4645,13 +4720,51 @@ S_intuit_more(pTHX_ char *s, char *e)
46454720 }
46464721 break ;
46474722
4723+ /* khw: [:blank:] strongly indicates a charclass */
4724+ /* khw: Z-A definitely subscript
4725+ * Z-Z likely subscript
4726+ * "x - z" with blanks very likely subscript
4727+ * \N without { must be subscript
4728+ * \R must be subscript
4729+ * \? must be subscript for things like \d, but not \a.
4730+ */
4731+
4732+
46484733 case '\\' :
46494734 if (s [1 ]) {
4650- if (memCHRs ("wds]" , s [1 ]))
4735+ if (memCHRs ("wds]" , s [1 ])) {
46514736 weight += 100 ; /* \w \d \s => strongly charclass */
4652- /* khw: Why not \W \D \S \h \v, etc as well? */
4653- else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ])
4654- weight += 1 ; /* \' => mildly charclass */
4737+ /* khw: \] can't happen, as any ']' is beyond our search.
4738+ * Why not \W \D \S \h \v, etc as well? Should they have
4739+ * the same weights as \w \d \s or should all or some be
4740+ * in the 'abcfnrtvx' below? */
4741+ } else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ]) {
4742+ weight += 1 ;
4743+ /* khw: This is problematic. Enough so, that I misread
4744+ * it, and added a wrong comment about what it does in
4745+ * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
4746+ * doesn't look at the current character. What it
4747+ * actually does is: if any quote has been seen in the
4748+ * parse, don't do the rest of the else's below, but for
4749+ * every subsequent backslashed character encountered
4750+ * (except \0 \w \s \d), increment the weight to lean a
4751+ * bit more towards being a charclass. That means that
4752+ * every backslash sequence following the first occurrence
4753+ * of a quote increments the weight regardless of what the
4754+ * sequence is. Again, \0 \w \d and \s are not controlled
4755+ * by this else, so they change the weight by a lot more.
4756+ * But what makes them so special that they aren't subject
4757+ * to this. Any why does having a quote change the
4758+ * behavior from then on. And why only backslashed
4759+ * sequences get this treatment? This code has been
4760+ * unchanged since this function was added in 1993. I
4761+ * don't get it. Instead, it does seem to me that it is
4762+ * especially unlikely to repeat a quote in a charclass,
4763+ * but that having just a single quote is indicative of a
4764+ * charclass, and having pairs of quotes is indicative of
4765+ * a subscript. Similarly for things that could indicate
4766+ * nesting of braces or parens. */
4767+ }
46554768 else if (memCHRs ("abcfnrtvx" , s [1 ]))
46564769 weight += 40 ; /* \n, etc => charclass */
46574770 /* khw: Why not \e etc as well? */
@@ -4660,6 +4773,19 @@ S_intuit_more(pTHX_ char *s, char *e)
46604773 while (s [1 ] && isDIGIT (s [1 ]))
46614774 s ++ ;
46624775 }
4776+
4777+ /* khw: There are lots more possible escape sequences. Some,
4778+ * like \A,\z have no special meaning to charclasses, so might
4779+ * indicate a subscript, but I don't know what they would be
4780+ * doing there either. Some have been added to the language
4781+ * after this code was written, but no one thought to, or
4782+ * could wade through this function, to add them. Things like
4783+ * \p{} for properties, \N and \N{}, for example.
4784+ *
4785+ * It's problematic that \a is treated as plain 'a' for
4786+ * purposes of the 'seen' array. Whatever is matched by these
4787+ * backslashed sequences should not be added to 'seen'. That
4788+ * includes the backslash. */
46634789 }
46644790 else /* \ followed by NUL strongly indicates character class */
46654791 weight += 100 ;
@@ -4705,7 +4831,25 @@ S_intuit_more(pTHX_ char *s, char *e)
47054831 && isALPHA (s [1 ]))
47064832 {
47074833 /* Here it's \W (that isn't [$@&] ) followed immediately by two
4708- * alphas in a row. Accumulate all the consecutive alphas */
4834+ * alphas in a row. Accumulate all the consecutive alphas.
4835+ *
4836+ * khw: The code below was changed in 2015 by
4837+ * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a
4838+ * buffer overrun. Prior to that commit, the code copied all
4839+ * the consecutive alphas to a temporary. The problem was
4840+ * that temporary's size could be exceeded, and the temporary
4841+ * wasn't even needed (at least by 2015). The called
4842+ * keyword() function doesn't need a copy. It takes a pointer
4843+ * to the first character and a length, hence it can operate
4844+ * on the original source text. It is intended to catch cases
4845+ * like $a[ord]. If it does match a keyword, we don't want
4846+ * the spelling of that keyword to affect the seen[] array.
4847+ * But if it isn't a keyword we do want to fall back to the
4848+ * normal behavior. And the 2015 commit removed that. It
4849+ * absorbs every bareword regardless, defeating the intent of
4850+ * the algorithm implementing the heuristics. That not many
4851+ * bugs have surfaced since indicates this whole thing doesn't
4852+ * get applied very much */
47094853 char * d = s ;
47104854 while (isALPHA (s [0 ]))
47114855 s ++ ;
@@ -4715,7 +4859,12 @@ S_intuit_more(pTHX_ char *s, char *e)
47154859 if (keyword (d , s - d , 0 ))
47164860 weight -= 150 ;
47174861
4718- /* khw: Should those alphas be marked as seen? */
4862+ /* khw: Barewords could also be subroutine calls, and these
4863+ * would also indicate a subscript. Like [green] where
4864+ * 'green' has been declared, for example, in 'use constant'
4865+ * Or maybe it should just call intuit_method() which checks
4866+ * for keyword, subs, and methods.
4867+ * */
47194868 }
47204869
47214870 /* Consecutive chars like [...12...] and [...ab...] are presumed
@@ -4730,23 +4879,36 @@ S_intuit_more(pTHX_ char *s, char *e)
47304879 /* But repeating a character inside a character class does nothing,
47314880 * like [aba], so less likely that someone makes such a class, more
47324881 * likely that it is a subscript; the more repeats, the less
4733- * likely. */
4882+ * likely.
4883+ *
4884+ * khw: I think this changes the weight too rapidly. Each time
4885+ * through the loop compounds the previous times. Instead, it
4886+ * would be better to have a separate loop after all the rest that
4887+ * changes the weight once based on how many times each character
4888+ * gets repeated */
47344889 weight -= seen [un_char ];
47354890 break ;
47364891 } /* End of switch */
47374892
47384893 /* khw: 'seen' is declared as a char. This ++ can cause it to wrap.
47394894 * This gives different results with compilers for which a plain 'char'
4740- * is actually unsigned, versus those where it is signed. I believe it
4741- * is undefined behavior to wrap a 'signed'. I think it should be
4742- * instead declared an unsigned int to make the chances of wrapping
4743- * essentially zero.
4895+ * is actually unsigned, versus those where it is signed. The C99
4896+ * standard allows a compiler to raise a signal when a 'signed' char
4897+ * is incremented outside its permissible range. I think 'seen'
4898+ * should be instead declared an unsigned, and a conditional added
4899+ * to prevent wrapping.
47444900 *
47454901 * And I believe that extra backslashes are different from other
4746- * repeated characters. */
4902+ * repeated characters. There may be others, like I have mentioned
4903+ * quotes and paired delimiters */
47474904 seen [un_char ]++ ;
47484905 } /* End of loop through each character of the construct */
47494906
4907+ /* khw: People on #irc have suggested things that I think boil down to:
4908+ * under 'use 5.43.x', output a warning like existing warnings for
4909+ * similar situations "Ambiguous use of [], resolved as ..." Perhaps
4910+ * suppress the message if all (or maybe almost all) the evidence points
4911+ * to the same outcome. This would involve two weight variables */
47504912 if (weight >= 0 ) /* probably a character class */
47514913 return FALSE;
47524914
0 commit comments