@@ -24,21 +24,27 @@ const PREC = {
2424 CALL : 13 ,
2525} ;
2626
27+ // NOTE: (reiniscirpons) The regexes below are written so that they consume a character as early as possible
28+ // (e.g. [0-9][0-9]* rather than [0-9]+); this lets lineContinuation insert the line continuation regex only after a first character has been matched.
2729const LITERAL_REGEXP = {
28- IDENTIFIER : / ( [ a - z A - Z _ @ 0 - 9 ] | \\ .) * ( [ a - z A - Z _ @ ] | \\ .) [ a - z A - Z _ @ 0 - 9 ] * / ,
29- INTEGER : / [ 0 - 9 ] + / ,
30- ESCAPE_SEQUENCE : / \\ ( [ ^ 0 - 7 \r \n ] | 0 x [ 0 - 9 a - f A - F ] { 2 , 2 } | [ 0 - 7 ] { 3 , 3 } ) / ,
31- LINE_CONTINUATION : / \\ \r ? \n / ,
32- NON_TRAILING_PERIOD_FLOAT : / [ 0 - 9 ] * \. [ 0 - 9 ] + / ,
30+ // NOTE: (reiniscirpons) Identifiers must contain at least one non-digit character. Apart from that, they may consist of any
31+ // upper- or lowercase letters, digits, the characters _ and @, and any character escaped by a backslash.
32+ // The three alternatives below are, in order: (1) starts with a non-digit, (2) starts with an escape, (3) starts with an initial segment of digits followed by a non-digit.
33+ IDENTIFIER : / [ a - z A - Z _ @ ] ( [ a - z A - Z _ @ 0 - 9 ] | \\ .) * | \\ .( [ a - z A - Z _ @ 0 - 9 ] | \\ .) * | [ 0 - 9 ] [ 0 - 9 ] * ( [ a - z A - Z _ @ ] | \\ .) ( [ a - z A - Z _ @ 0 - 9 ] | \\ .) * / ,
34+ INTEGER : / [ 0 - 9 ] [ 0 - 9 ] * / ,
35+ ESCAPE_SEQUENCE : / \\ ( [ ^ 0 - 7 \r \n ] | 0 x [ 0 - 9 a - f A - F ] { 2 } | [ 0 - 7 ] { 3 } ) / ,
36+ NON_TRAILING_PERIOD_FLOAT : / [ 0 - 9 ] [ 0 - 9 ] * \. [ 0 - 9 ] + | \. [ 0 - 9 ] + / ,
3337 // TODO: (reiniscirpons) Perhaps break this up a bit?
34- // v Basic float selector v Exponent v Conversion marker v Eager conversion marker
35- // v Conversion marker
36- EXPONENT_OR_CONVERSION_FLOAT :
37- / ( [ 0 - 9 ] + \. [ 0 - 9 ] * | [ 0 - 9 ] * \. [ 0 - 9 ] + ) ( ( [ e d q E D Q ] [ \+ - ] ? [ 0 - 9 ] + [ a - z A - Z ] ? | [ a - c f - p r - z A - C F - P R - Z ] ) ( _ [ a - z A - Z ] ? ) ? | _ [ a - z A - Z ] ? ) / ,
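38+ // Parts, left to right: basic selector, then either an exponent (with optional conversion marker) or a bare conversion marker, optionally followed by an eager conversion marker; or an eager conversion marker alone.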
39+ EXPONENT_OR_CONVERSION_FLOAT_LEADING_DIGIT :
40+ / [ 0 - 9 ] [ 0 - 9 ] * \. [ 0 - 9 ] * ( ( [ e d q E D Q ] [ \+ - ] ? [ 0 - 9 ] + [ a - z A - Z ] ? | [ a - c f - p r - z A - C F - P R - Z ] ) ( _ [ a - z A - Z ] ? ) ? | _ [ a - z A - Z ] ? ) / ,
41+ EXPONENT_OR_CONVERSION_FLOAT_LEADING_DOT :
42+ / \. [ 0 - 9 ] + ( ( [ e d q E D Q ] [ \+ - ] ? [ 0 - 9 ] + [ a - z A - Z ] ? | [ a - c f - p r - z A - C F - P R - Z ] ) ( _ [ a - z A - Z ] ? ) ? | _ [ a - z A - Z ] ? ) / ,
43+ LINE_CONTINUATION : / \\ \r ? \n / ,
3844 // Help topic or book must exclude the help operators and selectors, which
3945 // leads to the following rather complicated regex
4046 HELP_TOPIC_OR_BOOK :
41- / [ ^ - + & < > 0 - 9 : ] [ ^ \r \n : ] * | ( [ - + & ] | < < | > > ) [ ^ \r \n : ] + | [ 0 - 9 ] + [ ^ 0 - 9 \r \n : ] [ ^ \r \n : ] / ,
47+ / [ ^ - + & < > 0 - 9 : ] [ ^ \r \n : ] * | ( [ - + & ] | < < | > > ) [ ^ \r \n : ] + | [ 0 - 9 ] [ 0 - 9 ] * [ ^ 0 - 9 \r \n : ] [ ^ \r \n : ] / ,
4248} ;
4349
4450module . exports = grammar ( {
@@ -383,7 +389,11 @@ module.exports = grammar({
383389 ) ,
384390 $ . _trailing_period_float ,
385391 lineContinuation (
386- LITERAL_REGEXP . EXPONENT_OR_CONVERSION_FLOAT ,
392+ LITERAL_REGEXP . EXPONENT_OR_CONVERSION_FLOAT_LEADING_DIGIT ,
393+ LITERAL_REGEXP . LINE_CONTINUATION ,
394+ ) ,
395+ lineContinuation (
396+ LITERAL_REGEXP . EXPONENT_OR_CONVERSION_FLOAT_LEADING_DOT ,
387397 LITERAL_REGEXP . LINE_CONTINUATION ,
388398 ) ,
389399 ) ,
@@ -660,17 +670,20 @@ function makeParameters(leftSep, rightSep, parameterRule, ellipsisRule) {
660670 * Creates a line continuation regex.
661671 *
662672 * This function implements a RegExp transformation for matching an
663- * arbitrary number of line continuations within the base RegExp.
673+ * arbitrary number of line continuations _strictly_ within the base RegExp.
664674 *
665675 * Roughly speaking, if L is the regex matching the line continuation,
666676 * and T is this function, then
667- * T(x) = (xL* ) if x is a character class
677+ * T(x) = (L*x ) if x is a character class
668678 * T((A)) = (T(A))
669679 * T(AB) = T(A)T(B)
670680 * T(A | B) = T(A) | T(B)
671681 * T(A*) = T(A)*
672682 * We perform this transformation in a linear pass by essentially detecting
673683 * occurrences of character classes and performing the transformation on them.
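684+ * To ensure that L* is only matched after a first character has been matched,
685+ * L* is omitted before the first character class of the regex and of every top-level
686+ * alternative. The input regex must therefore start each top-level alternative with a single unquantified character class (e.g. [0-9][0-9]* instead of [0-9]+).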
674687 *
675688 * @param {RegExp } base_regex
676689 * @param {RegExp } line_continuation_regex
@@ -703,17 +716,31 @@ function lineContinuation(base_regex, line_continuation_regex) {
703716 let escaped = false ;
704717 let square_bracket = false ;
705718 let curly_brace = false ;
719+ let start = true ;
720+ let level = 0 ;
706721 for ( const c of base_regex . source ) {
707722 // TODO: (reiniscirpons) Refactor more
708723
709724 // BEFORE
710725 if (
711726 ! curly_brace &&
712727 ! square_bracket &&
713- ! escaped &&
714- ( c == '\\' || c == '[' || ( c != '{' && ! special_symbols . has ( c ) ) )
728+ ! escaped
715729 ) {
716- result_regex_string = result_regex_string . concat ( '(' ) ;
730+ if ( c == '\\' || c == '[' || ( c != '{' && ! special_symbols . has ( c ) ) ) {
731+ result_regex_string = result_regex_string . concat ( '(' ) ;
732+ if ( ! start ) {
733+ result_regex_string = result_regex_string . concat (
734+ line_continuation_regex_string ,
735+ ) ;
736+ }
737+ } else if ( c == '(' ) {
738+ level += 1 ;
739+ } else if ( c == '|' && level == 0 ) {
740+ start = true ;
741+ } else if ( c == ')' ) {
742+ level -= 1 ;
743+ }
717744 }
718745
719746 result_regex_string = result_regex_string . concat ( c ) ;
@@ -730,10 +757,8 @@ function lineContinuation(base_regex, line_continuation_regex) {
730757 c != '{' &&
731758 ! special_symbols . has ( c ) )
732759 ) {
733- result_regex_string = result_regex_string . concat (
734- line_continuation_regex_string ,
735- ) ;
736760 result_regex_string = result_regex_string . concat ( ')' ) ;
761+ start = false ;
737762 }
738763
739764 // FLAGS
0 commit comments