Skip to content

Commit ed2480d

Browse files
fix: Put line continuation strictly inside matches (#19)
* Improve line continuation mechanism
* Add more tests
* Fix lints and add generated parser sources
* I really need to fix the CI
1 parent 96c8ccd commit ed2480d

File tree

5 files changed

+8897
-8555
lines changed

5 files changed

+8897
-8555
lines changed

grammar.js

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,27 @@ const PREC = {
2424
CALL: 13,
2525
};
2626

27+
// NOTE: (reiniscirpons) We write the regexes below so that every top-level alternative starts by consuming a
28+
// character (e.g. [0-9][0-9]* rather than [0-9]+); lineContinuation relies on this to insert the line continuation regex only strictly inside a match
2729
const LITERAL_REGEXP = {
28-
IDENTIFIER: /([a-zA-Z_@0-9]|\\.)*([a-zA-Z_@]|\\.)[a-zA-Z_@0-9]*/,
29-
INTEGER: /[0-9]+/,
30-
ESCAPE_SEQUENCE: /\\([^0-7\r\n]|0x[0-9a-fA-F]{2,2}|[0-7]{3,3})/,
31-
LINE_CONTINUATION: /\\\r?\n/,
32-
NON_TRAILING_PERIOD_FLOAT: /[0-9]*\.[0-9]+/,
30+
// NOTE: (reiniscirpons) Identifiers must contain a non-digit character. Otherwise they can contain any upper or lowercase letter,
31+
// any digit, the characters _ and @ and any character that is escaped by a backslash.
32+
// The three alternatives, left to right: (1) starts with a non-digit, (2) starts with an escape, (3) starts with an initial segment of digits followed by a non-digit
33+
IDENTIFIER: /[a-zA-Z_@]([a-zA-Z_@0-9]|\\.)*|\\.([a-zA-Z_@0-9]|\\.)*|[0-9][0-9]*([a-zA-Z_@]|\\.)([a-zA-Z_@0-9]|\\.)*/,
34+
INTEGER: /[0-9][0-9]*/,
35+
ESCAPE_SEQUENCE: /\\([^0-7\r\n]|0x[0-9a-fA-F]{2}|[0-7]{3})/,
36+
NON_TRAILING_PERIOD_FLOAT: /[0-9][0-9]*\.[0-9]+|\.[0-9]+/,
3337
// TODO: (reiniscirpons) Perhaps break this up a bit?
34-
// v Basic float selector v Exponent v Conversion marker v Eager conversion marker
35-
// v Conversion marker
36-
EXPONENT_OR_CONVERSION_FLOAT:
37-
/([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)(([edqEDQ][\+-]?[0-9]+[a-zA-Z]?|[a-cf-pr-zA-CF-PR-Z])(_[a-zA-Z]?)?|_[a-zA-Z]?)/,
38+
// Parts of the regexes below, left to right: basic selector, exponent, conversion marker, eager conversion marker
39+
EXPONENT_OR_CONVERSION_FLOAT_LEADING_DIGIT:
40+
/[0-9][0-9]*\.[0-9]*(([edqEDQ][\+-]?[0-9]+[a-zA-Z]?|[a-cf-pr-zA-CF-PR-Z])(_[a-zA-Z]?)?|_[a-zA-Z]?)/,
41+
EXPONENT_OR_CONVERSION_FLOAT_LEADING_DOT:
42+
/\.[0-9]+(([edqEDQ][\+-]?[0-9]+[a-zA-Z]?|[a-cf-pr-zA-CF-PR-Z])(_[a-zA-Z]?)?|_[a-zA-Z]?)/,
43+
LINE_CONTINUATION: /\\\r?\n/,
3844
// Help topic or book must exclude the help operators and selectors, which
3945
// leads to the following rather complicated regex
4046
HELP_TOPIC_OR_BOOK:
41-
/[^-+&<>0-9:][^\r\n:]*|([-+&]|<<|>>)[^\r\n:]+|[0-9]+[^0-9\r\n:][^\r\n:]/,
47+
/[^-+&<>0-9:][^\r\n:]*|([-+&]|<<|>>)[^\r\n:]+|[0-9][0-9]*[^0-9\r\n:][^\r\n:]/,
4248
};
4349

4450
module.exports = grammar({
@@ -383,7 +389,11 @@ module.exports = grammar({
383389
),
384390
$._trailing_period_float,
385391
lineContinuation(
386-
LITERAL_REGEXP.EXPONENT_OR_CONVERSION_FLOAT,
392+
LITERAL_REGEXP.EXPONENT_OR_CONVERSION_FLOAT_LEADING_DIGIT,
393+
LITERAL_REGEXP.LINE_CONTINUATION,
394+
),
395+
lineContinuation(
396+
LITERAL_REGEXP.EXPONENT_OR_CONVERSION_FLOAT_LEADING_DOT,
387397
LITERAL_REGEXP.LINE_CONTINUATION,
388398
),
389399
),
@@ -660,17 +670,20 @@ function makeParameters(leftSep, rightSep, parameterRule, ellipsisRule) {
660670
* Creates a line continuation regex.
661671
*
662672
* This function implements a RegExp transformation for matching an
663-
* arbitrary number of line continuations within the base RegExp.
673+
* arbitrary number of line continuations _strictly_ within the base RegExp.
664674
*
665675
* Roughly speaking, if L is the regex matching the line continuation,
666676
* and T is this function, then
667-
* T(x) = (xL*) if x is a character class
677+
* T(x) = (L*x) if x is a character class
668678
* T((A)) = (T(A))
669679
* T(AB) = T(A)T(B)
670680
* T(A | B) = T(A) | T(B)
671681
* T(A*) = T(A)*
672682
* We perform this transformation in a linear pass by essentially detecting
673683
* occurrences of character classes and performing the transformation on them.
684+
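* Additional care is taken to ensure that L* is only inserted after a
685+
* character class has already been matched, i.e. never in front of the first character class of the regex or of a top-level alternative. This requires the input regex to have a special
686+
* form: each top-level alternative should begin with a single unquantified character class (e.g. [0-9][0-9]* rather than [0-9]+).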
674687
*
675688
* @param {RegExp} base_regex
676689
* @param {RegExp} line_continuation_regex
@@ -703,17 +716,31 @@ function lineContinuation(base_regex, line_continuation_regex) {
703716
let escaped = false;
704717
let square_bracket = false;
705718
let curly_brace = false;
719+
let start = true;
720+
let level = 0;
706721
for (const c of base_regex.source) {
707722
// TODO: (reiniscirpons) Refactor more
708723

709724
// BEFORE
710725
if (
711726
!curly_brace &&
712727
!square_bracket &&
713-
!escaped &&
714-
(c == '\\' || c == '[' || (c != '{' && !special_symbols.has(c)))
728+
!escaped
715729
) {
716-
result_regex_string = result_regex_string.concat('(');
730+
if (c == '\\' || c == '[' || (c != '{' && !special_symbols.has(c))) {
731+
result_regex_string = result_regex_string.concat('(');
732+
if (!start) {
733+
result_regex_string = result_regex_string.concat(
734+
line_continuation_regex_string,
735+
);
736+
}
737+
} else if (c == '(') {
738+
level += 1;
739+
} else if (c == '|' && level == 0) {
740+
start = true;
741+
} else if (c == ')') {
742+
level -= 1;
743+
}
717744
}
718745

719746
result_regex_string = result_regex_string.concat(c);
@@ -730,10 +757,8 @@ function lineContinuation(base_regex, line_continuation_regex) {
730757
c != '{' &&
731758
!special_symbols.has(c))
732759
) {
733-
result_regex_string = result_regex_string.concat(
734-
line_continuation_regex_string,
735-
);
736760
result_regex_string = result_regex_string.concat(')');
761+
start = false;
737762
}
738763

739764
// FLAGS

src/grammar.json

Lines changed: 11 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)