@@ -55,7 +55,7 @@ RscGrammar preprocess(RscGrammar rsc) {
     // Replace occurrences of singleton ranges with just the corresponding
     // literal. This makes it easier to identify delimiters.
     return visit (rsc) {
-        case s: \char-class([range(char, char)]) => d
+        case \char-class([range(char, char)]) => d
             when d := \lit("<stringChar(char)>"), isDelimiter(d)
     }
 }
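
For example, the rewrite turns a singleton character class into the corresponding literal (illustrative input/output, assuming `;` passes `isDelimiter`):

```
\char-class([range(59, 59)])  =>  \lit(";")   // stringChar(59) == ";"
```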
@@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
 
     // Analyze dependencies among productions
     println("[LOG] Analyzing dependencies among productions");
-    Dependencies dependencies = deps(toGraph(rsc));
-    list[Production] prods = dependencies
-        .removeProds(isCyclic, true) // `true` means "also remove ancestors"
-        .retainProds(isNonEmpty)
-        .retainProds(hasCategory)
-        .getProds();
+    Graph[Production] graph = toGraph(rsc);
+    list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
+    list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds();
+    list[Production] prodsRecursive = prods - prodsNonRecursive;
 
     // Analyze delimiters
     println("[LOG] Analyzing delimiters");
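
In the new version, `prods` holds all non-empty, categorized productions; intersecting with the cycle-free part of the dependency graph then splits it in two (a summary sketch of the set algebra above, not code from the commit):

```
prodsNonRecursive == prods & (productions that survive removing cycles and their ancestors)
prodsRecursive    == prods - prodsNonRecursive
```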
@@ -134,13 +132,15 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
     list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})];
 
     // Return
-    bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
-    list[ConversionUnit] units
-        = [unit(rsc, p, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)];
-
-    return sort(units);
+    bool isRecursive(Production p)
+        = p in prodsRecursive;
+    bool isEmptyProd(prod(_, [\alt(alternatives)], _))
+        = alternatives == {};
+
+    set[ConversionUnit] units = {};
+    units += {unit(rsc, p, isRecursive(p), hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods};
+    units += {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters + prodsKeywords, !isEmptyProd(p)};
+    return sort([*removeStrictPrefixes(units)]);
 }
 
 @synopsis{
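
Each conversion unit now carries a recursion flag as its third argument; the constructor call has, roughly, this shape (the annotations are illustrative, not from the commit):

```
unit(rsc, p,
     isRecursive(p),                                      // new: recursion flag
     hasNewline(rsc, p),                                  // multi-line flag
     getOuterDelimiterPair(rsc, p),                       // <begin, end> Maybes
     getInnerDelimiterPair(rsc, p, getOnlyFirst = true))  // <begin, end> Maybes
```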
@@ -196,7 +196,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
 
         // Convert all units in the group to match patterns (including,
         // optimistically, multi-line units as-if they are single-line)
-        for (u <- group) {
+        for (u <- group, !u.recursive) {
             TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
                 [name = "/inner/single/<u.name>"];
 
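
Recursive units are now skipped here, since no single regular expression can match a recursive production. Each remaining unit becomes a plain TextMate match rule, roughly of this shape (the name `Foo` is hypothetical):

```
{ "name": "/inner/single/Foo", "match": "...", "captures": { ... } }
```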
@@ -216,32 +216,116 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
         // Simple case: each unit does have an `end` inner delimiter
         if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-            // Compute a list of segments that need to be consumed between
+            // Compute a set of segments that need to be consumed between
             // the `begin` delimiter and the `end` delimiters. Each of these
             // segments will be converted to a match pattern.
             set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
             segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
 
-            list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
-            terminals = [s | s <- terminals, [] != s.symbols];
-            terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
-            terminals = dup(terminals);
-            terminals = sortByMinimumLength(terminals); // Small symbols first
-            terminals = reverse(terminals); // Large symbols first
-            terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
-
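+            // Illustrative only: the generated rule has, roughly, the shape
+            //
+            //   { "name": "/inner/multi/...",
+            //     "begin": "...", "end": "...",
+            //     "patterns": [{ "match": "..." }, ...] }
+            //
+            // where the actual names and regular expressions depend on the
+            // grammar at hand.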
             TmRule r = toTmRule(
                 toRegExp(rsc, [begin], {t}),
                 toRegExp(rsc, [\alt(ends)], {t}),
-                [toTmRule(toRegExp(rsc, [s], {t})) | s <- terminals])
+                [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
                 [name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
 
             rules = insertIn(rules, (u: r | u <- group));
         }
 
-        // Complex case: some unit doesn't have an `end` inner delimiter
+        // Complex case: some unit doesn't have an `end` inner delimiter.
+        // This requires (substantial) extra care, as there is no obvious
+        // marker to close the begin/end pattern with.
         else {
-            ; // TODO (part of future support for *recursive* multi-line units)
+            Decomposition decomposition = decompose([*group]);
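+            // Assumption (inferred from its use below): `decompose` splits
+            // the group's productions into a common prefix (converted to the
+            // begin pattern) and a list of suffixes (each converted to a
+            // nested pattern).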
+
+            // TODO: The following condition can be true (even though there
+            // has to be a `begin` delimiter) because `decompose` doesn't
+            // expand non-terminals. Consider if it should, to maybe improve
+            // accuracy.
+            if ([] == decomposition.prefix) {
+                continue;
+            }
+
+            RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
+            RegExp reEnd = regExp("(?=.)", []);
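+            // The end pattern is a zero-width lookahead that can match before
+            // any character; together with `applyEndPatternLast` (set below),
+            // the region is closed as soon as no nested pattern matches.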
+
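+            // Note: in Rascal, `for` is an expression; each `append` adds an
+            // element to the resulting list, so `patterns` collects one
+            // nested pattern per suffix.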
+            patterns = for (suffix <- decomposition.suffixes) {
+                if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
+                    if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
+                        // If the suffix has both a `begin` delimiter and an
+                        // `end` delimiter, then generate a begin/end pattern
+                        // to highlight these delimiters and all content in
+                        // between.
+
+                        set[Segment] segs = getSegments(rsc, suffix);
+                        segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};
+
+                        append toTmRule(
+                            toRegExp(rsc, [begin], {t}),
+                            toRegExp(rsc, [end], {t}),
+                            [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
+                    }
+
+                    else {
+                        // If the suffix has a `begin` delimiter, but not an
+                        // `end` delimiter, then generate a match pattern just
+                        // to highlight that `begin` delimiter. Ignore the
+                        // remainder of the suffix (it's recursive, so no
+                        // regular expression can be generated for it).
+                        append toTmRule(toRegExp(rsc, [begin], {t}));
+                    }
+                }
+
+                else {
+                    // If the suffix doesn't have a `begin` delimiter, then
+                    // ignore it (it's recursive, so no regular expression can
+                    // be generated for it).
+                    ;
+                }
+            }
+
+            TmRule r = toTmRule(reBegin, reEnd, patterns);
+            r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
+            r = r[applyEndPatternLast = true];
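+            // `applyEndPatternLast` makes the tokenizer try the nested
+            // patterns before the end pattern, so the always-matching end
+            // pattern doesn't close the region prematurely.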
+
+            rules = insertIn(rules, (u: r | u <- group));
+
+            // TODO: The current approach produces "partially"
+            // newline-sensitive rules, in the sense that newlines are
+            // accepted between the prefix and the suffixes, but not between
+            // symbols in the prefix. This approach could be improved to
+            // produce "totally" newline-sensitive rules (at the cost of much
+            // more complicated rule generation and generated rules) by
+            // adopting an approach in which the rules for each symbol in the
+            // prefix look something like the following three:
+            //
+            // ```
+            // "foo": {
+            //   "name": "foo",
+            //   "begin": "(\\@)",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
+            //   "contentName": "comment",
+            //   "beginCaptures": { "1": { "name": "comment" } }
+            // },
+            // "foo.$": {
+            //   "begin": "$",
+            //   "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.$",
+            //   "patterns": [{ "include": "#foo.^" }]
+            // },
+            // "foo.^": {
+            //   "begin": "^",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.^",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
+            // }
+            // ```
+            //
+            // Note: This alternative approach would likely render the present
+            // distinction between the "simple case" and the "complex case"
+            // unneeded, so in that sense, rule generation would actually
+            // become simpler.
         }
     }
 }
@@ -302,10 +386,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
     if (seg.final, _ <- symbols, symbols[-1] in ends) {
         symbols = symbols[..-1];
     }
-
     return seg[symbols = symbols];
 }
 
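+// Convert a set of segments to a list of terminal symbols, to be converted to
+// match patterns. Ordered from large to small so that longer terminals take
+// precedence, with an any-char class appended as a catch-all fallback.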
+private list[Symbol] toTerminals(set[Segment] segs) {
+    list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
+    terminals = [s | s <- terminals, [] != s.symbols];
+    terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
+    terminals = dup(terminals);
+    terminals = sortByMinimumLength(terminals); // Small symbols first
+    terminals = reverse(terminals); // Large symbols first
+    terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
+    return terminals;
+}
+
 // TODO: This function could be moved to a separate, generic module
 private list[&T] dupLast(list[&T] l)
     = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?