@@ -93,37 +93,147 @@ private Regex parsePattern(final String pattern, final boolean ignoreCase) throw
9393 }
9494
9595 /**
96- * Rewrites the given pattern to workaround limitations of the joni library which for example does not support
97- * negative variable-length look-behinds
96+ * Rewrites the given pattern to work around limitations of the Joni library, which does not support variable-length lookbehinds.
9897 *
99- * @see <a href="https://github.com/eclipse-tm4e/tm4e/issues/677">github.com/eclipse-tm4e/tm4e/issue/677</a>
98+ * Strategy:
99+ * <ul>
100+ * <li>Fixed-length lookbehind (positive or negative) is supported by Joni → leave unchanged.</li>
101+ * <li>Variable-length POSITIVE lookbehind at the pattern start → rewritten by <em>consuming</em> the context:<br/>
102+ * <code>(?<=X)Y</code> ⇒ <code>(?:X)Y</code><br/>
103+ * Trade-off: the overall match (<code>group(0)</code>) shifts left and now includes X, but capture groups remain intact.</li>
104+ * <li>Variable-length NEGATIVE lookbehind → no safe generic rewrite (changing it would alter semantics).
105+ * These are left unchanged, except for a handful of known safe special cases handled explicitly.</li>
106+ * </ul>
107+ *
108+ * @see <a href="https://github.com/eclipse-tm4e/tm4e/issues/677">github.com/eclipse-tm4e/tm4e/issues/677</a>
100109 */
101110 private String rewritePatternIfRequired (final String pattern ) {
111+ if (pattern .isEmpty ())
112+ return pattern ;
102113
103- // e.g. used in csharp.tmLanguage.json
104- final var lookbehind1 = "(?<!\\ .\\ s*)" ;
105- if (pattern .startsWith (lookbehind1 )) {
106- return "(?<!\\ .)\\ s*" + pattern .substring (lookbehind1 .length ());
107- }
114+ // --- Positive lookbehinds --------------------------------------------------
115+ // Joni supports fixed-length positive lookbehind, but not variable-length.
116+ // If the pattern starts with (?<=...), inspect its body:
117+ // - Fixed-length → keep as-is.
118+ // - Variable-length → rewrite by consuming the context: (?<=X)Y ==> (?:X)Y
119+ // (group numbering is preserved; only group(0) widens to include X).
120+ if (pattern .startsWith ("(?<=" )) {
121+ final int close = findBalancedGroupEnd (pattern , 0 ); // index of ')' closing (?<=...)
122+ if (close > 0 ) {
123+ final String body = pattern .substring ("(?<=" .length (), close );
124+
125+ if (isFixedLength (body ))
126+ return pattern ; // supported as-is
108127
109- // e.g. used in markdown.math.block.tmLanguage.json and tex.tmLanguage.json
110- final var lookbehind2 = "(?<=^ \\ s*)" ;
111- if ( pattern . startsWith ( lookbehind2 )) {
112- return " (?<=^) \\ s*" + pattern . substring ( lookbehind2 . length ());
128+ // Variable-length positive lookbehind: consume the prefix so the pattern runs on Joni.
129+ return "(?:" + body + ")" + pattern . substring ( close + 1 ) ;
130+ }
131+ // Unbalanced (?<=... → leave unchanged
113132 }
114133
115- // e.g. used in carbon.tmLanguage.json
116- final var lookbehind3 = "(?<=\\ s*\\ .)" ;
117- if (pattern .startsWith (lookbehind3 )) {
118- return "\\ s*\\ ." + pattern .substring (lookbehind3 .length ());
134+ // --- Negative lookbehinds --------------------------------------------------
135+ // We intentionally DO NOT apply a generic rewrite for variable-length NEGATIVE lookbehind:
136+ // - Fixed-length negative LB is Joni-compatible → leave unchanged.
137+ // - Variable-length negative LB has no semantics-preserving generic rewrite.
138+ // If you encounter real-world cases, add targeted rewrites below.
139+
140+ // Used in csharp.tmLanguage.json: (?<!\.\s*) ==> (?<!\.)\s*
141+ // Rationale: move the variable portion (\s*) out of the lookbehind while keeping intent.
142+ final var negLB = "(?<!\\ .\\ s*)" ;
143+ if (pattern .startsWith (negLB ))
144+ return "(?<!\\ .)\\ s*" + pattern .substring (negLB .length ());
145+
146+ return pattern ;
147+ }
148+
149+ /**
150+ * Returns the index of the ')' that closes the group whose '(' is at 'start'.
151+ * Assumes 'start' points at '(' of a construct like (?<=...) or (?<!...).
152+ * Returns -1 if unbalanced.
153+ */
154+ private int findBalancedGroupEnd (final String str , final int start ) {
155+ int depth = 0 ;
156+ boolean escaped = false ;
157+ for (int idx = start ; idx < str .length (); idx ++) {
158+ final char c = str .charAt (idx );
159+ if (escaped ) {
160+ escaped = false ;
161+ continue ;
162+ }
163+ if (c == '\\' ) {
164+ escaped = true ;
165+ continue ;
166+ }
167+ if (c == '(' ) {
168+ depth ++;
169+ } else if (c == ')' ) {
170+ depth --;
171+ if (depth == 0 )
172+ return idx ;
173+ }
119174 }
175+ return -1 ;
176+ }
120177
121- // e.g. used in julia.tmLanguage.json
122- final var lookbehind4 = "(?<=\\ S\\ s+)" ;
123- if (pattern .startsWith (lookbehind4 )) {
124- return "\\ S\\ s+" + pattern .substring (lookbehind4 .length ());
178+ /**
179+ * Fixed-length detector for a lookbehind body.
180+ * Returns false if it contains obvious variable-length features:
181+ * - unescaped *, +, ?, or alternation '|'
182+ * - {m,} (open upper bound) or {m,n} with m != n
183+ * If unsure, returns false.
184+ */
185+ private boolean isFixedLength (final String body ) {
186+ boolean escaped = false ;
187+ for (int idx = 0 ; idx < body .length (); idx ++) {
188+ final char ch = body .charAt (idx );
189+ if (escaped ) {
190+ escaped = false ;
191+ continue ;
192+ }
193+ if (ch == '\\' ) {
194+ escaped = true ;
195+ continue ;
196+ }
197+ if (ch == '*' || ch == '+' || ch == '?' || ch == '|' )
198+ return false ;
199+ if (ch == '{' ) {
200+ int j = idx + 1 ;
201+ while (j < body .length () && Character .isDigit (body .charAt (j ))) {
202+ j ++;
203+ }
204+ if (j == idx + 1 )
205+ return false ; // not {m
206+ int m ;
207+ try {
208+ m = Integer .parseInt (body .substring (idx + 1 , j ));
209+ } catch (final NumberFormatException e ) {
210+ return false ;
211+ }
212+ int n = m ;
213+ if (j < body .length () && body .charAt (j ) == ',' ) {
214+ j ++;
215+ int k = j ;
216+ while (k < body .length () && Character .isDigit (body .charAt (k ))) {
217+ k ++;
218+ }
219+ if (k == j )
220+ return false ; // {m,}
221+ try {
222+ n = Integer .parseInt (body .substring (j , k ));
223+ } catch (final NumberFormatException e ) {
224+ return false ;
225+ }
226+ if (m != n )
227+ return false ; // {m,n} with m != n
228+ j = k ;
229+ }
230+ final int close = body .indexOf ('}' , j );
231+ if (close < 0 )
232+ return false ; // malformed
233+ idx = close ;
234+ }
125235 }
126- return pattern ;
236+ return true ;
127237 }
128238
129239 /**
0 commit comments