Skip to content

Commit 1783f50

Browse files
committed
fix: improve OnigRegExp.rewritePatternIfRequired
1 parent 46e25c4 commit 1783f50

File tree

1 file changed

+131
-21
lines changed
  • org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma

1 file changed

+131
-21
lines changed

org.eclipse.tm4e.core/src/main/java/org/eclipse/tm4e/core/internal/oniguruma/OnigRegExp.java

Lines changed: 131 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -93,37 +93,147 @@ private Regex parsePattern(final String pattern, final boolean ignoreCase) throw
9393
}
9494

9595
/**
96-
* Rewrites the given pattern to workaround limitations of the joni library which for example does not support
97-
* negative variable-length look-behinds
96+
* Rewrites the given pattern to work around limitations of the Joni library, which does not support variable-length lookbehinds.
9897
*
99-
* @see <a href="https://github.com/eclipse-tm4e/tm4e/issues/677">github.com/eclipse-tm4e/tm4e/issue/677</a>
98+
* Strategy:
99+
* <ul>
100+
* <li>Fixed-length lookbehind (positive or negative) is supported by Joni → leave unchanged.</li>
101+
* <li>Variable-length POSITIVE lookbehind at the pattern start → rewritten by <em>consuming</em> the context:<br/>
102+
* <code>(?&lt;=X)Y</code> ⇒ <code>(?:X)Y</code><br/>
103+
* Trade-off: the overall match (<code>group(0)</code>) shifts left and now includes X, but capture groups remain intact.</li>
104+
* <li>Variable-length NEGATIVE lookbehind → no safe generic rewrite (changing it would alter semantics).
105+
* These are left unchanged, except for a handful of known safe special cases handled explicitly.</li>
106+
* </ul>
107+
*
108+
* @see <a href="https://github.com/eclipse-tm4e/tm4e/issues/677">github.com/eclipse-tm4e/tm4e/issues/677</a>
100109
*/
101110
private String rewritePatternIfRequired(final String pattern) {
111+
if (pattern.isEmpty())
112+
return pattern;
102113

103-
// e.g. used in csharp.tmLanguage.json
104-
final var lookbehind1 = "(?<!\\.\\s*)";
105-
if (pattern.startsWith(lookbehind1)) {
106-
return "(?<!\\.)\\s*" + pattern.substring(lookbehind1.length());
107-
}
114+
// --- Positive lookbehinds --------------------------------------------------
115+
// Joni supports fixed-length positive lookbehind, but not variable-length.
116+
// If the pattern starts with (?<=...), inspect its body:
117+
// - Fixed-length → keep as-is.
118+
// - Variable-length → rewrite by consuming the context: (?<=X)Y ==> (?:X)Y
119+
// (group numbering is preserved; only group(0) widens to include X).
120+
if (pattern.startsWith("(?<=")) {
121+
final int close = findBalancedGroupEnd(pattern, 0); // index of ')' closing (?<=...)
122+
if (close > 0) {
123+
final String body = pattern.substring("(?<=".length(), close);
124+
125+
if (isFixedLength(body))
126+
return pattern; // supported as-is
108127

109-
// e.g. used in markdown.math.block.tmLanguage.json and tex.tmLanguage.json
110-
final var lookbehind2 = "(?<=^\\s*)";
111-
if (pattern.startsWith(lookbehind2)) {
112-
return "(?<=^)\\s*" + pattern.substring(lookbehind2.length());
128+
// Variable-length positive lookbehind: consume the prefix so the pattern runs on Joni.
129+
return "(?:" + body + ")" + pattern.substring(close + 1);
130+
}
131+
// Unbalanced (?<=... → leave unchanged
113132
}
114133

115-
// e.g. used in carbon.tmLanguage.json
116-
final var lookbehind3 = "(?<=\\s*\\.)";
117-
if (pattern.startsWith(lookbehind3)) {
118-
return "\\s*\\." + pattern.substring(lookbehind3.length());
134+
// --- Negative lookbehinds --------------------------------------------------
135+
// We intentionally DO NOT apply a generic rewrite for variable-length NEGATIVE lookbehind:
136+
// - Fixed-length negative LB is Joni-compatible → leave unchanged.
137+
// - Variable-length negative LB has no semantics-preserving generic rewrite.
138+
// If you encounter real-world cases, add targeted rewrites below.
139+
140+
// Used in csharp.tmLanguage.json: (?<!\.\s*) ==> (?<!\.)\s*
141+
// Rationale: move the variable portion (\s*) out of the lookbehind while keeping intent.
142+
final var negLB = "(?<!\\.\\s*)";
143+
if (pattern.startsWith(negLB))
144+
return "(?<!\\.)\\s*" + pattern.substring(negLB.length());
145+
146+
return pattern;
147+
}
148+
149+
/**
150+
* Returns the index of the ')' that closes the group whose '(' is at 'start'.
151+
* Assumes 'start' points at '(' of a construct like (?<=...) or (?<!...).
152+
* Returns -1 if unbalanced.
153+
*/
154+
private int findBalancedGroupEnd(final String str, final int start) {
155+
int depth = 0;
156+
boolean escaped = false;
157+
for (int idx = start; idx < str.length(); idx++) {
158+
final char c = str.charAt(idx);
159+
if (escaped) {
160+
escaped = false;
161+
continue;
162+
}
163+
if (c == '\\') {
164+
escaped = true;
165+
continue;
166+
}
167+
if (c == '(') {
168+
depth++;
169+
} else if (c == ')') {
170+
depth--;
171+
if (depth == 0)
172+
return idx;
173+
}
119174
}
175+
return -1;
176+
}
120177

121-
// e.g. used in julia.tmLanguage.json
122-
final var lookbehind4 = "(?<=\\S\\s+)";
123-
if (pattern.startsWith(lookbehind4)) {
124-
return "\\S\\s+" + pattern.substring(lookbehind4.length());
178+
/**
179+
* Fixed-length detector for a lookbehind body.
180+
* Returns false if it contains obvious variable-length features:
181+
* - unescaped *, +, ?, or alternation '|'
182+
* - {m,} (open upper bound) or {m,n} with m != n
183+
* If unsure, returns false.
184+
*/
185+
private boolean isFixedLength(final String body) {
186+
boolean escaped = false;
187+
for (int idx = 0; idx < body.length(); idx++) {
188+
final char ch = body.charAt(idx);
189+
if (escaped) {
190+
escaped = false;
191+
continue;
192+
}
193+
if (ch == '\\') {
194+
escaped = true;
195+
continue;
196+
}
197+
if (ch == '*' || ch == '+' || ch == '?' || ch == '|')
198+
return false;
199+
if (ch == '{') {
200+
int j = idx + 1;
201+
while (j < body.length() && Character.isDigit(body.charAt(j))) {
202+
j++;
203+
}
204+
if (j == idx + 1)
205+
return false; // not {m
206+
int m;
207+
try {
208+
m = Integer.parseInt(body.substring(idx + 1, j));
209+
} catch (final NumberFormatException e) {
210+
return false;
211+
}
212+
int n = m;
213+
if (j < body.length() && body.charAt(j) == ',') {
214+
j++;
215+
int k = j;
216+
while (k < body.length() && Character.isDigit(body.charAt(k))) {
217+
k++;
218+
}
219+
if (k == j)
220+
return false; // {m,}
221+
try {
222+
n = Integer.parseInt(body.substring(j, k));
223+
} catch (final NumberFormatException e) {
224+
return false;
225+
}
226+
if (m != n)
227+
return false; // {m,n} with m != n
228+
j = k;
229+
}
230+
final int close = body.indexOf('}', j);
231+
if (close < 0)
232+
return false; // malformed
233+
idx = close;
234+
}
125235
}
126-
return pattern;
236+
return true;
127237
}
128238

129239
/**

0 commit comments

Comments
 (0)