Skip to content

Commit 25437b3

Browse files
committed
TRegex: further restrict quantifier unrolling to avoid NFA transition
explosions.
1 parent 782b28c commit 25437b3

File tree

11 files changed

+105
-32
lines changed

11 files changed

+105
-32
lines changed

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,11 @@ public void generatedTests() {
688688
/* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
689689
}
690690

691+
@Test
692+
public void quantifierUnrollNFAExplosion() {
693+
test("(?:\\3((?:[^]|.{0}|\\B|\ub9b5\\b|$){4,}){1,4})", "yim", "\ua3bb\n\n\u00a1\n\na\u2f77\n\n\ua3bb\n\n\u00a1\n\na\u2f77\n\n", 0, false);
694+
}
695+
691696
@Test
692697
public void overlappingBq() {
693698
testBoolean("(?=a{2,4})[ab]{4,68}c", "", NEVER_UNROLL_OPT, "aabbbbbbbbbbbbbbbbbbbbbbc", 0, true);

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/OracleDBTests.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,4 +1700,10 @@ public void orcl38190286() {
17001700
test("[[:alpha:]]", "", Collections.emptyMap(), Encodings.UTF_16, "\uDDF2", 0, false);
17011701
test("[[:alpha:]]", "", Collections.emptyMap(), Encodings.UTF_16, "\uD839\uDDF2", 0, false);
17021702
}
1703+
1704+
@Test
1705+
public void bqTransitionExplosion() {
1706+
test("(a(b(b(b(b(b(b(b(b(b(b(b(b(b(b(b(b(b(b(b|)|)|)|)|)|)|)|)|)|)|)|)|)|)|)|)|)|)|){2,2}c)de", "", Map.of("regexDummyLang.QuantifierUnrollLimitGroup", "1"),
1707+
"abbbbbbbcdebbbbbbbf", 0, true, 0, 11, 0, 9, 8, 8, 2, 8, 3, 8, 4, 8, 5, 8, 6, 8, 7, 8, 8, 8);
1708+
}
17031709
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexOptions.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,12 @@ public class TRegexOptions {
183183
*/
184184
public static final int TRegexQuantifierUnrollLimitGroup = 6;
185185

186+
/**
187+
* Quantified groups whose node count is greater that this threshold will not be considered for
188+
* quantifier unrolling.
189+
*/
190+
public static final int TRegexQuantifierUnrollLimitGroupNodeCount = 100;
191+
186192
/**
187193
* Bailout threshold for number of capture groups.
188194
*/

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/automaton/StateTransitionCanonicalizer.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242

4343
import java.util.Arrays;
4444
import java.util.Iterator;
45+
import java.util.Objects;
46+
47+
import org.graalvm.collections.EconomicSet;
4548

4649
import com.oracle.truffle.regex.charset.CodePointSet;
4750
import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer;
@@ -69,6 +72,7 @@ public abstract class StateTransitionCanonicalizer<SI extends StateIndex<? super
6972
private boolean anyArgConstraints;
7073
private final IntArrayBuffer stack = new IntArrayBuffer();
7174
private final IntArrayBuffer skipStack = new IntArrayBuffer();
75+
private final EconomicSet<ConstraintDeduplicationKey> constraintDeduplicationSet = EconomicSet.create();
7276

7377
private static final int INITIAL_CAPACITY = 8;
7478

@@ -287,6 +291,7 @@ private void calcDisjointTransitions(CompilationBuffer compilationBuffer) {
287291
for (int i : intersectingArgs[iStage1]) {
288292
addToStack(argTransitions.get(i), matcher, argConstraints.get(i), argOperations.get(i), resultLength);
289293
}
294+
constraintDeduplicationSet.clear();
290295
outer: while (!stack.isEmpty()) {
291296
int i = stack.pop();
292297
T argTransition = argTransitions.get(i);
@@ -301,30 +306,36 @@ private void calcDisjointTransitions(CompilationBuffer compilationBuffer) {
301306
if (constraintMerge == null) {
302307
continue;
303308
}
304-
assert TransitionConstraint.isNormalized(constraintMerge.middle());
305309

306310
// if the slot already contains an unconditional transition to the final state,
307311
// we don't have to check any other constraints.
308312
if (!(shouldPruneAfterFinalState() && leadsToFinalState.get(j))) {
309313
if (constraintMerge.lhs().length == 0) {
310314
addTransitionTo(j, argTransition, argOperation);
311315
} else {
316+
assert TransitionConstraint.isNormalized(constraintMerge.lhs()[0]);
312317
constraintBuilder[j] = constraintMerge.lhs()[0];
313318
for (int k = 1; k < constraintMerge.lhs().length; ++k) {
314319
duplicateSlot(j, matcher, constraintMerge.lhs()[k]);
315320
resultLength++;
316321
}
317-
318322
duplicateSlot(j, matcher, constraintMerge.middle());
319323
addTransitionTo(resultLength, argTransition, argOperation);
320324
resultLength++;
321325
}
322326
}
323327
for (long[] rhs : constraintMerge.rhs()) {
324-
addToStack(argTransition, matcher, rhs, argOperation, j + 1);
328+
// TransitionConstraint.intersectAndSubtract may yield the same constraint
329+
// formula for the same transition multiple times when intersecting a large
330+
// set of transitions. We drop those duplicates here, otherwise they would
331+
// increase this stage's run time exponentially.
332+
if (constraintDeduplicationSet.add(new ConstraintDeduplicationKey(argTransition.getId(), rhs))) {
333+
addToStack(argTransition, matcher, rhs, argOperation, j + 1);
334+
}
325335
}
326336
continue outer;
327337
}
338+
assert TransitionConstraint.isNormalized(argConstraint);
328339
createSlot();
329340
matcherBuilders[resultLength] = matcher;
330341
constraintBuilder[resultLength] = argConstraint;
@@ -336,6 +347,7 @@ private void calcDisjointTransitions(CompilationBuffer compilationBuffer) {
336347
}
337348

338349
private void duplicateSlot(int i, CodePointSet matcher, long[] constraints) {
350+
assert TransitionConstraint.isNormalized(constraints);
339351
createSlot();
340352
targetStateSets[resultLength] = targetStateSets[i].copy();
341353
transitionLists[resultLength].addAll(transitionLists[i]);
@@ -473,6 +485,22 @@ private TB[] mergeSameTargets(CompilationBuffer compilationBuffer) {
473485
return resultBuffer2.toArray(createResultArray(resultBuffer2.length()));
474486
}
475487

488+
private record ConstraintDeduplicationKey(int transitionID, long[] constraints) {
489+
490+
@Override
491+
public boolean equals(Object obj) {
492+
if (!(obj instanceof ConstraintDeduplicationKey o)) {
493+
return false;
494+
}
495+
return transitionID == o.transitionID && Arrays.equals(constraints, o.constraints);
496+
}
497+
498+
@Override
499+
public int hashCode() {
500+
return Objects.hash(transitionID, Arrays.hashCode(constraints));
501+
}
502+
}
503+
476504
protected abstract TB createTransitionBuilder(T[] transitions, StateSet<SI, S> targetStateSet, CodePointSet matcherBuilder, long[] constraints, long[] operations);
477505

478506
/**

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAStateTransition.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,11 @@ public int hashCode() {
148148
return getId();
149149
}
150150

151+
@Override
152+
public String toString() {
153+
return String.format("%d - %s -> %d", source.getId(), codePointSet, target.getId());
154+
}
155+
151156
@TruffleBoundary
152157
@Override
153158
public JsonValue toJson() {

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTPostProcessor.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,23 +178,25 @@ private boolean shouldUnroll(QuantifiableTerm term) {
178178
return ast.getNumberOfNodes() <= TRegexOptions.TRegexMaxParseTreeSizeForDFA && (term.getQuantifier().isUnrollTrivial() || term.isUnrollingCandidate(ast.getOptions()));
179179
}
180180

181-
private static final class ShouldUnrollQuantifierVisitor extends DepthFirstTraversalRegexASTVisitor {
181+
private static final class ShouldUnrollQuantifierVisitor extends NodeCountVisitor {
182182

183-
private boolean result;
183+
private boolean containsBackReference;
184184

185185
boolean shouldUnroll(Group group) {
186186
assert group.hasQuantifier();
187187
if (group.getQuantifier().isUnrollTrivial()) {
188188
return true;
189189
}
190-
result = true;
190+
count = 0;
191+
containsBackReference = false;
191192
run(group);
192-
return result;
193+
return count <= TRegexOptions.TRegexQuantifierUnrollLimitGroupNodeCount && !containsBackReference;
193194
}
194195

195196
@Override
196197
protected void visit(BackReference backReference) {
197-
result = false;
198+
super.visit(backReference);
199+
containsBackReference = true;
198200
}
199201
}
200202

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CharacterClass.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,28 @@ public void setWasSingleChar(boolean value) {
128128

129129
@Override
130130
public boolean isUnrollingCandidate(RegexOptions options) {
131-
return hasQuantifier() && getQuantifier().isWithinThreshold(options.quantifierUnrollLimitSingleCC);
131+
if (!hasQuantifier()) {
132+
return false;
133+
}
134+
if (getQuantifier().isWithinThreshold(Math.min(options.quantifierUnrollLimitGroup, options.quantifierUnrollLimitSingleCC))) {
135+
return true;
136+
}
137+
if (getQuantifier().isWithinThreshold(Math.max(options.quantifierUnrollLimitGroup, options.quantifierUnrollLimitSingleCC))) {
138+
// If the quantifier on this character class is large, but still within unroll
139+
// threshold, we take parent group quantifiers into account; If there is a bounded
140+
// quantifier on any parent group, unrolling the current quantifier may lead to either
141+
// an explosion of NFA states if the parent quantifier is unrolled as well, or an
142+
// explosion of guarded NFA transitions if the parent quantifier is not unrolled.
143+
Group parent = getQuantifiedParentGroup();
144+
while (parent != null) {
145+
if (!parent.getQuantifier().isUnrollTrivial()) {
146+
return false;
147+
}
148+
parent = parent.getQuantifiedParentGroup();
149+
}
150+
return true;
151+
}
152+
return false;
132153
}
133154

134155
public void addLookBehindEntry(RegexAST ast, LookBehindAssertion lookBehindEntry) {

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -706,26 +706,6 @@ public SubexpressionCall asSubexpressionCall() {
706706
return (SubexpressionCall) this;
707707
}
708708

709-
public Group getParentQuantifier() {
710-
RegexASTNode current = this;
711-
while (current.getParent() != null) {
712-
assert current instanceof Term;
713-
if (current.getParent() instanceof RegexASTSubtreeRootNode) {
714-
return null;
715-
} else {
716-
assert current.getParent().getParent() instanceof Group;
717-
var nextParent = current.getParent().getParent().asGroup();
718-
if (nextParent.hasQuantifier() && !nextParent.isExpandedQuantifier()) {
719-
return nextParent;
720-
}
721-
// structure is always Group -> Sequence -> Term
722-
current = nextParent;
723-
}
724-
}
725-
// this should only be reached by nodes generated by RegexAST#createNFAInitialStates()!
726-
return null;
727-
}
728-
729709
protected JsonObject toJson(String typeName) {
730710
return Json.obj(Json.prop("id", id),
731711
Json.prop("type", typeName),

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Term.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,23 @@ public RegexASTSubtreeRootNode getSubTreeParent() {
9595
// this should only be reached by nodes generated by RegexAST#createNFAInitialStates()!
9696
return null;
9797
}
98+
99+
public Group getQuantifiedParentGroup() {
100+
RegexASTNode current = this;
101+
while (current.getParent() != null) {
102+
if (current.getParent() instanceof RegexASTSubtreeRootNode) {
103+
return null;
104+
} else {
105+
assert current.getParent().getParent() instanceof Group;
106+
Group nextParent = current.getParent().getParent().asGroup();
107+
if (nextParent.hasNotUnrolledQuantifier()) {
108+
return nextParent;
109+
}
110+
// structure is always Group -> Sequence -> Term
111+
current = nextParent;
112+
}
113+
}
114+
// this should only be reached by nodes generated by RegexAST#createNFAInitialStates()!
115+
return null;
116+
}
98117
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -408,8 +408,9 @@ protected long[] getTransitionGuardsOnPath() {
408408
* @return the quantifier index or -1 if no guards are needed.
409409
*/
410410
protected int needsMaintainGuard() {
411-
Group parentQuant = root.getParentQuantifier();
412-
Group otherParent = cur.getParentQuantifier();
411+
assert cur instanceof Term;
412+
Group parentQuant = root.getQuantifiedParentGroup();
413+
Group otherParent = ((Term) cur).getQuantifiedParentGroup();
413414
if (parentQuant != null && parentQuant.equals(otherParent)) {
414415
for (long guard : transitionGuardsCanonicalized) {
415416
if (TransitionGuard.isQuantifierOp(guard)) {

0 commit comments

Comments
 (0)