Skip to content

Commit aad7de1

Browse files
zaclegarssure authored and djoooooe committed
TRegex: Add bounded quantifier in NFA/DFA mode.
1 parent c26d43d commit aad7de1

File tree

67 files changed

+3542
-338
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+3542
-338
lines changed

regex/mx.regex/copyrights/oracle.copyright.hash

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
2+
# Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
33
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
#
55
# The Universal Permissive License (UPL), Version 1.0

regex/mx.regex/copyrights/oracle.copyright.star

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/jmh/TRegexVSJavaBenchmarks.java

Lines changed: 55 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -100,14 +100,55 @@ private ParameterSet(String name, String regex, String flags, String input) {
100100
"([-!#-''*+/-9=?A-Z^-~]+(\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"([ ]!#-[^-~ ]|(\\\\[-~ ]))+\")@[0-9A-Za-z]([0-9A-Za-z-]{0,61}[0-9A-Za-z])?(\\.[0-9A-Za-z]([0-9A-Za-z-]{0,61}[0-9A-Za-z])?)+",
101101
"",
102102
103+
new ParameterSet("email_no_cg",
104+
"(?:[-!#-''*+/-9=?A-Z^-~]+(?:\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"(?:[ ]!#-[^-~ ]|(?:\\\\[-~ ]))+\")@[0-9A-Za-z](?:[0-9A-Za-z-]{0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:[0-9A-Za-z-]{0,61}[0-9A-Za-z])?)+",
105+
"",
106+
103107
new ParameterSet("email_dfa",
104108
"([-!#-''*+/-9=?A-Z^-~]+(\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"([ ]!#-[^-~ ]|(\\\\[-~ ]))+\")@[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?(\\.[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?)+",
105109
"",
106110
107111
new ParameterSet("apache_log",
108112
"(\\S+) (\\S+) (\\S+) \\[([A-Za-z0-9_:/]+\\s[-+]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?",
109113
"",
110-
"205.169.39.63 - - [03/Nov/2022:15:28:53 +0100] \"GET / HTTP/1.1\" 200 911 \"-\" \"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36\"")
114+
"205.169.39.63 - - [03/Nov/2022:15:28:53 +0100] \"GET / HTTP/1.1\" 200 911 \"-\" \"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36\""),
115+
new ParameterSet("bounded_quantifier",
116+
"a{10,110}-a{2,10}-a{5,12}",
117+
"",
118+
"b".repeat(100) + "a".repeat(100) + "-aaaaaa-aaaaaabaa"),
119+
new ParameterSet("simple bounded",
120+
"a{5,10}b",
121+
"",
122+
"a".repeat(200) + "aaaaaab"),
123+
new ParameterSet("complex_transition",
124+
"[ab]{3,5}[bc]{4,5}d",
125+
"",
126+
"aaababbbb-".repeat(10) + "aaababbbbd"),
127+
new ParameterSet("bq large bounds",
128+
"(?:aa){100,200}b",
129+
"",
130+
"aa".repeat(150) + "b"),
131+
new ParameterSet("bq small bounds",
132+
"(?:aa){10,64}b",
133+
"",
134+
"aa".repeat(150) + "b"),
135+
new ParameterSet("bq very large bounds",
136+
"(?:aa){100,600}b",
137+
"",
138+
"aa".repeat(150) + "b"),
139+
new ParameterSet("simple bq very large bounds",
140+
"a{100,600}b",
141+
"",
142+
"aa".repeat(150) + "b"),
143+
new ParameterSet("simple bq very very large bounds",
144+
"a{100,2600}b",
145+
"",
146+
"aa".repeat(150) + "b"),
147+
new ParameterSet("Android",
148+
"Android[\\- ][\\d]+(?:\\.[\\d]+)(?:\\.[\\d]+|); {0,2}[A-Za-z]{2}[_\\-][A-Za-z]{0,2}\\-? {0,2}; {0,2}(.{1,200}?)( Build[/ ]|\\))",
149+
"",
150+
"Mozilla/5.0 (Linux; Android 12; SM-N975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"),
151+
111152
});
112153

113154
private static Map<String, ParameterSet> createMap(ParameterSet[] parameterSets) {
@@ -122,11 +163,12 @@ private static Map<String, ParameterSet> createMap(ParameterSet[] parameterSets)
122163
public static class BenchState {
123164

124165
// excluded by default:
125-
// {"vowels", "date", "ipv4", "ipv6_1", "ipv6_2", "email", "email_dfa", "apache_log"}
166+
// {"vowels", "date", "ipv4", "ipv6_1", "ipv6_2", "email_no_cg", "email_dfa", "apache_log"}
126167
@Param({"ignoreCase", "URL"}) String benchName;
127168
Context context;
128169
Pattern javaPattern;
129170
Value tregexBool;
171+
Value tregexBoolNoUnroll;
130172
Value tregexCG;
131173
String input;
132174

@@ -139,9 +181,12 @@ public void setUp() {
139181
context.enter();
140182
ParameterSet p = benchmarks.get(benchName);
141183
javaPattern = Pattern.compile(p.regex, toJavaFlags(p.flags));
142-
tregexBool = context.parse(TRegexTestDummyLanguage.ID, TRegexTestDummyLanguage.BENCH_PREFIX + "GenerateDFAImmediately=true/" + p.regex + '/' + p.flags);
184+
tregexBool = context.parse(TRegexTestDummyLanguage.ID,
185+
TRegexTestDummyLanguage.BENCH_PREFIX + "GenerateDFAImmediately=true/" + p.regex + '/' + p.flags);
186+
tregexBoolNoUnroll = context.parse(TRegexTestDummyLanguage.ID,
187+
TRegexTestDummyLanguage.BENCH_PREFIX + "QuantifierUnrollThresholdSingleCC=1,QuantifierUnrollThresholdGroup=1,GenerateDFAImmediately=true/" + p.regex + '/' + p.flags);
143188
tregexCG = context.parse(TRegexTestDummyLanguage.ID, TRegexTestDummyLanguage.BENCH_CG_PREFIX + "GenerateDFAImmediately=true/" + p.regex + '/' + p.flags);
144-
input = "_".repeat(200) + p.input;
189+
input = p.input;
145190
}
146191

147192
private static int toJavaFlags(String flags) {
@@ -180,6 +225,11 @@ public boolean tregex(BenchState state) {
180225
return state.tregexBool.execute(state.input, 0).asBoolean();
181226
}
182227

228+
@Benchmark
229+
public boolean tregexNoUnroll(BenchState state) {
230+
return state.tregexBoolNoUnroll.execute(state.input, 0).asBoolean();
231+
}
232+
183233
@Benchmark
184234
public int tregexCG(BenchState state) {
185235
return state.tregexCG.execute(state.input, 0).asInt();

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java

Lines changed: 97 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,8 @@
5151

5252
public class JsTests extends RegexTestBase {
5353

54+
private static final String NEVER_UNROLL_OPT = "QuantifierUnrollThresholdSingleCC=1,QuantifierUnrollThresholdGroup=1";
55+
5456
@Override
5557
String getEngineOptions() {
5658
return "";
@@ -95,6 +97,7 @@ public void zeroWidthBoundedQuantifier() {
9597
test("(a||b){100,200}", "", "ab", 0, true, 0, 2, 1, 2);
9698
test("(a||b){100,200}?", "", "ab", 0, true, 0, 1, 1, 1);
9799
test("(a||b){100,200}?$", "", "ab", 0, true, 0, 2, 1, 2);
100+
test("(a|){1,20}b", "", "aaaaaaaab", 0, true, 0, 9, 7, 8);
98101
}
99102

100103
@Test
@@ -283,6 +286,11 @@ public void mergedLookAheadLiteral() {
283286
test("(?:(?=(abc)))a", "", "abc", 0, true, 0, 1, 0, 3);
284287
}
285288

289+
@Test
290+
public void boundedQuantifierInNFAMode() {
291+
testBoolean("a{3}b", "", "aab", 0, false);
292+
}
293+
286294
@Test
287295
public void innerLiteralSurrogates() {
288296
test("\\udf06", "", "\uD834\uDF06", 0, true, 1, 2);
@@ -319,6 +327,90 @@ public void emptyTransitionMergedWithLookAhead() {
319327
test("a?(?=b(?<=ab)()|)", "", "a", 0, true, 0, 1, -1, -1);
320328
}
321329

330+
@Test
331+
public void boundedQuantifierPaperExample1() {
332+
testBoolean(".*a.{4,8}a", "y", NEVER_UNROLL_OPT, "---aaaaaaaa", 0, true);
333+
}
334+
335+
@Test
336+
public void boundedQuantifierPaperExample2() {
337+
testBoolean("(?:a{9})*b", "", NEVER_UNROLL_OPT, "aaaaaaaaaaaaaaab", 0, true);
338+
}
339+
340+
@Test
341+
public void boundedQuantifierPaperExample3() {
342+
testBoolean(".*a.{9}.", "y", NEVER_UNROLL_OPT, "aaaaaaaaaaa", 0, true);
343+
}
344+
345+
@Test
346+
public void boundedQuantifierFixed() {
347+
testBoolean("[0-9A-F]{8}", "i", NEVER_UNROLL_OPT,
348+
"OData-EntityId: https://url.com/api/data/v8.2/tests(00000000-0000-0000-0000-000000000001)", 0, true);
349+
testBoolean("0{2}", "i", NEVER_UNROLL_OPT,
350+
"0000", 0, true);
351+
}
352+
353+
@Test
354+
public void boundedQuantifierNullable() {
355+
testBoolean("((?:[0-9A-F]?){8})", "i", NEVER_UNROLL_OPT,
356+
"OData-EntityId: https://url.com/api/data/v8.2/tests(00000000-0000-0000-0000-000000000001)", 0, true);
357+
}
358+
359+
@Test
360+
public void dateRegex() {
361+
testBoolean("\\d{1,2}/\\d{1,2}/\\d{4}", "y", NEVER_UNROLL_OPT, "09/08/2024", 0, true);
362+
}
363+
364+
@Test
365+
public void simpleBoundedQuantifier() {
366+
testBoolean(".{2,4}", "sy", NEVER_UNROLL_OPT, "aaaaa", 0, true);
367+
testBoolean(".{3,4}", "sy", NEVER_UNROLL_OPT, "aa", 0, false);
368+
testBoolean("a[ab]{4,8}a", "", NEVER_UNROLL_OPT, "aaaaaaaa", 0, true);
369+
}
370+
371+
@Test
372+
public void multiBoundedQuantifier() {
373+
testBoolean("a{2,4}-a{3,4}", "s", NEVER_UNROLL_OPT, "aaa-aa-aaa", 0, true);
374+
}
375+
376+
@Test
377+
public void anchoredQuantifier() {
378+
testBoolean("(?:ab){2,4}$", "", NEVER_UNROLL_OPT, "aaabab", 0, true);
379+
}
380+
381+
@Test
382+
public void boundedQuantifiersWithOverlappingIterations() {
383+
testBoolean("(?:aa|aaa){3,6}b", "", NEVER_UNROLL_OPT, "aaaaaab", 0, true);
384+
testBoolean("(?:aa|aaa){3,6}b", "", NEVER_UNROLL_OPT, "aaaab", 0, false);
385+
testBoolean("(?:aa|aaa){3,6}b", "", NEVER_UNROLL_OPT, "aaaaab", 0, false);
386+
testBoolean("(?:aa|aaa){3,6}b", "y", NEVER_UNROLL_OPT, "aaaaaaab", 0, true);
387+
testBoolean("(?:aa|aaa){3,6}b", "y", NEVER_UNROLL_OPT, "aaaaaaaaaaaaaaab", 0, true);
388+
testBoolean("(?:aa|aaaaa){3,6}b", "y", NEVER_UNROLL_OPT, "aaaaaaab", 0, false);
389+
}
390+
391+
@Test
392+
public void email() {
393+
var prefix = "john.doe@john.@@";
394+
var input = "[email protected]";
395+
testBoolean("(?:[-!#-''*+/-9=?A-Z^-~]+(?:\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"(?:[ ]!#-[^-~ ]|(?:\\\\[-~ ]))+\")@[0-9A-Za-z](?:[0-9A-Za-z-]{0,61}[0-9A-Za-z])?(?:\\.[0-9A-Za-z](?:[0-9A-Za-z-]{0,61}[0-9A-Za-z])?)+",
396+
"",
397+
NEVER_UNROLL_OPT,
398+
prefix + input + " ",
399+
0,
400+
true);
401+
}
402+
403+
@Test
404+
public void nestedQuantifier() {
405+
testBoolean("(?:a{1,2}){1,2}", "", NEVER_UNROLL_OPT, "bbb", 0, false);
406+
}
407+
408+
@Test
409+
public void boundedQuantifierWithInversePriority() {
410+
testBoolean(".{4,5}d", "", NEVER_UNROLL_OPT, "aaaadddd", 0, true);
411+
testBoolean("a{2,3}d", "", NEVER_UNROLL_OPT, "babaaadaaaaa", 0, true);
412+
}
413+
322414
@Test
323415
public void gr60222() {
324416
test("(?<=a)b|", "m", "aaabaaa", 3, true, 3, 4);
@@ -560,4 +652,9 @@ public void generatedTests() {
560652

561653
/* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
562654
}
655+
656+
@Test
657+
public void overlappingBq() {
658+
testBoolean("(?=a{2,4})[ab]{4,68}c", "", NEVER_UNROLL_OPT, "aabbbbbbbbbbbbbbbbbbbbbbc", 0, true);
659+
}
563660
}

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RegexTestBase.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,6 +153,7 @@ Value execRegexBoolean(Value compiledRegex, Encodings.Encoding encoding, Truffle
153153

154154
void testBoolean(String pattern, String flags, String options, String input, int fromIndex, boolean isMatch) {
155155
String expectedResult = isMatch ? "Match" : "NoMatch";
156+
options = options.isEmpty() ? "BooleanMatch=true" : "BooleanMatch=true," + options;
156157
try {
157158
Value compiledRegex = compileRegex(pattern, flags, options, getTRegexEncoding());
158159
Value result = execRegexBoolean(compiledRegex, getTRegexEncoding(), input, fromIndex);

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexOptions.java

Lines changed: 36 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -166,11 +166,16 @@ public final class RegexOptions {
166166

167167
private static final String PARSE_SHORT_ERROR_MSG = "expected a short integer value";
168168

169+
private static final String QUANTIFIER_UNROLL_THRESHOLD_SINGLE_CC = "QuantifierUnrollThresholdSingleCC";
170+
private static final String QUANTIFIER_UNROLL_THRESHOLD_GROUP = "QuantifierUnrollThresholdGroup";
171+
169172
public static final RegexOptions DEFAULT = new RegexOptions(0,
170173
(short) TRegexOptions.TRegexMaxDFATransitions,
171174
(short) TRegexOptions.TRegexMaxBackTrackerMergeExplodeSize,
172175
ECMAScriptFlavor.INSTANCE,
173-
Encodings.UTF_16_RAW, null, null, JAVA_JDK_VERSION_DEFAULT);
176+
Encodings.UTF_16_RAW, null, null, JAVA_JDK_VERSION_DEFAULT,
177+
(short) TRegexOptions.TRegexQuantifierUnrollThresholdSingleCC,
178+
(short) TRegexOptions.TRegexQuantifierUnrollThresholdGroup);
174179

175180
private final int options;
176181
private final short maxDFASize;
@@ -180,6 +185,8 @@ public final class RegexOptions {
180185
private final MatchingMode matchingMode;
181186
private final String pythonLocale;
182187
private final byte javaJDKVersion;
188+
public final short quantifierUnrollThresholdSingleCC;
189+
public final short quantifierUnrollThresholdGroup;
183190

184191
private RegexOptions(
185192
int options,
@@ -189,7 +196,9 @@ private RegexOptions(
189196
Encodings.Encoding encoding,
190197
MatchingMode matchingMode,
191198
String pythonLocale,
192-
byte javaJDKVersion) {
199+
byte javaJDKVersion,
200+
short quantifierUnrollThresholdSingleCC,
201+
short quantifierUnrollThresholdGroup) {
193202
this.options = options;
194203
this.maxDFASize = maxDFASize;
195204
this.maxBackTrackerCompileSize = maxBackTrackerCompileSize;
@@ -198,6 +207,8 @@ private RegexOptions(
198207
this.matchingMode = matchingMode;
199208
this.pythonLocale = pythonLocale;
200209
this.javaJDKVersion = javaJDKVersion;
210+
this.quantifierUnrollThresholdSingleCC = quantifierUnrollThresholdSingleCC;
211+
this.quantifierUnrollThresholdGroup = quantifierUnrollThresholdGroup;
201212
}
202213

203214
public static Builder builder(Source source, String sourceString) {
@@ -336,11 +347,13 @@ public int getJavaJDKVersion() {
336347
}
337348

338349
public RegexOptions withBooleanMatch() {
339-
return new RegexOptions(options | BOOLEAN_MATCH, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion);
350+
return new RegexOptions(options | BOOLEAN_MATCH, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion, quantifierUnrollThresholdSingleCC,
351+
quantifierUnrollThresholdGroup);
340352
}
341353

342354
public RegexOptions withoutBooleanMatch() {
343-
return new RegexOptions(options & ~BOOLEAN_MATCH, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion);
355+
return new RegexOptions(options & ~BOOLEAN_MATCH, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion, quantifierUnrollThresholdSingleCC,
356+
quantifierUnrollThresholdGroup);
344357
}
345358

346359
@Override
@@ -459,12 +472,16 @@ public static final class Builder {
459472
private MatchingMode matchingMode;
460473
private String pythonLocale;
461474
private byte javaJDKVersion = JAVA_JDK_VERSION_DEFAULT;
475+
private short quantifierUnrollThresholdSingleCC;
476+
private short quantifierUnrollThresholdGroup;
462477

463478
private Builder(Source source, String sourceString) {
464479
this.source = source;
465480
this.src = sourceString;
466481
this.options = 0;
467482
this.flavor = ECMAScriptFlavor.INSTANCE;
483+
quantifierUnrollThresholdSingleCC = DEFAULT.quantifierUnrollThresholdSingleCC;
484+
quantifierUnrollThresholdGroup = DEFAULT.quantifierUnrollThresholdGroup;
468485
}
469486

470487
@TruffleBoundary
@@ -539,6 +556,18 @@ public int parseOptions() throws RegexSyntaxException {
539556
throw optionsSyntaxErrorUnexpectedKey();
540557
}
541558
break;
559+
case 'Q':
560+
switch (lookAheadInKey("QuantifierUnrollThreshold".length())) {
561+
case 'S':
562+
quantifierUnrollThresholdSingleCC = parseShortOption(QUANTIFIER_UNROLL_THRESHOLD_SINGLE_CC);
563+
break;
564+
case 'G':
565+
quantifierUnrollThresholdGroup = parseShortOption(QUANTIFIER_UNROLL_THRESHOLD_GROUP);
566+
break;
567+
default:
568+
throw optionsSyntaxErrorUnexpectedKey();
569+
}
570+
break;
542571
case 'R':
543572
parseBooleanOption(REGRESSION_TEST_MODE_NAME, REGRESSION_TEST_MODE);
544573
break;
@@ -763,7 +792,8 @@ public Encodings.Encoding getEncoding() {
763792
}
764793

765794
public RegexOptions build() {
766-
return new RegexOptions(options, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion);
795+
return new RegexOptions(options, maxDFASize, maxBackTrackerCompileSize, flavor, encoding, matchingMode, pythonLocale, javaJDKVersion, quantifierUnrollThresholdSingleCC,
796+
quantifierUnrollThresholdGroup);
767797
}
768798
}
769799
}

0 commit comments

Comments
 (0)