Skip to content

Commit 7456cd5

Browse files
committed
[GR-60108] TRegex: NFA generator improvements.
PullRequest: graal/19434
2 parents e89022b + 1ee4c15 commit 7456cd5

File tree

79 files changed

+4119
-1572
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

79 files changed

+4119
-1572
lines changed

regex/src/com.oracle.truffle.regex.test.dummylang/src/com/oracle/truffle/regex/test/dummylang/TRegexTestDummyLanguage.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -59,6 +59,7 @@
5959
import com.oracle.truffle.api.strings.TruffleString;
6060
import com.oracle.truffle.regex.RegexLanguage;
6161
import com.oracle.truffle.regex.RegexObject;
62+
import com.oracle.truffle.regex.RegexSyntaxException;
6263

6364
@TruffleLanguage.Registration(name = TRegexTestDummyLanguage.NAME, id = TRegexTestDummyLanguage.ID, characterMimeTypes = TRegexTestDummyLanguage.MIME_TYPE, version = "0.1", dependentLanguages = RegexLanguage.ID)
6465
public class TRegexTestDummyLanguage extends TruffleLanguage<TRegexTestDummyLanguage.DummyLanguageContext> {
@@ -111,8 +112,12 @@ public Object execute(VirtualFrame frame) {
111112
}
112113
}.getCallTarget();
113114
}
114-
return DummyLanguageContext.get(null).getEnv().parseInternal(
115-
Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build());
115+
try {
116+
return DummyLanguageContext.get(null).getEnv().parseInternal(
117+
Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build());
118+
} catch (RegexSyntaxException e) {
119+
throw e.withErrorCodeInMessage();
120+
}
116121
}
117122

118123
@GenerateInline

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlagsTest.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ public void testParseFlags() {
5757
assertTrue(parse("i").isIgnoreCase());
5858
assertTrue(parse("m").isMultiLine());
5959
assertTrue(parse("s").isDotAll());
60-
assertTrue(parse("t").isTemplate());
6160
assertTrue(parse("u").isUnicodeExplicitlySet());
6261
assertTrue(parse("x").isVerbose());
6362
}

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/InputStringGeneratorTests.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -83,28 +83,29 @@ public void testBenchmarkRegexes() {
8383
testInputStringGenerator(
8484
"([-!#-''*+/-9=?A-Z^-~]+(\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"([ ]!#-[^-~ ]|(\\\\[-~ ]))+\")@[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?(\\.[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?)+");
8585
testInputStringGenerator("(\\S+) (\\S+) (\\S+) \\[([A-Za-z0-9_:/]+\\s[-+]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?");
86+
testInputStringGenerator("(?<=(a))\\1");
8687
}
8788

88-
private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding) {
89+
private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) {
8990
String sourceString = createSourceString(pattern, flags, options, encoding);
9091
Source source = Source.newBuilder("regex", sourceString, "regexSource").build();
9192
RegexSource regexSource = RegexLanguage.createRegexSource(source);
9293
RegexAST ast = regexSource.getOptions().getFlavor().createParser(language, regexSource, new CompilationBuffer(regexSource.getEncoding())).parse();
93-
return InputStringGenerator.generate(ast, rng.nextLong());
94+
return InputStringGenerator.generate(ast, rngSeed);
9495
}
9596

9697
void testInputStringGenerator(String pattern) {
97-
testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding());
98+
testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding(), rng.nextLong());
9899
}
99100

100-
void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding) {
101+
void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) {
101102
Value compiledRegex = compileRegex(pattern, flags);
102-
testInputStringGenerator(pattern, flags, options, encoding, compiledRegex);
103+
testInputStringGenerator(pattern, flags, options, encoding, rngSeed, compiledRegex);
103104
}
104105

105-
private void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, Value compiledRegex) {
106+
private void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed, Value compiledRegex) {
106107
for (int i = 0; i < 20; i++) {
107-
TruffleString input = generateInputString(pattern, flags, options, encoding);
108+
TruffleString input = generateInputString(pattern, flags, options, encoding, rngSeed);
108109
Assert.assertNotNull(input);
109110
Value result = execRegex(compiledRegex, encoding, input, 0);
110111
Assert.assertTrue(result.getMember("isMatch").asBoolean());

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JavaUtilPatternTests.java

Lines changed: 122 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,14 @@
4040
*/
4141
package com.oracle.truffle.regex.tregex.test;
4242

43-
import com.oracle.truffle.regex.charset.Range;
44-
import com.oracle.truffle.regex.tregex.parser.CaseFoldData;
45-
import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags;
46-
import com.oracle.truffle.regex.tregex.string.Encodings;
47-
import com.oracle.truffle.regex.util.EmptyArrays;
43+
import java.util.List;
44+
import java.util.concurrent.ExecutorService;
45+
import java.util.concurrent.Executors;
46+
import java.util.regex.Matcher;
47+
import java.util.regex.Pattern;
48+
import java.util.regex.PatternSyntaxException;
49+
import java.util.stream.Stream;
50+
4851
import org.graalvm.collections.Pair;
4952
import org.graalvm.polyglot.Context;
5053
import org.graalvm.polyglot.PolyglotException;
@@ -53,13 +56,12 @@
5356
import org.junit.Ignore;
5457
import org.junit.Test;
5558

56-
import java.util.List;
57-
import java.util.concurrent.ExecutorService;
58-
import java.util.concurrent.Executors;
59-
import java.util.regex.Matcher;
60-
import java.util.regex.Pattern;
61-
import java.util.regex.PatternSyntaxException;
62-
import java.util.stream.Stream;
59+
import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode;
60+
import com.oracle.truffle.regex.charset.Range;
61+
import com.oracle.truffle.regex.tregex.parser.CaseFoldData;
62+
import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags;
63+
import com.oracle.truffle.regex.tregex.string.Encodings;
64+
import com.oracle.truffle.regex.util.EmptyArrays;
6365

6466
public class JavaUtilPatternTests extends RegexTestBase {
6567

@@ -163,6 +165,8 @@ public void documentationSummary() {
163165
// Boundary matchers
164166
test("^", 0, "");
165167
test("$", 0, "");
168+
test("$", 0, "empty");
169+
test("\\Z", 0, "\r\n");
166170
test("\\b", 0, " a", 1);
167171
// test("\\b{g}", 0, "");
168172
test("\\B", 0, "b");
@@ -1263,6 +1267,112 @@ public void caseFolding() {
12631267
});
12641268
}
12651269

1270+
@Test
1271+
public void generatedTests() {
1272+
/* GENERATED CODE BEGIN - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
1273+
1274+
// Generated using Java version 24
1275+
test("((A|){7,10}?){10,17}", "", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 0, true, 0, 86, 86, 86, 86, 86);
1276+
test("(a{1,30}){1,4}", "", "a", 0, true, 0, 1, 0, 1);
1277+
test("((a|){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7);
1278+
test("((a?){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7);
1279+
test("((|a){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0);
1280+
test("((a??){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0);
1281+
test("((a?){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6);
1282+
test("(a|^){100}", "", "a", 0, true, 0, 0, 0, 0);
1283+
test("(a|^){100}", "", "aa", 0, true, 0, 0, 0, 0);
1284+
test("(a|^){100}", "", "aa", 1, false);
1285+
test("(a|^){100}", "", "ab", 1, false);
1286+
test("(.)\\1{2,}", "", "billiam", 0, false);
1287+
test("(^_(a{1,2}[:])*a{1,2}[:]a{1,2}([.]a{1,4})?_)+", "", "_a:a:a.aaa_", 0, true, 0, 11, 0, 11, 1, 3, 6, 10);
1288+
test("(a{2}|())+$", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4);
1289+
test("^a(b*)\\1{4,6}?", "", "abbbb", 0, true, 0, 1, 1, 1);
1290+
test("^a(b*)\\1{4,6}?", "", "abbbbb", 0, true, 0, 6, 1, 2);
1291+
test("(?<=|$)", "", "a", 0, true, 0, 0);
1292+
test("(?=ab)a", "", "ab", 0, true, 0, 1);
1293+
test("(?=()|^)|x", "", "empty", 0, true, 0, 0, 0, 0);
1294+
test("a(?<=ba)", "", "ba", 0, true, 1, 2);
1295+
test("(?<=(?=|()))", "", "aa", 0, true, 0, 0, -1, -1);
1296+
test("\\d\\W", "iv", "4\u017f", 0, true, 0, 2);
1297+
test("[\u08bc-\ucf3a]", "iv", "\u03b0", 0, false);
1298+
test("a(?:|()\\1){1,2}", "", "a", 0, true, 0, 1, -1, -1);
1299+
expectSyntaxError("|(?<\\d\\1)\ub7e4", "", "", getTRegexEncoding(), "error", 0, ErrorCode.InvalidNamedGroup);
1300+
test("[a-z][a-z\u2028\u2029].|ab(?<=[a-z]w.)", "", "aac", 0, true, 0, 3);
1301+
test("(animation|animation-name)", "", "animation", 0, true, 0, 9, 0, 9);
1302+
test("(a|){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1303+
test("(a|){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3);
1304+
test("(|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1305+
test("(|a){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3);
1306+
test("(a||b){7,7}c", "", "aaabc", 0, true, 0, 5, 4, 4);
1307+
test("(a||b){7,7}c", "", "aaac", 0, true, 0, 4, 3, 3);
1308+
test("(a||b){7,7}c", "", "aaabac", 0, true, 0, 6, 5, 5);
1309+
test("($|a){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
1310+
test("($|a){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3);
1311+
test("(a|$){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
1312+
test("(a|$){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3);
1313+
test("(a|$|b){7,7}", "", "aaab", 0, true, 0, 4, 4, 4);
1314+
test("(a|$|b){7,7}", "", "aaa", 0, true, 0, 3, 3, 3);
1315+
test("(a|$|b){7,7}", "", "aaaba", 0, true, 0, 5, 5, 5);
1316+
test("((?=a)|a){7,7}b", "", "aaa", 0, false);
1317+
test("((?=[ab])|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1318+
test("((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1319+
test("a((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1320+
test("(a|){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1321+
test("(a|){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3);
1322+
test("(|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1323+
test("(|a){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3);
1324+
test("(a||b){0,7}c", "", "aaabc", 0, true, 0, 5, 4, 4);
1325+
test("(a||b){0,7}c", "", "aaac", 0, true, 0, 4, 3, 3);
1326+
test("(a||b){0,7}c", "", "aaabac", 0, true, 0, 6, 5, 5);
1327+
test("((?=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 2, 3);
1328+
test("((?=[ab])|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1329+
test("((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1330+
test("a((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3);
1331+
test("(a*?){11,11}?b", "", "aaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 26, 10, 25);
1332+
test("(?:a(b{0,19})c)", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 9, 1, 8);
1333+
test("(?:a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 11, 1, 8);
1334+
test("(?<=a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 9, 11, 1, 8);
1335+
test("[\ud0d9](?<=\\S)", "", "\ud0d9", 0, true, 0, 1);
1336+
test("[\ud0d9](?<=\\W)", "", "\ud0d9", 0, true, 0, 1);
1337+
test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1);
1338+
test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1);
1339+
test("[\u8053](?<=\\S)", "", "\u8053", 0, true, 0, 1);
1340+
test("[\u8053](?<=\\W)", "", "\u8053", 0, true, 0, 1);
1341+
test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1);
1342+
test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1);
1343+
test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 4);
1344+
test("a|[bc]+(?<=[abc][abcd][abc])", "", "bbbb", 0, true, 0, 4);
1345+
test("a(b*)*c\\1d", "", "abbbbcbbd", 0, true, 0, 9, 3, 5);
1346+
test("(|a)||b(?<=cde)|", "", "a", 0, true, 0, 0, 0, 0);
1347+
test("^(\\1)?\\D*", "s", "empty", 0, true, 0, 5, -1, -1);
1348+
test("abcd(?<=d|c()d)", "", "_abcd", 0, true, 1, 5, -1, -1);
1349+
test("\\Dw\u3aa7\\A\\S(?<=\ue3b3|\\A()\\S)", "", "\udad1\udcfaw\u3aa7A\ue3b3", 0, false);
1350+
test("a(?:c|b(?=()))*", "", "abc", 0, true, 0, 3, 2, 2);
1351+
test("a(?:c|b(?=(c)))*", "", "abc", 0, true, 0, 3, 2, 3);
1352+
test("a(?:c|(?<=(a))b)*", "", "abc", 0, true, 0, 3, 0, 1);
1353+
test("(a||b){15,18}c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11);
1354+
test("(a||b){15,18}?c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11);
1355+
test("(?:ab|c|^){103,104}", "", "abcababccabccabababccabcababcccccabcababababccccabcabcabccabcabcccabababccabababcababababccababccabcababcabcabccabababccccabcab", 0, true, 0, 0);
1356+
test("((?<=a)bec)*d", "", "abecd", 0, true, 1, 5, 1, 4);
1357+
test("(|(^|\\z){2,77}?)?", "", "empty", 0, true, 0, 0, 0, 0, -1, -1);
1358+
test("a(|a{15,36}){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1);
1359+
test("a(|a{15,36}?){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1);
1360+
test("a(|a{15,36}){10,11}$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 66, 66, 66);
1361+
test("a(|a{15,36}?){10,11}b$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 67, 66, 66);
1362+
test("(?:a()|b??){22,26}c", "", "aabbbaabaaaaaabaaaac", 0, true, 0, 20, 19, 19);
1363+
test("b()(a\\1|){4,4}\\2c", "", "baaaac", 0, false);
1364+
test("a((?=b()|)[a-d])+", "", "abbbcbd", 0, true, 0, 7, 6, 7, 6, 6);
1365+
test("a(?=b(?<=ab)()|)", "", "ab", 0, true, 0, 1, 2, 2);
1366+
test("[ab]*?$(?<=[^b][ab][^b])", "", "aaaaaa", 0, true, 0, 6);
1367+
test("([ab]+){0,5}", "", "bbbba", 0, true, 0, 5, 0, 5);
1368+
test("[--a]", "v", "empty", 0, false);
1369+
test("(?:^\\1|$){10,11}bc", "", "aaaaaabc", 0, false);
1370+
test("a(?:|[0-9]+?a|[0-9a]){11,13}?[ab]", "", "a372a466a109585878b", 0, true, 0, 19);
1371+
test("\\Z", "", "\r\n", 0, true, 0, 0);
1372+
1373+
/* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
1374+
}
1375+
12661376
void test(String pattern, int flags, String input) {
12671377
test(pattern, flags, input, 0);
12681378
}

0 commit comments

Comments
 (0)