diff --git a/regex/src/com.oracle.truffle.regex.test.dummylang/src/com/oracle/truffle/regex/test/dummylang/TRegexTestDummyLanguage.java b/regex/src/com.oracle.truffle.regex.test.dummylang/src/com/oracle/truffle/regex/test/dummylang/TRegexTestDummyLanguage.java index a707a003ee7..ae8a2434b80 100644 --- a/regex/src/com.oracle.truffle.regex.test.dummylang/src/com/oracle/truffle/regex/test/dummylang/TRegexTestDummyLanguage.java +++ b/regex/src/com.oracle.truffle.regex.test.dummylang/src/com/oracle/truffle/regex/test/dummylang/TRegexTestDummyLanguage.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -59,6 +59,7 @@ import com.oracle.truffle.api.strings.TruffleString; import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexObject; +import com.oracle.truffle.regex.RegexSyntaxException; @TruffleLanguage.Registration(name = TRegexTestDummyLanguage.NAME, id = TRegexTestDummyLanguage.ID, characterMimeTypes = TRegexTestDummyLanguage.MIME_TYPE, version = "0.1", dependentLanguages = RegexLanguage.ID) public class TRegexTestDummyLanguage extends TruffleLanguage { @@ -111,8 +112,12 @@ public Object execute(VirtualFrame frame) { } }.getCallTarget(); } - return DummyLanguageContext.get(null).getEnv().parseInternal( - Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build()); + try { + return DummyLanguageContext.get(null).getEnv().parseInternal( + Source.newBuilder(RegexLanguage.ID, src, parsingRequest.getSource().getName()).internal(true).build()); + } catch (RegexSyntaxException e) { + throw e.withErrorCodeInMessage(); + } } @GenerateInline diff --git 
a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlagsTest.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlagsTest.java index 08e98bd5014..baf33a7c89d 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlagsTest.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlagsTest.java @@ -57,7 +57,6 @@ public void testParseFlags() { assertTrue(parse("i").isIgnoreCase()); assertTrue(parse("m").isMultiLine()); assertTrue(parse("s").isDotAll()); - assertTrue(parse("t").isTemplate()); assertTrue(parse("u").isUnicodeExplicitlySet()); assertTrue(parse("x").isVerbose()); } diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/InputStringGeneratorTests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/InputStringGeneratorTests.java index 67650b767f0..eff8ed3cd0e 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/InputStringGeneratorTests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/InputStringGeneratorTests.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -83,28 +83,29 @@ public void testBenchmarkRegexes() { testInputStringGenerator( "([-!#-''*+/-9=?A-Z^-~]+(\\.[-!#-''*+/-9=?A-Z^-~]+)*|\"([ ]!#-[^-~ ]|(\\\\[-~ ]))+\")@[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?(\\.[0-9A-Za-z]([0-9A-Za-z-]*[0-9A-Za-z])?)+"); testInputStringGenerator("(\\S+) (\\S+) (\\S+) \\[([A-Za-z0-9_:/]+\\s[-+]\\d{4})\\] \"(\\S+)\\s?(\\S+)?\\s?(\\S+)?\" (\\d{3}|-) (\\d+|-)\\s?\"?([^\"]*)\"?\\s?\"?([^\"]*)?\"?"); + testInputStringGenerator("(?<=(a))\\1"); } - private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding) { + private TruffleString generateInputString(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) { String sourceString = createSourceString(pattern, flags, options, encoding); Source source = Source.newBuilder("regex", sourceString, "regexSource").build(); RegexSource regexSource = RegexLanguage.createRegexSource(source); RegexAST ast = regexSource.getOptions().getFlavor().createParser(language, regexSource, new CompilationBuffer(regexSource.getEncoding())).parse(); - return InputStringGenerator.generate(ast, rng.nextLong()); + return InputStringGenerator.generate(ast, rngSeed); /* the seed is supplied by the caller instead of being drawn from rng here */ } void testInputStringGenerator(String pattern) { - testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding()); + testInputStringGenerator(pattern, "", getEngineOptions(), getTRegexEncoding(), rng.nextLong()); } - void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding) { + void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed) { Value compiledRegex = compileRegex(pattern, flags); - testInputStringGenerator(pattern, flags, options, encoding, compiledRegex); + testInputStringGenerator(pattern, flags, options, encoding, rngSeed, compiledRegex); } - private void 
testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, Value compiledRegex) { + private void testInputStringGenerator(String pattern, String flags, String options, Encodings.Encoding encoding, long rngSeed, Value compiledRegex) { for (int i = 0; i < 20; i++) { - TruffleString input = generateInputString(pattern, flags, options, encoding); + TruffleString input = generateInputString(pattern, flags, options, encoding, rngSeed + i); /* derive a distinct but reproducible seed per iteration (iteration 0 uses rngSeed itself); passing rngSeed unchanged would generate and check the same input 20 times */ Assert.assertNotNull(input); Value result = execRegex(compiledRegex, encoding, input, 0); Assert.assertTrue(result.getMember("isMatch").asBoolean()); diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JavaUtilPatternTests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JavaUtilPatternTests.java index ba298070cc2..d3daf0ae6d9 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JavaUtilPatternTests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JavaUtilPatternTests.java @@ -40,19 +40,6 @@ */ package com.oracle.truffle.regex.tregex.test; -import com.oracle.truffle.regex.charset.Range; -import com.oracle.truffle.regex.tregex.parser.CaseFoldData; -import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags; -import com.oracle.truffle.regex.tregex.string.Encodings; -import com.oracle.truffle.regex.util.EmptyArrays; -import org.graalvm.collections.Pair; -import org.graalvm.polyglot.Context; -import org.graalvm.polyglot.PolyglotException; -import org.graalvm.polyglot.Value; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; - import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -62,6 +49,21 @@ import java.util.regex.PatternSyntaxException; import java.util.stream.Stream; +import org.graalvm.collections.Pair; +import org.graalvm.polyglot.Context; 
+import org.graalvm.polyglot.PolyglotException; +import org.graalvm.polyglot.Value; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; +import com.oracle.truffle.regex.charset.Range; +import com.oracle.truffle.regex.tregex.parser.CaseFoldData; +import com.oracle.truffle.regex.tregex.parser.flavors.java.JavaFlags; +import com.oracle.truffle.regex.tregex.string.Encodings; +import com.oracle.truffle.regex.util.EmptyArrays; + public class JavaUtilPatternTests extends RegexTestBase { public static final String ENGINE_OPTIONS = "Flavor=JavaUtilPattern,MatchingMode=search,JavaJDKVersion=" + Runtime.version().feature(); @@ -164,6 +166,8 @@ public void documentationSummary() { // Boundary matchers test("^", 0, ""); test("$", 0, ""); + test("$", 0, "empty"); + test("\\Z", 0, "\r\n"); test("\\b", 0, " a", 1); // test("\\b{g}", 0, ""); test("\\B", 0, "b"); @@ -1266,6 +1270,112 @@ public void caseFolding() { }); } + @Test + public void generatedTests() { + /* GENERATED CODE BEGIN - KEEP THIS MARKER FOR AUTOMATIC UPDATES */ + + // Generated using Java version 24 + test("((A|){7,10}?){10,17}", "", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 0, true, 0, 86, 86, 86, 86, 86); + test("(a{1,30}){1,4}", "", "a", 0, true, 0, 1, 0, 1); + test("((a|){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7); + test("((a?){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7); + test("((|a){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((a??){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((a?){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6); + test("(a|^){100}", "", "a", 0, true, 0, 0, 0, 0); + test("(a|^){100}", "", "aa", 0, true, 0, 0, 0, 0); + test("(a|^){100}", "", "aa", 1, false); + test("(a|^){100}", "", "ab", 1, false); + test("(.)\\1{2,}", "", "billiam", 0, false); + 
test("(^_(a{1,2}[:])*a{1,2}[:]a{1,2}([.]a{1,4})?_)+", "", "_a:a:a.aaa_", 0, true, 0, 11, 0, 11, 1, 3, 6, 10); + test("(a{2}|())+$", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4); + test("^a(b*)\\1{4,6}?", "", "abbbb", 0, true, 0, 1, 1, 1); + test("^a(b*)\\1{4,6}?", "", "abbbbb", 0, true, 0, 6, 1, 2); + test("(?<=|$)", "", "a", 0, true, 0, 0); + test("(?=ab)a", "", "ab", 0, true, 0, 1); + test("(?=()|^)|x", "", "empty", 0, true, 0, 0, 0, 0); + test("a(?<=ba)", "", "ba", 0, true, 1, 2); + test("(?<=(?=|()))", "", "aa", 0, true, 0, 0, -1, -1); + test("\\d\\W", "iv", "4\u017f", 0, true, 0, 2); + test("[\u08bc-\ucf3a]", "iv", "\u03b0", 0, false); + test("a(?:|()\\1){1,2}", "", "a", 0, true, 0, 1, -1, -1); + expectSyntaxError("|(?<\\d\\1)\ub7e4", "", "", getTRegexEncoding(), "error", 0, ErrorCode.InvalidNamedGroup); + test("[a-z][a-z\u2028\u2029].|ab(?<=[a-z]w.)", "", "aac", 0, true, 0, 3); + test("(animation|animation-name)", "", "animation", 0, true, 0, 9, 0, 9); + test("(a|){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a|){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a||b){7,7}c", "", "aaabc", 0, true, 0, 5, 4, 4); + test("(a||b){7,7}c", "", "aaac", 0, true, 0, 4, 3, 3); + test("(a||b){7,7}c", "", "aaabac", 0, true, 0, 6, 5, 5); + test("($|a){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("($|a){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$|b){7,7}", "", "aaab", 0, true, 0, 4, 4, 4); + test("(a|$|b){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$|b){7,7}", "", "aaaba", 0, true, 0, 5, 5, 5); + test("((?=a)|a){7,7}b", "", "aaa", 0, false); + test("((?=[ab])|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("a((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + 
test("(a|){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a|){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3); + test("(|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3); + test("(a||b){0,7}c", "", "aaabc", 0, true, 0, 5, 4, 4); + test("(a||b){0,7}c", "", "aaac", 0, true, 0, 4, 3, 3); + test("(a||b){0,7}c", "", "aaabac", 0, true, 0, 6, 5, 5); + test("((?=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 2, 3); + test("((?=[ab])|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("a((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a*?){11,11}?b", "", "aaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 26, 10, 25); + test("(?:a(b{0,19})c)", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 9, 1, 8); + test("(?:a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 11, 1, 8); + test("(?<=a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 9, 11, 1, 8); + test("[\ud0d9](?<=\\S)", "", "\ud0d9", 0, true, 0, 1); + test("[\ud0d9](?<=\\W)", "", "\ud0d9", 0, true, 0, 1); + test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1); + test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1); + test("[\u8053](?<=\\S)", "", "\u8053", 0, true, 0, 1); + test("[\u8053](?<=\\W)", "", "\u8053", 0, true, 0, 1); + test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 1); + test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 1); + test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 4); + test("a|[bc]+(?<=[abc][abcd][abc])", "", "bbbb", 0, true, 0, 4); + test("a(b*)*c\\1d", "", "abbbbcbbd", 0, true, 0, 9, 3, 5); + test("(|a)||b(?<=cde)|", "", "a", 0, true, 0, 0, 0, 0); + test("^(\\1)?\\D*", "s", "empty", 0, true, 0, 5, -1, -1); + test("abcd(?<=d|c()d)", "", "_abcd", 0, true, 1, 5, -1, -1); + test("\\Dw\u3aa7\\A\\S(?<=\ue3b3|\\A()\\S)", "", "\udad1\udcfaw\u3aa7A\ue3b3", 0, false); + test("a(?:c|b(?=()))*", "", "abc", 0, true, 0, 3, 2, 2); + test("a(?:c|b(?=(c)))*", "", "abc", 0, 
true, 0, 3, 2, 3); + test("a(?:c|(?<=(a))b)*", "", "abc", 0, true, 0, 3, 0, 1); + test("(a||b){15,18}c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11); + test("(a||b){15,18}?c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11); + test("(?:ab|c|^){103,104}", "", "abcababccabccabababccabcababcccccabcababababccccabcabcabccabcabcccabababccabababcababababccababccabcababcabcabccabababccccabcab", 0, true, 0, 0); + test("((?<=a)bec)*d", "", "abecd", 0, true, 1, 5, 1, 4); + test("(|(^|\\z){2,77}?)?", "", "empty", 0, true, 0, 0, 0, 0, -1, -1); + test("a(|a{15,36}){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1); + test("a(|a{15,36}?){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1); + test("a(|a{15,36}){10,11}$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 66, 66, 66); + test("a(|a{15,36}?){10,11}b$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 67, 66, 66); + test("(?:a()|b??){22,26}c", "", "aabbbaabaaaaaabaaaac", 0, true, 0, 20, 19, 19); + test("b()(a\\1|){4,4}\\2c", "", "baaaac", 0, false); + test("a((?=b()|)[a-d])+", "", "abbbcbd", 0, true, 0, 7, 6, 7, 6, 6); + test("a(?=b(?<=ab)()|)", "", "ab", 0, true, 0, 1, 2, 2); + test("[ab]*?$(?<=[^b][ab][^b])", "", "aaaaaa", 0, true, 0, 6); + test("([ab]+){0,5}", "", "bbbba", 0, true, 0, 5, 0, 5); + test("[--a]", "v", "empty", 0, false); + test("(?:^\\1|$){10,11}bc", "", "aaaaaabc", 0, false); + test("a(?:|[0-9]+?a|[0-9a]){11,13}?[ab]", "", "a372a466a109585878b", 0, true, 0, 19); + test("\\Z", "", "\r\n", 0, true, 0, 0); + + /* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */ + } + void test(String pattern, int flags, String input) { test(pattern, flags, input, 0); } diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java 
b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java index 5fa534eec16..2cf60ae7119 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/JsTests.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -44,6 +44,7 @@ import org.junit.Assert; import org.junit.Test; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.errors.JsErrorMessages; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.string.Encodings; @@ -293,9 +294,11 @@ public void innerLiteralSurrogates() { @Test public void gr52906() { // Original test case - test("\\b(((.*?)){67108860})\\b|(?=(?=(?!.).\\b(\\d))){0,4}", "yi", "L1O\n\n\n11\n \n\n11\n \uD091 1aa\uFCDB=\n ", 0, true, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1); + // note: original counter value is 67108860, reduced to let the test finish in reasonable + // time. 
+ test("\\b(((.*?)){9999})\\b|(?=(?=(?!.).\\b(\\d))){0,4}", "yi", "L1O\n\n\n11\n \n\n11\n \uD091 1aa\uFCDB=\n ", 0, true, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1); // Minimized version - test("(.*?){67108863}", "", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", 0, true, 0, 0, 0, 0); + test("(.*?){9999}", "", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", 0, true, 0, 0, 0, 0); // Linked issue test("(?=(?=(\\W)\u008e+|\\uC47A|(\\s)))+?|((((?:(\\\u0015)))+?))|(?:\\r|[^]+?[^])|\\3{3,}", "gyim", "", 0, true, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); @@ -307,4 +310,240 @@ public void gr52906() { public void gr56676() { test("(?> encoding.getStride(); + return execRegexBoolean(compiledRegex, encoding, converted, fromIndex, length, 0, length); + } + + Value execRegexBoolean(Value compiledRegex, Encodings.Encoding encoding, TruffleString input, int fromIndex, int toIndex, int regionFrom, int regionTo) { /* forwards to the compiled regex's "execBoolean" member, passing the input switched to the requested encoding */ + return compiledRegex.invokeMember("execBoolean", input.switchEncodingUncached(encoding.getTStringEncoding()), fromIndex, toIndex, regionFrom, regionTo); + } + + void testBoolean(String pattern, String flags, String options, String input, int fromIndex, boolean isMatch) { /* compiles the pattern with the given options, runs execRegexBoolean from fromIndex and compares the boolean result with isMatch; a mismatch is printed via printTable and fails the test only when ASSERTS is set */ + String expectedResult = isMatch ? "Match" : "NoMatch"; + try { + Value compiledRegex = compileRegex(pattern, flags, options, getTRegexEncoding()); + Value result = execRegexBoolean(compiledRegex, getTRegexEncoding(), input, fromIndex); + if (result.asBoolean() != isMatch) { + String actualResult = result.asBoolean() ? 
"Match" : "NoMatch"; + printTable(pattern, flags, input, fromIndex, expectedResult, actualResult); + if (ASSERTS) { + Assert.fail(options + regexSlashes(pattern, flags) + ' ' + quote(input) + " expected: " + expectedResult + ", actual: " + actualResult); + } + } + } catch (PolyglotException e) { /* a syntax error is only tabulated when ASSERTS is off; every other case is rethrown */ + if (!ASSERTS && e.isSyntaxError()) { + printTable(pattern, flags, input, fromIndex, expectedResult, syntaxErrorToString(e.getMessage())); + } else { + throw e; + } + } + } + + void testBoolean(String pattern, String flags, String input, int fromIndex, boolean isMatch) { /* convenience overload: uses "BooleanMatch=true" as the only option */ + testBoolean(pattern, flags, "BooleanMatch=true", input, fromIndex, isMatch); + } + void test(String pattern, String flags, String input, int fromIndex, boolean isMatch, int... captureGroupBoundsAndLastGroup) { test(pattern, flags, "", input, fromIndex, isMatch, captureGroupBoundsAndLastGroup); } @@ -155,6 +199,7 @@ void test(String pattern, String flags, String options, Encodings.Encoding encod throw e; } } + testBoolean(pattern, flags, "BooleanMatch=true" + (options.isEmpty() ? "" : "," + options), input, fromIndex, isMatch); /* each capture-group test is additionally run through testBoolean, with BooleanMatch=true prepended to any caller-supplied options */ } void test(Value compiledRegex, String pattern, String flags, String options, Encodings.Encoding encoding, String input, int fromIndex, boolean isMatch, int... 
captureGroupBoundsAndLastGroup) { @@ -235,12 +280,12 @@ void expectSyntaxError(String pattern, String flags, String expectedMessage, int expectSyntaxError(pattern, flags, "", getTRegexEncoding(), "", 0, expectedMessage, expectedPosition); } - void expectSyntaxError(String pattern, String flags, String options, String expectedMessage, int expectedPosition) { - expectSyntaxError(pattern, flags, options, getTRegexEncoding(), "", 0, expectedMessage, expectedPosition); + void expectSyntaxError(String pattern, String flags, String options, Encodings.Encoding encoding, String input, int fromIndex, ErrorCode expectedErrorCode) { /* ErrorCode overload without a position: delegates with Integer.MIN_VALUE as expectedPosition - presumably a "do not check the position" sentinel, TODO confirm in the String-based overload */ + expectSyntaxError(pattern, flags, options, encoding, input, fromIndex, expectedErrorCode, Integer.MIN_VALUE); } - void expectSyntaxError(String pattern, String flags, String options, Encodings.Encoding encoding, String input, int fromIndex, String expectedMessage) { - expectSyntaxError(pattern, flags, options, encoding, input, fromIndex, expectedMessage, Integer.MIN_VALUE); + void expectSyntaxError(String pattern, String flags, String options, Encodings.Encoding encoding, String input, int fromIndex, ErrorCode expectedErrorCode, int expectedPosition) { /* the enum constant's name() is handed to the String-based overload as the expected message */ + expectSyntaxError(pattern, flags, options, encoding, input, fromIndex, expectedErrorCode.name(), expectedPosition); } void expectSyntaxError(String pattern, String flags, String options, Encodings.Encoding encoding, String input, int fromIndex, String expectedMessage, int expectedPosition) { diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyUTF8Tests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyUTF8Tests.java index 6b858f8b25b..fa14cee7274 100644 --- a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyUTF8Tests.java +++ b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyUTF8Tests.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2023, 
Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -40,9 +40,11 @@ */ package com.oracle.truffle.regex.tregex.test; -import com.oracle.truffle.regex.tregex.string.Encodings; import org.junit.Test; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; +import com.oracle.truffle.regex.tregex.string.Encodings; + public class RubyUTF8Tests extends RegexTestBase { @Override @@ -59,4 +61,327 @@ Encodings.Encoding getTRegexEncoding() { public void ignoreCaseBackReferences() { test("^(\uff21)(a)\\1\\2$", "i", "\uff21a\uff41A", 0, true, 0, 8, 0, 3, 3, 4); } + + @Test + public void generatedTests() { + /* GENERATED CODE BEGIN - KEEP THIS MARKER FOR AUTOMATIC UPDATES */ + + // Generated using Ruby version 3.3.5 + test("(?=.*c)a(()b?)?c", "", "ac", 0, true, 0, 2, 1, 1, 1, 1); + test("(?=.*c)a(b*)?c", "", "ac", 0, true, 0, 2, 1, 1); + test("(?=.*c)a(()b*)?c", "", "ac", 0, true, 0, 2, 1, 1, 1, 1); + test("(?=.*b)a{2}", "", "aaab", 0, true, 0, 2); + test("a{2}?", "", "c", 0, true, 0, 0); + test("a{2,4}?", "", "c", 0, false); + test("a+?", "", "c", 0, false); + test("a{2}?(b)?c", "", "c", 0, true, 0, 1, -1, -1); + test("(?>(aa)?)+", "", "a", 0, true, 0, 0, -1, -1); + test("(|a+?){0,4}b", "", "aaab", 0, true, 0, 4, 1, 3); + test("(a{2}|())+$", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4); + test("^a(b*)\\1{4,6}?", "", "abbbb", 0, true, 0, 1, 1, 1); + test("^a(b*)\\1{4,6}?", "", "abbbbb", 0, true, 0, 6, 1, 2); + test("a(?:c|b(?=()))*", "", "abc", 0, true, 0, 3, 2, 2); + test("a(?:c|b(?=(c)))*", "", "abc", 0, true, 0, 3, 2, 3); + test("a(?:c|(?<=(a))b)*", "", "abc", 0, true, 0, 3, 0, 1); + test("\\Z", "", "\r", 0, true, 1, 1); + test("(?<=\\A)", "", "\r", 0, true, 0, 0); + test("(?<=\\b)", "", "\r", 0, false); + test("(?<=\\B)", "", "\r", 0, true, 0, 0); + 
expectSyntaxError("(?<=+?)", "", "", getTRegexEncoding(), "error", 0, ErrorCode.InvalidQuantifier); + test("(?<=)", "", "empty", 0, true, 0, 0); + test("()?", "", "", 0, true, 0, 0, 0, 0); + test("(a*)?", "", "", 0, true, 0, 0, 0, 0); + test("(a*)*", "", "", 0, true, 0, 0, 0, 0); + test("(?:a|()){50,100}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 50, 50, 50); + test("()??", "", "", 0, true, 0, 0, -1, -1); + test("(a*?)?", "", "", 0, true, 0, 0, 0, 0); + test("(a*)??", "", "", 0, true, 0, 0, -1, -1); + test("(a*?)??", "", "", 0, true, 0, 0, -1, -1); + test("(a*?)*", "", "", 0, true, 0, 0, 0, 0); + test("(a*)*?", "", "", 0, true, 0, 0, -1, -1); + test("(a*?)*?", "", "", 0, true, 0, 0, -1, -1); + test("(a|\\2b|())*", "", "aaabbb", 0, true, 0, 6, 6, 6, 6, 6); + test("(a|\\2b|()){2,4}", "", "aaabbb", 0, true, 0, 3, 3, 3, 3, 3); + test("(a|\\2b|\\3()|())*", "", "aaabbb", 0, true, 0, 6, 6, 6, 6, 6, 3, 3); + test("(a|\\2b|\\3()|()){2,4}", "", "aaabbb", 0, true, 0, 3, 3, 3, -1, -1, 3, 3); + test("(a|\\2b|()){20,24}", "", "aaaaaaaaaaaaaaaaaaaabbbbb", 0, true, 0, 23, 22, 23, 20, 20); + test("(a|\\2b|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1); + test("(a|\\2b|()){2,4}", "", "aaabbb", 0, true, 0, 3, 3, 3, 3, 3); + test("(a|\\2b|\\3()|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("(a|\\2b|\\3()|()){2,4}", "", "aaabbb", 0, true, 0, 3, 3, 3, -1, -1, 3, 3); + test("(a|\\2b|()){20,24}", "", "aaaaaaaaaaaaaaaaaaaabbbbb", 0, true, 0, 23, 22, 23, 20, 20); + test("(?:|a)*", "", "aaa", 0, true, 0, 0); + test("(?:()|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("(|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("(()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()\\1(?:|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("()\\1(?:()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()\\1(|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()\\1(()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0, 0, 0); + test("()(?:\\1|a)*", "", 
"aaa", 0, true, 0, 0, 0, 0); + test("()(?:()\\1|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()(?:(\\1)|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()(?:\\1()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()(\\1|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()(()\\1|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0, 0, 0); + test("()((\\1)|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0, 0, 0); + test("()(\\1()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0, 0, 0); + test("(?:(?=a)|a)*", "", "aaa", 0, true, 0, 0); + test("(?:(?=a)()|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("(?:()(?=a)|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("(?:((?=a))|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("()\\1(?:(?=a)|a)*", "", "aaa", 0, true, 0, 0, 0, 0); + test("()\\1(?:(?=a)()|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()\\1(?:()(?=a)|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("()\\1(?:((?=a))|a)*", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("(?:|a)*?", "", "aaa", 0, true, 0, 0); + test("(?:()|a)*?", "", "aaa", 0, true, 0, 0, -1, -1); + test("(|a)*?", "", "aaa", 0, true, 0, 0, -1, -1); + test("(()|a)*?", "", "aaa", 0, true, 0, 0, -1, -1, -1, -1); + test("()\\1(?:|a)*?", "", "aaa", 0, true, 0, 0, 0, 0); + test("()\\1(?:()|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()\\1(|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()\\1(()|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1, -1, -1); + test("()(?:\\1|a)*?", "", "aaa", 0, true, 0, 0, 0, 0); + test("()(?:()\\1|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()(?:(\\1)|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()(?:\\1()|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()(\\1|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()(()\\1|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1, -1, -1); + test("()((\\1)|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1, -1, -1); + test("()(\\1()|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1, -1, -1); + 
test("(?:(?=a)|a)*?", "", "aaa", 0, true, 0, 0); + test("(?:(?=a)()|a)*?", "", "aaa", 0, true, 0, 0, -1, -1); + test("(?:()(?=a)|a)*?", "", "aaa", 0, true, 0, 0, -1, -1); + test("(?:((?=a))|a)*?", "", "aaa", 0, true, 0, 0, -1, -1); + test("()\\1(?:(?=a)|a)*?", "", "aaa", 0, true, 0, 0, 0, 0); + test("()\\1(?:(?=a)()|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()\\1(?:()(?=a)|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("()\\1(?:((?=a))|a)*?", "", "aaa", 0, true, 0, 0, 0, 0, -1, -1); + test("(|a|\\2b|())*", "", "aaabbb", 0, true, 0, 0, 0, 0, -1, -1); + test("(a||\\2b|())*", "", "aaabbb", 0, true, 0, 3, 3, 3, -1, -1); + test("(a|\\2b||())*", "", "aaabbb", 0, true, 0, 3, 3, 3, -1, -1); + test("(a|\\2b|()|)*", "", "aaabbb", 0, true, 0, 6, 6, 6, 6, 6); + test("(()|a|\\3b|())*", "", "aaabbb", 0, true, 0, 0, 0, 0, 0, 0, -1, -1); + test("(a|()|\\3b|())*", "", "aaabbb", 0, true, 0, 3, 3, 3, 3, 3, -1, -1); + test("(a|\\2b|()|())*", "", "aaabbb", 0, true, 0, 6, 6, 6, 6, 6, -1, -1); + test("(a|\\3b|()|())*", "", "aaabbb", 0, true, 0, 3, 3, 3, 3, 3, -1, -1); + test("(a|()|())*", "", "aaa", 0, true, 0, 3, 3, 3, 3, 3, -1, -1); + test("^(()|a|())*$", "", "aaa", 0, true, 0, 3, 3, 3, 3, 3, -1, -1); + test("(|a|\\2b|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1); + test("(a||\\2b|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1); + test("(a|\\2b||())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1); + test("(a|\\2b|()|)*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1); + test("(()|a|\\3b|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("(a|()|\\3b|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("(a|\\2b|()|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("(a|\\3b|()|())*?", "", "aaabbb", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("(a|()|())*?", "", "aaa", 0, true, 0, 0, -1, -1, -1, -1, -1, -1); + test("^(()|a|())*?$", "", "aaa", 0, true, 0, 3, 2, 3, 2, 2, -1, -1); + 
test("((A|){7,10}?){10,17}", "", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 0, true, 0, 86, 86, 86, 86, 86); + test("(a{1,30}){1,4}", "", "a", 0, true, 0, 1, 0, 1); + test("((a|){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7); + test("((a?){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7); + test("((|a){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((a??){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((a?){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6); + test("(a|){4,6}", "", "a", 0, true, 0, 1, 1, 1); + test("(a|){4,6}", "", "aa", 0, true, 0, 2, 2, 2); + test("(a|){4,6}", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|){4,6}", "", "aaaa", 0, true, 0, 4, 4, 4); + test("(a|){4,6}", "", "aaaaa", 0, true, 0, 5, 5, 5); + test("(a|){4,6}", "", "aaaaaa", 0, true, 0, 6, 5, 6); + test("(a|){4,6}", "", "aaaaaaa", 0, true, 0, 6, 5, 6); + test("(a|){4,6}?", "", "a", 0, true, 0, 1, 1, 1); + test("(a|){4,6}?", "", "aa", 0, true, 0, 2, 2, 2); + test("(a|){4,6}?", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|){4,6}?", "", "aaaa", 0, true, 0, 4, 3, 4); + test("(a|){4,6}?", "", "aaaaa", 0, true, 0, 4, 3, 4); + test("(a|){4,6}?", "", "aaaaaa", 0, true, 0, 4, 3, 4); + test("(a|){4,6}?", "", "aaaaaaa", 0, true, 0, 4, 3, 4); + test("(a|){4,6}?a", "", "a", 0, true, 0, 1, 0, 0); + test("(a|){4,6}?a", "", "aa", 0, true, 0, 2, 1, 1); + test("(a|){4,6}?a", "", "aaa", 0, true, 0, 3, 2, 2); + test("(a|){4,6}?a", "", "aaaa", 0, true, 0, 4, 3, 3); + test("(a|){4,6}?a", "", "aaaaa", 0, true, 0, 5, 3, 4); + test("(a|){4,6}?a", "", "aaaaaa", 0, true, 0, 5, 3, 4); + test("(a|){4,6}?a", "", "aaaaaaa", 0, true, 0, 5, 3, 4); + test("(a|){4,6}?a", "", "aaaaaaaa", 0, true, 0, 5, 3, 4); + test("(|a){4,6}a", "", "a", 0, true, 0, 1, 0, 0); + test("(|a){4,6}a", "", "aa", 0, true, 0, 1, 0, 0); + test("(|a){4,6}a", "", "aaa", 0, true, 0, 1, 0, 0); + test("(|a){4,6}a", "", "aaaa", 0, true, 0, 1, 0, 0); + 
test("(|a){4,6}a", "", "aaaaa", 0, true, 0, 1, 0, 0); + test("(|a){4,6}a", "", "aaaaaa", 0, true, 0, 1, 0, 0); + test("(|a){4,6}a", "", "aaaaaaa", 0, true, 0, 1, 0, 0); + test("((a|){4,6}){4,6}", "", "a", 0, true, 0, 1, 1, 1, 1, 1); + test("((a|){4,6}){4,6}", "", "aa", 0, true, 0, 2, 2, 2, 2, 2); + test("((a|){4,6}){4,6}", "", "aaa", 0, true, 0, 3, 3, 3, 3, 3); + test("((a|){4,6}){4,6}", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4); + test("((a|){4,6}){4,6}", "", "aaaaa", 0, true, 0, 5, 5, 5, 5, 5); + test("((a|){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6); + test("((a|){4,6}){4,6}", "", "aaaaaaa", 0, true, 0, 7, 7, 7, 7, 7); + test("((a|){4,6}){4,6}", "", "aaaaaaaa", 0, true, 0, 8, 8, 8, 8, 8); + test("((a|){4,6}){4,6}", "", "aaaaaaaaa", 0, true, 0, 9, 9, 9, 9, 9); + test("((a|){4,6}){4,6}", "", "aaaaaaaaaa", 0, true, 0, 10, 10, 10, 10, 10); + test("((a|){4,6}){4,6}", "", "aaaaaaaaaaa", 0, true, 0, 11, 11, 11, 11, 11); + test("((a|){4,6}){4,6}", "", "aaaaaaaaaaaa", 0, true, 0, 12, 12, 12, 12, 12); + test("((a|){4,6}){4,6}", "", "aaaaaaaaaaaaa", 0, true, 0, 13, 13, 13, 13, 13); + test("((|a){4,6}){4,6}", "", "a", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((|a){4,6}){4,6}", "", "aaaaaaaaaaaaa", 0, true, 0, 0, 0, 0, 0, 0); + test("((a|){4,6}?){4,6}", "", "a", 0, true, 0, 1, 1, 1, 1, 
1); + test("((a|){4,6}?){4,6}", "", "aa", 0, true, 0, 2, 2, 2, 2, 2); + test("((a|){4,6}?){4,6}", "", "aaa", 0, true, 0, 3, 3, 3, 3, 3); + test("((a|){4,6}?){4,6}", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4); + test("((a|){4,6}?){4,6}", "", "aaaaa", 0, true, 0, 5, 5, 5, 5, 5); + test("((a|){4,6}?){4,6}", "", "aaaaaa", 0, true, 0, 6, 6, 6, 6, 6); + test("((a|){4,6}?){4,6}", "", "aaaaaaaa", 0, true, 0, 8, 8, 8, 8, 8); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaa", 0, true, 0, 9, 9, 9, 9, 9); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaa", 0, true, 0, 10, 10, 10, 10, 10); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaa", 0, true, 0, 11, 11, 11, 11, 11); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaa", 0, true, 0, 12, 12, 12, 12, 12); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaa", 0, true, 0, 13, 13, 13, 13, 13); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaaa", 0, true, 0, 14, 14, 14, 14, 14); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaaaa", 0, true, 0, 15, 15, 15, 15, 15); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaa", 0, true, 0, 16, 16, 16, 16, 16); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaa", 0, true, 0, 17, 17, 17, 17, 17); + test("((a|){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaaa", 0, true, 0, 18, 18, 18, 18, 18); + test("((a){4,6}?){4,6}", "", "a", 0, false); + test("((a){4,6}?){4,6}", "", "aa", 0, false); + test("((a){4,6}?){4,6}", "", "aaa", 0, false); + test("((a){4,6}?){4,6}", "", "aaaa", 0, false); + test("((a){4,6}?){4,6}", "", "aaaaa", 0, false); + test("((a){4,6}?){4,6}", "", "aaaaaa", 0, false); + test("((a){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaa", 0, true, 0, 16, 12, 16, 15, 16); + test("((a){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaa", 0, true, 0, 16, 12, 16, 15, 16); + test("((a){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaaaaa", 0, true, 0, 20, 16, 20, 19, 20); + test("((a){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 24, 20, 24, 23, 24); + test("((a){4,6}?){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 24, 20, 24, 23, 24); + 
test("((a){4,6}){4,6}", "", "a", 0, false); + test("((a){4,6}){4,6}", "", "aa", 0, false); + test("((a){4,6}){4,6}", "", "aaa", 0, false); + test("((a){4,6}){4,6}", "", "aaaa", 0, false); + test("((a){4,6}){4,6}", "", "aaaaa", 0, false); + test("((a){4,6}){4,6}", "", "aaaaaa", 0, false); + test("((a){4,6}){4,6}", "", "aaaaaaaaaaaaaaaa", 0, true, 0, 16, 12, 16, 15, 16); + test("((a){4,6}){4,6}", "", "aaaaaaaaaaaaaaaaa", 0, true, 0, 17, 13, 17, 16, 17); + test("((a){4,6}){4,6}", "", "aaaaaaaaaaaaaaaaaaaa", 0, true, 0, 20, 16, 20, 19, 20); + test("((a){4,6}){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 24, 18, 24, 23, 24); + test("((a){4,6}){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 24, 18, 24, 23, 24); + test("((a){4,}){4,6}", "", "a", 0, false); + test("((a){4,}){4,6}", "", "aa", 0, false); + test("((a){4,}){4,6}", "", "aaa", 0, false); + test("((a){4,}){4,6}", "", "aaaa", 0, false); + test("((a){4,}){4,6}", "", "aaaaa", 0, false); + test("((a){4,}){4,6}", "", "aaaaaa", 0, false); + test("((a){4,}){4,6}", "", "aaaaaaaaaaaaaaaa", 0, true, 0, 16, 12, 16, 15, 16); + test("((a){4,}){4,6}", "", "aaaaaaaaaaaaaaaaa", 0, true, 0, 17, 13, 17, 16, 17); + test("((a){4,}){4,6}", "", "aaaaaaaaaaaaaaaaaaaa", 0, true, 0, 20, 16, 20, 19, 20); + test("((a){4,}){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 24, 20, 24, 23, 24); + test("((a){4,}){4,6}", "", "aaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 25, 21, 25, 24, 25); + test("(.)\\1{2,}", "", "billiam", 0, false); + test("(^_(a{1,2}[:])*a{1,2}[:]a{1,2}([.]a{1,4})?_)+", "", "_a:a:a.aaa_", 0, true, 0, 11, 0, 11, 1, 3, 6, 10); + test("(a{2}|())+$", "", "aaaa", 0, true, 0, 4, 4, 4, 4, 4); + test("^a(b*)\\1{4,6}?", "", "abbbb", 0, true, 0, 1, 1, 1); + test("^a(b*)\\1{4,6}?", "", "abbbbb", 0, true, 0, 6, 1, 2); + test("(?<=|$)", "", "a", 0, true, 0, 0); + test("(?=ab)a", "", "ab", 0, true, 0, 1); + test("(?=()|^)|x", "", "empty", 0, true, 0, 0, 0, 0); + test("a(?<=ba)", "", "ba", 0, true, 1, 2); + 
expectSyntaxError("(?<=(?<=a)[])", "i", "", getTRegexEncoding(), "empty", 0, ErrorCode.InvalidCharacterClass); + test("\\d\\W", "i", "4\u017f", 0, true, 0, 3); + test("[\u08bc-\ucf3a]", "i", "\u03b0", 0, true, 0, 2); + test("[\u0450-\u6c50]\u7e57\u55ad()\u64e7\\d|", "i", "\u03b0\u7e57\u55ad\u64e79", 0, true, 0, 12, 8, 8); + test("(?<=(?<=a)b^c)c", "", "abcc", 0, false); + test("a(?:|()\\1){1,2}", "", "a", 0, true, 0, 1, -1, -1); + test("[a-z][a-z\u2028\u2029].|ab(?<=[a-z]w.)", "", "aac", 0, true, 0, 3); + test("(animation|animation-name)", "", "animation", 0, true, 0, 9, 0, 9); + test("(a|){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a|){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){7,7}?b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a||b){7,7}c", "", "aaabc", 0, true, 0, 5, 4, 4); + test("(a||b){7,7}c", "", "aaac", 0, true, 0, 4, 3, 3); + test("(a||b){7,7}c", "", "aaabac", 0, true, 0, 6, 5, 5); + test("($|a){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("($|a){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$){7,7}?", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$|b){7,7}", "", "aaab", 0, true, 0, 4, 4, 4); + test("(a|$|b){7,7}", "", "aaa", 0, true, 0, 3, 3, 3); + test("(a|$|b){7,7}", "", "aaaba", 0, true, 0, 5, 5, 5); + test("((?=a)|a){7,7}b", "", "aaa", 0, false); + test("((?=[ab])|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("a((?<=a)|a){7,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a|){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a|){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3); + test("(|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(|a){0,7}?b", "", "aaab", 0, true, 0, 4, 2, 3); + test("(a||b){0,7}c", "", "aaabc", 0, true, 0, 5, 4, 4); + test("(a||b){0,7}c", "", "aaac", 0, true, 0, 4, 3, 3); + test("(a||b){0,7}c", "", "aaabac", 0, true, 0, 6, 5, 5); 
+ test("((?=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 2, 3); + test("((?=[ab])|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("a((?<=a)|a){0,7}b", "", "aaab", 0, true, 0, 4, 3, 3); + test("(a*){11,11}b", "", "aaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 26, 25, 25); + test("(?:a(b{0,19})c)", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 9, 1, 8); + test("(?:a(b{0,19})c)de", "", "abbbbbbbcdebbbbbbbf", 0, true, 0, 11, 1, 8); + test("[\ud0d9](?<=\\S)", "", "\ud0d9", 0, true, 0, 3); + test("[\ud0d9](?<=\\W)", "", "\ud0d9", 0, true, 0, 3); + test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 3); + test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 3); + test("[\u8053](?<=\\S)", "", "\u8053", 0, true, 0, 3); + test("[\u8053](?<=\\W)", "", "\u8053", 0, true, 0, 3); + test("\u0895(?<=\\S)", "", "\u0895", 0, true, 0, 3); + test("\u0895(?<=\\W)", "", "\u0895", 0, true, 0, 3); + test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 12); + test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 12); + test("\u0895|[\u8053\ud0d9]+(?<=\\S\\W\\S)", "", "\ud0d9\ud0d9\ud0d9\ud0d9", 0, true, 0, 12); + test("a|[bc]+(?<=[abc][abcd][abc])", "", "bbbb", 0, true, 0, 4); + test("a(b*)*c\\1d", "", "abbbbcbbd", 0, true, 0, 9, 3, 5); + test("(|a)||b(?<=cde)|", "", "a", 0, true, 0, 0, 0, 0); + test("^(\\1)?\\D*", "", "empty", 0, true, 0, 5, -1, -1); + test("abcd(?<=d|c()d)", "", "_abcd", 0, true, 1, 5, -1, -1); + test("\\Dw\u3aa7\\A\\S(?<=\ue3b3|\\A()\\S)", "", "\udad1\udcfaw\u3aa7A\ue3b3", 0, false); + test("a(?:c|b(?=()))*", "", "abc", 0, true, 0, 3, 2, 2); + test("a(?:c|b(?=(c)))*", "", "abc", 0, true, 0, 3, 2, 3); + test("a(?:c|(?<=(a))b)*", "", "abc", 0, true, 0, 3, 0, 1); + test("(a||b){15,18}c", "", "ababaabbaaac", 0, true, 0, 12, 11, 11); + test("(a||b){15,18}?c", "", "ababaabbaaac", 0, true, 0, 12, 10, 11); + test("(?:ab|c|^){103,104}", "", 
"abcababccabccabababccabcababcccccabcababababccccabcabcabccabcabcccabababccabababcababababccababccabcababcabcabccabababccccabcab", 0, true, 0, 0); + test("((?<=a)bec)*d", "", "abecd", 0, true, 1, 5, 1, 4); + test("(|(^|\\z){2,77}?)?", "", "empty", 0, true, 0, 0, 0, 0, -1, -1); + test("a(|a{15,36}){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1); + test("a(|a{15,36}?){10,11}", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 1, 1, 1); + test("a(|a{15,36}){10,11}$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0, true, 0, 66, 66, 66); + test("a(|a{15,36}?){10,11}b$", "", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", 0, true, 0, 67, 66, 66); + test("(?:a()|b??){22,26}c", "", "aabbbaabaaaaaabaaaac", 0, true, 0, 20, 19, 19); + test("b()(a\\1|){4,4}\\2c", "", "baaaac", 0, true, 0, 6, 1, 1, 3, 4); + test("a((?=b()|)[a-d])+", "", "abbbcbd", 0, true, 0, 7, 6, 7, 6, 6); + test("a(|b){5,7}c", "", "abbbc", 0, true, 0, 5, 4, 4); + test("a(|b){5,8}c", "", "abbbc", 0, true, 0, 5, 4, 4); + test("a(|b){5,9}c", "", "abbbc", 0, true, 0, 5, 4, 4); + test("a(|b){5,}c", "", "abbbc", 0, true, 0, 5, 4, 4); + test("a((?<=a)|b){5,7}c", "", "abbbc", 0, false); + test("a((?<=a)|b){5,8}c", "", "abbbc", 0, false); + test("a((?<=a)|b){5,9}c", "", "abbbc", 0, false); + test("a((?<=a)|b){5,}c", "", "abbbc", 0, false); + test("[ab]*?\\Z(?<=[^b][ab][^b])", "", "aaaaaa", 0, true, 0, 6); + test("(?<=a(b){3,3}?)", "", "abbb", 0, true, 4, 4, 3, 4); + test("\\A(?(?:%\\h\\h|[!$&-.0-9:;=@A-Z_a-z~/])){0}((?!/)\\g++)\\z", "x", "ftp://example.com/%2Ffoo", 0, true, 0, 24, 23, 24); + + /* GENERATED CODE END - KEEP THIS MARKER FOR AUTOMATIC UPDATES */ + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexLanguage.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexLanguage.java index 30c2e15a039..e6e31667180 
100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexLanguage.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexLanguage.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -182,9 +182,16 @@ public static RegexSource createRegexSource(Source source) { String pattern = srcStr.substring(firstSlash + 1, lastSlash); String flags = srcStr.substring(lastSlash + 1); // ECMAScript-specific: the 'u' and 'v' flags change the encoding - if (optBuilder.getFlavor() == ECMAScriptFlavor.INSTANCE && !optBuilder.isUtf16ExplodeAstralSymbols() && optBuilder.getEncoding() == Encodings.UTF_16_RAW && - (flags.indexOf('u') >= 0 || flags.indexOf('v') >= 0)) { - optBuilder.encoding(Encodings.UTF_16); + if (optBuilder.getFlavor() == ECMAScriptFlavor.INSTANCE) { + if (flags.indexOf('u') >= 0 || flags.indexOf('v') >= 0) { + if (!optBuilder.isUtf16ExplodeAstralSymbols() && optBuilder.getEncoding() == Encodings.UTF_16_RAW) { + optBuilder.encoding(Encodings.UTF_16); + } + } else { + if (optBuilder.getEncoding() == Encodings.UTF_16) { + optBuilder.encoding(Encodings.UTF_16_RAW); + } + } } return new RegexSource(pattern, flags, optBuilder.build(), source); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexSyntaxException.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexSyntaxException.java index a073c74fc57..ba5f62233b2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexSyntaxException.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexSyntaxException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -52,22 +52,45 @@ @ExportLibrary(InteropLibrary.class) public final class RegexSyntaxException extends AbstractTruffleException { + public enum ErrorCode { + InvalidBackReference, + InvalidCharacterClass, + InvalidEscape, + InvalidFlag, + InvalidGroup, + InvalidInlineFlag, + InvalidLookbehind, + InvalidNamedGroup, + InvalidOption, + InvalidQuantifier, + InvalidSubexpressionCall, + UnfinishedSequence, + UnmatchedBracket, + UnmatchedParenthesis, + TRegexBailout; + + public int intValue() { + return -(ordinal() + 3); + } + } + private final SourceSection sourceSection; + private final ErrorCode errorCode; public static RegexSyntaxException createOptions(Source source, String msg, int position) { - return new RegexSyntaxException(msg, source, position); + return new RegexSyntaxException(msg, source, position, ErrorCode.InvalidOption); } - public static RegexSyntaxException createPattern(RegexSource source, String msg, int position) { - return new RegexSyntaxException(msg, patternSource(source), position); + public static RegexSyntaxException createPattern(RegexSource source, String msg, int position, ErrorCode errorCode) { + return new RegexSyntaxException(msg, patternSource(source), position, errorCode); } public static RegexSyntaxException createFlags(RegexSource source, String msg) { - return new RegexSyntaxException(msg, flagsSource(source), 0); + return new RegexSyntaxException(msg, flagsSource(source), 0, ErrorCode.InvalidFlag); } public static RegexSyntaxException createFlags(RegexSource source, String msg, int position) { - return new RegexSyntaxException(msg, flagsSource(source), position); + return new RegexSyntaxException(msg, flagsSource(source), position, ErrorCode.InvalidFlag); } @TruffleBoundary @@ -89,10 +112,22 @@ private static Source 
flagsSource(RegexSource regexSource) { } @TruffleBoundary - private RegexSyntaxException(String reason, Source src, int position) { + private RegexSyntaxException(String reason, Source src, int position, ErrorCode errorCode) { super(reason); assert position <= src.getLength(); this.sourceSection = src.createSection(position, src.getLength() - position); + this.errorCode = errorCode; + } + + @TruffleBoundary + private RegexSyntaxException(String reason, SourceSection sourceSection, ErrorCode errorCode) { + super(reason); + this.sourceSection = sourceSection; + this.errorCode = errorCode; + } + + public RegexSyntaxException withErrorCodeInMessage() { + return new RegexSyntaxException(errorCode.name() + ' ' + getMessage(), sourceSection, errorCode); } @ExportMessage @@ -112,6 +147,10 @@ SourceSection getSourceSection() { return sourceSection; } + public ErrorCode getErrorCode() { + return errorCode; + } + private static final long serialVersionUID = 1L; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/InputStringGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/InputStringGenerator.java index 10a041fcc17..8b5ccf69053 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/InputStringGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/analysis/InputStringGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -143,7 +143,7 @@ public TruffleString toTString(Random rng, TruffleString.Encoding encoding) { int iRange = rng.nextInt(codePointSet.size()); codepoints[i] = rng.nextInt(codePointSet.getLo(iRange), codePointSet.getHi(iRange) + 1); } else if (e instanceof BackRefElement backRef) { - codepoints[i] = codepoints[backRef.ref]; + codepoints[i] = codepoints[backRef.ref + nPrepended]; } } return TruffleString.fromIntArrayUTF32Uncached(codepoints).switchEncodingUncached(encoding); @@ -642,7 +642,7 @@ private void processTerm(Term term) { } afterTerm(term); } else if (term.isSubexpressionCall()) { - processGroup(ast.getGroup(term.asSubexpressionCall().getGroupNr())); + processGroup(ast.getGroup(term.asSubexpressionCall().getGroupNr()).get(0)); } else { throw CompilerDirectives.shouldNotReachHere(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ClassSetContents.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ClassSetContents.java index ab5b8dc87f0..4d8fed4d4b1 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ClassSetContents.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/ClassSetContents.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -128,6 +128,10 @@ public ClassSetContents caseFold(CodePointSetAccumulator tmp) { return new ClassSetContents(kind, CaseFoldData.simpleCaseFold(codePointSet, tmp), foldedStrings, mayContainStrings); } + public boolean isEmpty() { + return codePointSet.isEmpty() && strings.isEmpty(); + } + public EconomicSet getStrings() { return strings; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java index 5e116bed6b1..2c5cd68d001 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/Constants.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -258,7 +258,7 @@ public final class Constants { public static final CodePointSet WORD_CHARS_UNICODE_SETS_IGNORE_CASE = CaseFoldData.simpleCaseFold(WORD_CHARS, new CodePointSetAccumulator()); - public static final CodePointSet NON_WORD_CHARS_UNICODE_SETS_IGNORE_CASE = WORD_CHARS_UNICODE_SETS_IGNORE_CASE.createInverse(CaseFoldData.FOLDABLE_CHARACTERS, + public static final CodePointSet NON_WORD_CHARS_UNICODE_SETS_IGNORE_CASE = WORD_CHARS_UNICODE_SETS_IGNORE_CASE.createInverse(CaseFoldData.FOLDED_CHARACTERS, new CompilationBuffer(Encodings.UTF_16)); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/errors/PyErrorMessages.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/errors/PyErrorMessages.java index 73948c4e518..02e3005a91f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/errors/PyErrorMessages.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/errors/PyErrorMessages.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -59,6 +59,7 @@ public interface PyErrorMessages { String INLINE_FLAGS_CANNOT_USE_U_FLAG_WITH_A_BYTES_PATTERN = "bad inline flags: cannot use 'u' flag with a bytes pattern"; String INLINE_FLAGS_FLAGS_A_U_AND_L_ARE_INCOMPATIBLE = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"; String INLINE_FLAGS_FLAG_TURNED_ON_AND_OFF = "bad inline flags: flag turned on and off"; + String LOOK_BEHIND_REQUIRES_FIXED_WIDTH_PATTERN = "look-behind requires fixed-width pattern"; String MIN_REPEAT_GREATER_THAN_MAX_REPEAT = "min repeat greater than max repeat"; String MISSING_COLON = "missing :"; String MISSING_DASH_COLON_PAREN = "missing -, : or )"; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java index b54daf3f715..d6e5d2888b8 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/TRegexCompilationRequest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -278,16 +278,16 @@ TRegexExecNode.LazyCaptureGroupRegexSearchNode compileLazyDFAExecutor(TRegexExec boolean traceFinder = preCalculatedResults != null && preCalculatedResults.length > 0; final boolean trackLastGroup = ast.getOptions().getFlavor().usesLastGroupResultField(); executorNodeForward = createDFAExecutor(nfa, true, true, false, allowSimpleCG && !traceFinder && !(ast.getRoot().startsWithCaret() && !properties.hasCaptureGroups()), trackLastGroup); - final boolean createCaptureGroupTracker = !executorNodeForward.isSimpleCG() && (properties.hasCaptureGroups() || properties.hasLookAroundAssertions() || ast.getOptions().isMustAdvance()) && - !traceFinder; - if (createCaptureGroupTracker) { - executorNodeCaptureGroups = createDFAExecutor(nfa, true, false, true, false, trackLastGroup); - } if (traceFinder && preCalculatedResults.length > 1) { executorNodeBackward = createDFAExecutor(traceFinderNFA, false, false, false, false, false); } else if (!executorNodeForward.isAnchored() && !executorNodeForward.isSimpleCG() && (!traceFinder || !nfa.hasReverseUnAnchoredEntry()) && !source.getOptions().isBooleanMatch()) { executorNodeBackward = createDFAExecutor(nfa, false, false, false, allowSimpleCG && !(ast.getRoot().endsWithDollar() && !properties.hasCaptureGroups()), trackLastGroup); } + final boolean createCaptureGroupTracker = !executorNodeForward.isSimpleCG() && (executorNodeBackward == null || !executorNodeBackward.isSimpleCG()) && + (properties.hasCaptureGroups() || properties.hasLookAroundAssertions() || ast.getOptions().isMustAdvance()) && !traceFinder; + if (createCaptureGroupTracker) { + executorNodeCaptureGroups = createDFAExecutor(nfa, true, false, true, false, trackLastGroup); + } logAutomatonSizes(rootNode); return new TRegexExecNode.LazyCaptureGroupRegexSearchNode( language, source, ast.getFlags(), preCalculatedResults, @@ -367,58 +367,91 @@ public TRegexDFAExecutorNode createDFAExecutor(NFA nfaArg, 
TRegexDFAExecutorProp private void debugAST() { if (source.getOptions().isDumpAutomata()) { - Env env = RegexContext.get(null).getEnv(); - TruffleFile file = env.getPublicTruffleFile("./ast.tex"); - ASTLaTexExportVisitor.exportLatex(ast, file); - file = env.getPublicTruffleFile("ast.json"); - ast.getWrappedRoot().toJson().dump(file); + dumpAST(); } } + private void dumpAST() { + Env env = RegexContext.get(null).getEnv(); + TruffleFile file = env.getPublicTruffleFile("./ast.tex"); + ASTLaTexExportVisitor.exportLatex(ast, file); + file = env.getPublicTruffleFile("ast.json"); + ast.getWrappedRoot().toJson().dump(file); + } + private void debugPureNFA() { if (source.getOptions().isDumpAutomata()) { - Env env = RegexContext.get(null).getEnv(); - TruffleFile file = env.getPublicTruffleFile("pure_nfa.json"); - Json.obj(Json.prop("dfa", Json.obj( - Json.prop("pattern", source.toString()), - Json.prop("pureNfa", pureNFA.toJson(ast))))).dump(file); + dumpPureNFA(); + } + } + + private void dumpPureNFA() { + dumpPureNFA(pureNFA, "pure_nfa.json"); + PureNFA[] subtrees = pureNFA.getSubtrees(); + for (int i = 0; i < subtrees.length; i++) { + PureNFA subtree = subtrees[i]; + dumpPureNFA(subtree, String.format("pure_nfa_%d.json", i)); } } + private void dumpPureNFA(PureNFA subtree, String fileName) { + Env env = RegexContext.get(null).getEnv(); + TruffleFile file = env.getPublicTruffleFile(fileName); + Json.obj(Json.prop("dfa", Json.obj( + Json.prop("pattern", source.toString()), + Json.prop("pureNfa", subtree.toJson(ast))))).dump(file); + } + private void debugNFA() { if (source.getOptions().isDumpAutomata()) { - Env env = RegexContext.get(null).getEnv(); - TruffleFile file = env.getPublicTruffleFile("./nfa.gv"); - NFAExport.exportDot(nfa, file, true, false); - file = env.getPublicTruffleFile("./nfa.tex"); - NFAExport.exportLaTex(nfa, file, false, true); - file = env.getPublicTruffleFile("./nfa_reverse.gv"); - NFAExport.exportDotReverse(nfa, file, true, false); - file = 
env.getPublicTruffleFile("nfa.json"); - Json.obj(Json.prop("dfa", Json.obj(Json.prop("pattern", source.toString()), Json.prop("nfa", nfa.toJson(true))))).dump(file); + dumpNFA(); } } + private void dumpNFA() { + Env env = RegexContext.get(null).getEnv(); + TruffleFile file = env.getPublicTruffleFile("./nfa.gv"); + NFAExport.exportDot(nfa, file, true, false); + file = env.getPublicTruffleFile("./nfa.tex"); + NFAExport.exportLaTex(nfa, file, false, true); + file = env.getPublicTruffleFile("./nfa_reverse.gv"); + NFAExport.exportDotReverse(nfa, file, true, false); + file = env.getPublicTruffleFile("nfa.json"); + dumpNFAJson(file, nfa, true); + } + + private void dumpNFAJson(TruffleFile file, NFA dumpNFA, boolean forward) { + Json.obj(Json.prop("dfa", Json.obj(Json.prop("pattern", source.toString()), Json.prop("nfa", dumpNFA.toJson(forward))))).dump(file); + } + private void debugTraceFinder() { if (source.getOptions().isDumpAutomata()) { - Env env = RegexContext.get(null).getEnv(); - TruffleFile file = env.getPublicTruffleFile("./trace_finder.gv"); - NFAExport.exportDotReverse(traceFinderNFA, file, true, false); - file = env.getPublicTruffleFile("nfa_trace_finder.json"); - traceFinderNFA.toJson().dump(file); + dumpTraceFinder(); } } + private void dumpTraceFinder() { + Env env = RegexContext.get(null).getEnv(); + TruffleFile file = env.getPublicTruffleFile("./trace_finder.gv"); + NFAExport.exportDotReverse(traceFinderNFA, file, true, false); + file = env.getPublicTruffleFile("nfa_trace_finder.json"); + dumpNFAJson(file, traceFinderNFA, false); + } + private void debugDFA(DFAGenerator dfa, String debugDumpName) { if (source.getOptions().isDumpAutomata()) { - Env env = RegexContext.get(null).getEnv(); - TruffleFile file = env.getPublicTruffleFile("dfa_" + dfa.getDebugDumpName(debugDumpName) + ".gv"); - DFAExport.exportDot(dfa, file, false); - file = env.getPublicTruffleFile("dfa_" + dfa.getDebugDumpName(debugDumpName) + ".json"); - Json.obj(Json.prop("dfa", 
dfa.toJson())).dump(file); + dumpDFA(dfa, debugDumpName); } } + private static void dumpDFA(DFAGenerator dfa, String debugDumpName) { + Env env = RegexContext.get(null).getEnv(); + TruffleFile file = env.getPublicTruffleFile("dfa_" + dfa.getDebugDumpName(debugDumpName) + ".gv"); + DFAExport.exportDot(dfa, file, false); + file = env.getPublicTruffleFile("dfa_" + dfa.getDebugDumpName(debugDumpName) + ".json"); + Json.obj(Json.prop("dfa", dfa.toJson())).dump(file); + } + private static boolean shouldLogPhases() { return Loggers.LOG_PHASES.isLoggable(Level.FINER); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/LongArrayBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/LongArrayBuffer.java index c86eb395108..e2a61d2067a 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/LongArrayBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/LongArrayBuffer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -120,6 +120,15 @@ public LongArrayBuffer asFixedSizeArray(int size, int initialValue) { return this; } + public boolean contains(long value) { + for (long v : this) { + if (v == value) { + return true; + } + } + return false; + } + public long[] toArray() { return isEmpty() ? 
EmptyArrays.LONG : Arrays.copyOf(buf, length); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java index eb3a5a1eaa7..71001dc12f2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/buffer/ObjectArrayBuffer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -44,6 +44,7 @@ import java.util.Arrays; import java.util.Comparator; import java.util.Iterator; +import java.util.function.IntFunction; /** * This class is designed as a "scratchpad" for generating many Object arrays of unknown size. It @@ -151,6 +152,10 @@ public ST[] toArray(ST[] a) { return a; } + public ST[] toArray(IntFunction generator) { + return toArray(generator.apply(length)); + } + @Override public Iterator iterator() { return new ObjectBufferIterator<>(buf, length); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java index 9e3e3ce98bb..c3d98308e35 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -183,6 +183,10 @@ public DFAStateNodeBuilder[] getEntryStates() { return entryStates; } + private DFAStateNodeBuilder getMaxOffsetAnchoredInitialState() { + return entryStates[nfa.getAnchoredEntry().length - 1]; + } + private DFAStateNodeBuilder getUnanchoredInitialState() { return entryStates[nfa.getAnchoredEntry().length]; } @@ -335,6 +339,7 @@ public TRegexDFAExecutorNode createDFAExecutor() { assert states[0] == null; short[] entryStateIDs = new short[entryStates.length]; short[] cgLastTransition = isGenericCG() ? new short[entryStates.length] : null; + DFASimpleCGTransition[] initialStateSimpleCGTransitions = !isForward() && doSimpleCG ? new DFASimpleCGTransition[entryStates.length] : null; for (int i = 0; i < entryStates.length; i++) { if (entryStates[i] == null) { entryStateIDs[i] = -1; @@ -343,6 +348,8 @@ public TRegexDFAExecutorNode createDFAExecutor() { if (isGenericCG()) { DFACaptureGroupLazyTransitionBuilder lt = getLazyTransition(initialCGTransitions[i]); cgLastTransition[i] = lt.getLastTransitionIndex(); + } else if (!isForward() && doSimpleCG) { + initialStateSimpleCGTransitions[i] = DFASimpleCGTransition.create(entryStates[i].getNfaTransitionSet().getTransition(0), false); } } } @@ -350,7 +357,9 @@ public TRegexDFAExecutorNode createDFAExecutor() { assert getReplacement(getUnanchoredInitialState().getId()) instanceof DFAFindInnerLiteralStateNode; entryStateIDs = new short[]{(short) getUnanchoredInitialState().getId(), (short) getUnanchoredInitialState().getId()}; } - states[0] = new DFAInitialStateNode(entryStateIDs, cgLastTransition); + DFASimpleCG initialStateSimpleCG = initialStateSimpleCGTransitions == null ? 
null + : DFASimpleCG.create(initialStateSimpleCGTransitions, DFASimpleCGTransition.getEmptyInstance(), DFASimpleCGTransition.getEmptyInstance()); + states[0] = new DFAInitialStateNode(entryStateIDs, cgLastTransition, initialStateSimpleCG); if (TRegexOptions.TRegexEnableNodeSplitter) { states = tryMakeReducible(states); } @@ -384,14 +393,47 @@ private void createInitialStatesForward() { private void createInitialStatesBackward() { entryStates = new DFAStateNodeBuilder[]{null, null}; if (nfa.hasReverseUnAnchoredEntry()) { - entryStates[0] = createInitialState(createTransitionBuilder(createNFATransitionSet(nfa.getReverseAnchoredEntry(), nfa.getReverseUnAnchoredEntry()))); - entryStates[1] = createInitialState(createTransitionBuilder(createNFATransitionSet(nfa.getReverseUnAnchoredEntry()))); + entryStates[0] = createInitialStateBackward(nfa.getReverseAnchoredEntry(), nfa.getReverseUnAnchoredEntry()); + entryStates[1] = createInitialStateBackward(nfa.getReverseUnAnchoredEntry()); } else { - entryStates[0] = createInitialState(createTransitionBuilder(createNFATransitionSet(nfa.getReverseAnchoredEntry()))); + entryStates[0] = createInitialStateBackward(nfa.getReverseAnchoredEntry()); entryStates[1] = null; } } + private DFAStateNodeBuilder createInitialStateBackward(NFAStateTransition... entries) { + /* + * In forward mode, a DFA state consists of a set of NFA transitions, and a DFA state is + * considered final if one of its NFA transitions' targets _contains a subsequent + * transition_ to a NFA final state. + * + * In backward mode, we skip the NFA's final state transitions, so the backward DFA states + * consist of the same NFA transitions as the forward DFA states. As a consequence of this, + * backward DFA states are considered final if one of their NFA transitions' targets _is_ a + * backward NFA final state. 
+ */ + ObjectArrayBuffer transitionSet = compilationBuffer.getObjectBuffer1(); + StateSet stateSet = StateSet.create(nfa); + CodePointSet cps = null; + GroupBoundaries groupBoundaries = null; + for (NFAStateTransition entry : entries) { + // skip past reverse entry transitions (i.e. forward final transitions) + for (NFAStateTransition predecessor : entry.getTarget(false).getPredecessors()) { + if (groupBoundaries == null) { + cps = predecessor.getCodePointSet(); + groupBoundaries = predecessor.getGroupBoundaries(); + } else { + if (!predecessor.getGroupBoundaries().equals(groupBoundaries) || !predecessor.getCodePointSet().equals(cps)) { + hasAmbiguousStates = true; + } + } + stateSet.add(predecessor.getTarget(false)); + transitionSet.add(predecessor); + } + } + return createInitialState(createTransitionBuilder(new TransitionSet<>(transitionSet.toArray(NFAStateTransition[]::new), stateSet))); + } + private DFAStateNodeBuilder createInitialState(DFAStateTransitionBuilder transition) { DFAStateNodeBuilder lookup = lookupState(transition.getTransitionSet(), false); if (lookup == null) { @@ -413,15 +455,21 @@ private void expandState(DFAStateNodeBuilder state) { boolean allPrefixStateSuccessors = true; outer: for (NFAStateTransition transition : state.getNfaTransitionSet().getTransitions()) { NFAState nfaState = transition.getTarget(isForward()); - for (NFAStateTransition nfaTransition : nfaState.getSuccessors(isForward())) { - NFAState target = nfaTransition.getTarget(isForward()); - if (!target.isFinalState(isForward()) && (!state.isBackwardPrefixState() || target.hasPrefixStates())) { - anyPrefixStateSuccessors |= target.hasPrefixStates(); - allPrefixStateSuccessors &= target.hasPrefixStates(); - canonicalizer.addArgument(nfaTransition, isForward() ? 
nfaTransition.getCodePointSet() : target.getCharSet()); - } else if (isForward() && target.isUnAnchoredFinalState()) { - assert target == nfa.getReverseUnAnchoredEntry().getSource(); - break outer; + if (isForward()) { + for (NFAStateTransition nfaTransition : nfaState.getSuccessors(true)) { + NFAState target = nfaTransition.getTarget(true); + if (!target.isFinalState(true)) { + canonicalizer.addArgument(nfaTransition, nfaTransition.getCodePointSet()); + } else if (target.isUnAnchoredFinalState()) { + assert target == nfa.getReverseUnAnchoredEntry().getSource(); + break outer; + } + } + } else if (!nfaState.isFinalState(false) && (!state.isBackwardPrefixState() || nfaState.hasPrefixStates())) { + for (NFAStateTransition nfaTransition : nfaState.getSuccessors(false)) { + anyPrefixStateSuccessors |= nfaState.hasPrefixStates(); + allPrefixStateSuccessors &= nfaState.hasPrefixStates(); + canonicalizer.addArgument(nfaTransition, nfaTransition.getCodePointSet()); } } } @@ -619,19 +667,26 @@ private void tryInnerLiteralOptimization() { int literalStart = props.getInnerLiteralStart(); Sequence rootSeq = nfa.getAst().getRoot().getFirstAlternative(); - boolean prefixHasLookAhead = false; + boolean maybeOverlappingLookArounds = false; // find all parser tree nodes of the prefix StateSet prefixAstNodes = StateSet.create(nfa.getAst()); for (int i = 0; i < literalStart; i++) { Term t = rootSeq.getTerms().get(i); - prefixHasLookAhead |= t.hasLookAheads(); + maybeOverlappingLookArounds |= t.hasLookArounds(); AddToSetVisitor.addCharacterClasses(prefixAstNodes, t); } + boolean lookBehindsAfterLiteral = false; + for (int i = literalEnd; i < rootSeq.size(); i++) { + Term t = rootSeq.getTerms().get(i); + lookBehindsAfterLiteral |= t.hasLookBehinds(); + } + maybeOverlappingLookArounds |= lookBehindsAfterLiteral; // find NFA states of the prefix and the beginning and end of the literal StateSet prefixNFAStates = StateSet.create(nfa); - if (nfa.getUnAnchoredInitialState() != null) { - 
prefixNFAStates.add(nfa.getUnAnchoredInitialState()); + NFAState unAnchoredInitialState = nfa.getMaxOffsetUnAnchoredInitialState(); + if (unAnchoredInitialState != null) { + prefixNFAStates.add(unAnchoredInitialState); } NFAState literalFirstState = null; NFAState literalLastState = null; @@ -639,7 +694,7 @@ private void tryInnerLiteralOptimization() { if (s == null) { continue; } - if (!prefixHasLookAhead && !s.getStateSet().isEmpty() && prefixAstNodes.containsAll(s.getStateSet())) { + if (!maybeOverlappingLookArounds && !s.getStateSet().isEmpty() && prefixAstNodes.containsAll(s.getStateSet())) { prefixNFAStates.add(s); } if (s.getStateSet().contains(rootSeq.getTerms().get(literalStart))) { @@ -657,7 +712,7 @@ private void tryInnerLiteralOptimization() { literalLastState = s; } } - if (prefixHasLookAhead) { + if (maybeOverlappingLookArounds) { // If there are look-ahead assertions in the prefix, we cannot decide whether a // given NFA state belongs to the prefix just by its AST nodes alone, since a // look-ahead may be merged with nodes of the postfix as well. Therefore, we instead @@ -665,8 +720,9 @@ private void tryInnerLiteralOptimization() { // state to the literal's first state. ArrayList bfsCur = new ArrayList<>(prefixNFAStates); ArrayList bfsNext = new ArrayList<>(); - if (nfa.getAnchoredInitialState() != null) { - bfsCur.add(nfa.getAnchoredInitialState()); + NFAState anchoredInitialState = nfa.getMaxOffsetAnchoredInitialState(); + if (anchoredInitialState != null) { + bfsCur.add(anchoredInitialState); } while (!bfsCur.isEmpty()) { for (NFAState s : bfsCur) { @@ -725,7 +781,7 @@ private void tryInnerLiteralOptimization() { return; } - if (literalStart > 0) { + if (literalStart > 0 || lookBehindsAfterLiteral) { /* * Check if it is possible to match the literal beginning from the prefix, and bail out * if that is the case. Otherwise, the resulting DFA would produce wrong results on e.g. 
@@ -798,8 +854,8 @@ private void tryInnerLiteralOptimization() { nfa.getReverseAnchoredEntry().setSource(literalFirstState); nfa.getReverseUnAnchoredEntry().setSource(literalFirstState); assert innerLiteralPrefixMatcher == null; - innerLiteralPrefixMatcher = compilationRequest.createDFAExecutor(nfa, new TRegexDFAExecutorProperties(false, false, false, doSimpleCG, - false, rootSeq.getTerms().get(literalStart - 1).getMinPath()), "innerLiteralPrefix"); + int minResultLength = rootSeq.getTerms().get(literalStart).getMinPath() - 1; + innerLiteralPrefixMatcher = compilationRequest.createDFAExecutor(nfa, new TRegexDFAExecutorProperties(false, false, false, doSimpleCG, false, minResultLength), "innerLiteralPrefix"); innerLiteralPrefixMatcher.getProperties().setSimpleCGMustCopy(false); doSimpleCG = doSimpleCG && innerLiteralPrefixMatcher.isSimpleCG(); nfa.setInitialLoopBack(true); @@ -809,7 +865,6 @@ private void tryInnerLiteralOptimization() { executorProps.setCanFindStart(innerLiteralCanFindMatchStart(unanchoredInitialState, literalLastDFAState)); registerStateReplacement(unanchoredInitialState.getId(), new DFAFindInnerLiteralStateNode((short) unanchoredInitialState.getId(), new short[]{(short) literalLastDFAState.getId()}, nfa.getAst().extractInnerLiteral())); - } /** @@ -848,7 +903,7 @@ private boolean innerLiteralCanFindMatchStart(DFAStateNodeBuilder unanchoredInit } private boolean innerLiteralMatchesPrefix(StateSet prefixNFAStates) { - if (innerLiteralTryMatchPrefix(prefixNFAStates, entryStates[0].getNfaTransitionSet().getTargetStateSet().copy())) { + if (innerLiteralTryMatchPrefix(prefixNFAStates, getMaxOffsetAnchoredInitialState().getNfaTransitionSet().getTargetStateSet().copy())) { return true; } for (NFAState s : prefixNFAStates) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java index 
f987a860e06..52741d485c9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/dfa/DFAStateNodeBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -288,53 +288,44 @@ public void clearPreCalculatedResults() { public DFAStateNodeBuilder updateFinalStateData(DFAGenerator dfaGenerator) { boolean forward = dfaGenerator.isForward(); boolean traceFinder = dfaGenerator.getNfa().isTraceFinderNFA(); - for (NFAStateTransition t : nfaTransitionSet.getTransitions()) { - NFAState target = t.getTarget(forward); - if (target.hasTransitionToAnchoredFinalState(forward)) { - if (anchoredFinalStateTransition == null) { - if (traceFinder && isBackwardPrefixState()) { - for (NFAStateTransition t2 : target.getSuccessors(forward)) { - NFAState target2 = t2.getTarget(forward); - if (target2.isAnchoredFinalState(forward) && target2.hasPrefixStates()) { - setAnchoredFinalState(); - setAnchoredFinalStateTransition(t2); - } - } - } else { - setAnchoredFinalState(); - setAnchoredFinalStateTransition(target.getFirstTransitionToFinalState(forward)); - } + if (forward) { + for (NFAStateTransition t : nfaTransitionSet.getTransitions()) { + // In forward mode, a state is final if it contains a NFA transition to a NFA state + // that has a subsequent transition to a final state + NFAState target = t.getTarget(true); + if (target.hasTransitionToAnchoredFinalState(true) && anchoredFinalStateTransition == null) { + setAnchoredFinalState(); + setAnchoredFinalStateTransition(target.getFirstTransitionToFinalState(true)); } - } - if (target.hasTransitionToUnAnchoredFinalState(forward)) { - if (traceFinder && 
isBackwardPrefixState()) { - for (NFAStateTransition t2 : target.getSuccessors(forward)) { - NFAState target2 = t2.getTarget(forward); - if (target2.isUnAnchoredFinalState(forward) && target2.hasPrefixStates()) { - setUnAnchoredFinalState(); - setUnAnchoredFinalStateTransition(t2); - } - } - } else { + if (target.hasTransitionToUnAnchoredFinalState(true)) { setUnAnchoredFinalState(); - setUnAnchoredFinalStateTransition(target.getTransitionToUnAnchoredFinalState(forward)); - } - if (forward) { + setUnAnchoredFinalStateTransition(target.getTransitionToUnAnchoredFinalState(true)); return this; } } - if (traceFinder) { - for (NFAStateTransition t2 : target.getSuccessors(forward)) { - NFAState target2 = t2.getTarget(forward); - if (!isBackwardPrefixState() || target2.hasPrefixStates()) { - if (target2.isAnchoredFinalState(forward)) { - assert target2.hasPossibleResults() && target2.getPossibleResults().numberOfSetBits() == 1; - updatePreCalcAnchoredResult(target2.getPossibleResults().iterator().nextInt()); + } else { + for (NFAStateTransition t : nfaTransitionSet.getTransitions()) { + // In backward mode, a state is final if it contains a NFA transition to a NFA final + // state + NFAState target = t.getTarget(false); + if (target.isAnchoredFinalState(false)) { + if (!(traceFinder && isBackwardPrefixState()) || target.hasPrefixStates()) { + if (traceFinder) { + assert target.hasPossibleResults() && target.getPossibleResults().numberOfSetBits() == 1; + updatePreCalcAnchoredResult(target.getPossibleResults().iterator().nextInt()); } - if (target2.isUnAnchoredFinalState(forward)) { - assert target2.hasPossibleResults() && target2.getPossibleResults().numberOfSetBits() == 1; - updatePreCalcUnAnchoredResult(target2.getPossibleResults().iterator().nextInt()); + setAnchoredFinalState(); + setAnchoredFinalStateTransition(t); + } + } + if (target.isUnAnchoredFinalState(false)) { + if (!(traceFinder && isBackwardPrefixState()) || target.hasPrefixStates()) { + if (traceFinder) { + 
assert target.hasPossibleResults() && target.getPossibleResults().numberOfSetBits() == 1; + updatePreCalcUnAnchoredResult(target.getPossibleResults().iterator().nextInt()); } + setUnAnchoredFinalState(); + setUnAnchoredFinalStateTransition(t); } } } @@ -414,7 +405,7 @@ public String toString() { @Override public JsonValue toJson() { return Json.obj(Json.prop("id", getId()), - Json.prop("stateSet", Json.array(Arrays.stream(nfaTransitionSet.getTransitions()).map(x -> Json.val(x.getTarget().getId())))), + Json.prop("stateSet", Json.array(Arrays.stream(nfaTransitionSet.getTransitions()).map(x -> Json.val(x.getTarget(isForward()).getId())))), Json.prop("finalState", isUnAnchoredFinalState()), Json.prop("anchoredFinalState", isAnchoredFinalState()), Json.prop("transitions", Arrays.stream(getSuccessors()).map(x -> Json.val(x.getId())))); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFA.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFA.java index 321edc1f804..11ce7de2bb9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFA.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFA.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -117,10 +117,28 @@ public NFAState getUnAnchoredInitialState() { return unAnchoredEntry[0] == null ? null : unAnchoredEntry[0].getTarget(); } + public NFAState getMaxOffsetUnAnchoredInitialState() { + return getMaxOffsetInitialState(unAnchoredEntry); + } + public NFAState getAnchoredInitialState() { return anchoredEntry[0] == null ? 
null : anchoredEntry[0].getTarget(); } + public NFAState getMaxOffsetAnchoredInitialState() { + return getMaxOffsetInitialState(anchoredEntry); + } + + private static NFAState getMaxOffsetInitialState(NFAStateTransition[] entries) { + NFAState ret = null; + for (NFAStateTransition t : entries) { + if (t != null) { + ret = t.getTarget(); + } + } + return ret; + } + public boolean hasReverseUnAnchoredEntry() { return reverseUnAnchoredEntry != null && reverseUnAnchoredEntry.getSource().getPredecessors().length > 0; } @@ -264,8 +282,8 @@ public void setInitialLoopBack(boolean enable) { public boolean isFixedCodePointWidth() { boolean fixedCodePointWidth = true; - for (NFAState state : states) { - if (state != null && !ast.getEncoding().isFixedCodePointWidth(state.getCharSet())) { + for (NFAStateTransition transition : transitions) { + if (transition != null && !transition.getTarget().isFinalState() && !ast.getEncoding().isFixedCodePointWidth(transition.getCodePointSet())) { fixedCodePointWidth = false; break; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java index 985bf8e8c53..63d005ac183 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -50,6 +50,8 @@ import java.util.Map; import java.util.Objects; +import org.graalvm.collections.EconomicMap; + import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.automaton.StateSet; @@ -57,6 +59,7 @@ import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.Counter; import com.oracle.truffle.regex.tregex.parser.ast.CharacterClass; +import com.oracle.truffle.regex.tregex.parser.ast.GroupBoundaries; import com.oracle.truffle.regex.tregex.parser.ast.LookBehindAssertion; import com.oracle.truffle.regex.tregex.parser.ast.MatchFound; import com.oracle.truffle.regex.tregex.parser.ast.PositionAssertion; @@ -65,7 +68,6 @@ import com.oracle.truffle.regex.tregex.parser.ast.Sequence; import com.oracle.truffle.regex.tregex.parser.ast.Term; import com.oracle.truffle.regex.util.TBitSet; -import org.graalvm.collections.EconomicMap; public final class NFAGenerator { @@ -75,6 +77,7 @@ public final class NFAGenerator { private final NFAState dummyInitialState; private final NFAState[] anchoredInitialStates; private final NFAState[] initialStates; + private NFAState checkFinalTransitionState; /** * These are like {@link #initialStates}, but with {@code mustAdvance} set to {@code false}, * i.e. we have already advanced when we are in these states. 
In a regular expression with @@ -96,6 +99,7 @@ public final class NFAGenerator { private final ASTTransitionCanonicalizer astTransitionCanonicalizer; private final TBitSet transitionGBUpdateIndices; private final TBitSet transitionGBClearIndices; + private int lastGroup = -1; private final ArrayList transitionsBuffer = new ArrayList<>(); private final CompilationBuffer compilationBuffer; @@ -106,15 +110,15 @@ private NFAGenerator(RegexAST ast, CompilationBuffer compilationBuffer) { this.transitionGBClearIndices = new TBitSet(ast.getNumberOfCaptureGroups() * 2); this.astTransitionCanonicalizer = new ASTTransitionCanonicalizer(ast, true, false); this.compilationBuffer = compilationBuffer; - dummyInitialState = new NFAState((short) stateID.inc(), StateSet.create(ast, ast.getWrappedRoot()), CodePointSet.getEmpty(), Collections.emptySet(), false, ast.getOptions().isMustAdvance()); + dummyInitialState = new NFAState((short) stateID.inc(), StateSet.create(ast, ast.getWrappedRoot()), Collections.emptySet(), false, ast.getOptions().isMustAdvance()); nfaStates.put(NFAStateID.create(dummyInitialState), dummyInitialState); anchoredFinalState = createFinalState(StateSet.create(ast, ast.getRoot().getSubTreeParent().getAnchoredFinalState()), false); anchoredFinalState.setAnchoredFinalState(); finalState = createFinalState(StateSet.create(ast, ast.getRoot().getSubTreeParent().getMatchFound()), false); finalState.setUnAnchoredFinalState(); assert transitionGBUpdateIndices.isEmpty() && transitionGBClearIndices.isEmpty(); - anchoredReverseEntry = createTransition(anchoredFinalState, dummyInitialState, ast.getEncoding().getFullSet(), -1); - unAnchoredReverseEntry = createTransition(finalState, dummyInitialState, ast.getEncoding().getFullSet(), -1); + anchoredReverseEntry = createTransition(anchoredFinalState, dummyInitialState, ast.getEncoding().getFullSet()); + unAnchoredReverseEntry = createTransition(finalState, dummyInitialState, ast.getEncoding().getFullSet()); int nEntries = 
ast.getWrappedPrefixLength() + 1; initialStates = new NFAState[nEntries]; advancedInitialState = ast.getOptions().isMustAdvance() ? createFinalState(StateSet.create(ast, ast.getNFAUnAnchoredInitialState(0)), false) : null; @@ -122,7 +126,7 @@ private NFAGenerator(RegexAST ast, CompilationBuffer compilationBuffer) { for (int i = 0; i < initialStates.length; i++) { initialStates[i] = createFinalState(StateSet.create(ast, ast.getNFAUnAnchoredInitialState(i)), ast.getOptions().isMustAdvance()); initialStates[i].setUnAnchoredInitialState(true); - unAnchoredEntries[i] = createTransition(dummyInitialState, initialStates[i], ast.getEncoding().getFullSet(), -1); + unAnchoredEntries[i] = createTransition(dummyInitialState, initialStates[i], ast.getEncoding().getFullSet()); if (i > 0) { initialStates[i].setHasPrefixStates(true); } @@ -141,7 +145,7 @@ private NFAGenerator(RegexAST ast, CompilationBuffer compilationBuffer) { if (i > 0) { initialStates[i].setHasPrefixStates(true); } - anchoredEntries[i] = createTransition(dummyInitialState, anchoredInitialStates[i], ast.getEncoding().getFullSet(), -1); + anchoredEntries[i] = createTransition(dummyInitialState, anchoredInitialStates[i], ast.getEncoding().getFullSet()); } NFAStateTransition[] dummyInitNext = Arrays.copyOf(anchoredEntries, nEntries * 2); System.arraycopy(unAnchoredEntries, 0, dummyInitNext, nEntries, nEntries); @@ -151,6 +155,14 @@ private NFAGenerator(RegexAST ast, CompilationBuffer compilationBuffer) { dummyInitialState.setPredecessors(dummyInitPrev); } + private NFAState getFinalCheckedTransitionState() { + if (checkFinalTransitionState == null) { + checkFinalTransitionState = createFinalState(StateSet.create(ast, ast.getRoot().getSubTreeParent().getMatchFoundChecked()), ast.getOptions().isMustAdvance()); + checkFinalTransitionState.setSuccessors(new NFAStateTransition[]{createNoCGTransition(checkFinalTransitionState, finalState, ast.getEncoding().getFullSet())}, true); + } + return checkFinalTransitionState; + 
} + public static NFA createNFA(RegexAST ast, CompilationBuffer compilationBuffer) { return new NFAGenerator(ast, compilationBuffer).doCreateNFA(); } @@ -174,9 +186,9 @@ private NFA doCreateNFA() { } if (ast.getOptions().isMustAdvance()) { addNewLoopBackTransition(initialStates[0], advancedInitialState); - initialLoopBack = createTransition(advancedInitialState, advancedInitialState, ast.getEncoding().getFullSet(), -1); + initialLoopBack = createTransition(advancedInitialState, advancedInitialState, ast.getEncoding().getFullSet()); } else { - initialLoopBack = createTransition(initialStates[0], initialStates[0], ast.getEncoding().getFullSet(), -1); + initialLoopBack = createTransition(initialStates[0], initialStates[0], ast.getEncoding().getFullSet()); } for (NFAState s : nfaStates.values()) { @@ -262,17 +274,19 @@ private NFAStateTransition[] createNFATransitions(NFAState sourceState, ASTStep boolean containsPositionAssertion = false; boolean containsMatchFound = false; boolean containsPrefixStates = false; - int lastGroup = -1; + boolean allCCInLookBehind = true; EconomicMap matchedConditionGroupsMap = ast.getProperties().hasConditionalBackReferences() ? 
EconomicMap.create() : null; for (ASTTransition astTransition : mergeBuilder.getTransitionSet().getTransitions()) { Term target = astTransition.getTarget(); + boolean inLookBehindAssertion = target.isInLookBehindAssertion(); if (target instanceof CharacterClass) { if (stateSetCC == null) { stateSetCC = StateSet.create(ast); finishedLookBehinds = StateSet.create(ast); } stateSetCC.add((CharacterClass) target); - if (target.isInLookBehindAssertion() && target == ((Sequence) target.getParent()).getLastTerm()) { + allCCInLookBehind &= inLookBehindAssertion; + if (inLookBehindAssertion && target == ((Sequence) target.getParent()).getLastTerm()) { finishedLookBehinds.add((LookBehindAssertion) target.getSubTreeParent()); } } else if (target instanceof PositionAssertion) { @@ -283,7 +297,7 @@ private NFAStateTransition[] createNFATransitions(NFAState sourceState, ASTStep } containsPrefixStates |= target.isPrefix(); astTransition.getGroupBoundaries().updateBitSets(transitionGBUpdateIndices, transitionGBClearIndices); - if (!target.isInLookAheadAssertion() && !target.isInLookBehindAssertion()) { + if (!target.isInLookAroundAssertion()) { lastGroup = astTransition.getGroupBoundaries().getLastGroup(); } if (ast.getProperties().hasConditionalBackReferences()) { @@ -291,45 +305,70 @@ private NFAStateTransition[] createNFATransitions(NFAState sourceState, ASTStep } } if (!(sourceState.isMustAdvance() && transitionGBUpdateIndices.get(0) && transitionGBUpdateIndices.get(1))) { - if (stateSetCC == null) { - if (containsPositionAssertion) { - transitionsBuffer.add(createTransition(sourceState, anchoredFinalState, ast.getEncoding().getFullSet(), lastGroup)); - } else if (containsMatchFound) { - transitionsBuffer.add(createTransition(sourceState, finalState, ast.getEncoding().getFullSet(), lastGroup)); - // Transitions dominated by a transition to a final state will never end - // up being used and so we can skip generating them and return the - // current list of transitions. 
- transitionGBUpdateIndices.clear(); - transitionGBClearIndices.clear(); - return transitionsBuffer.toArray(new NFAStateTransition[transitionsBuffer.size()]); + if (containsPositionAssertion) { + if (stateSetCC == null || allCCInLookBehind) { + transitionsBuffer.add(createTransition(sourceState, anchoredFinalState, ast.getEncoding().getFullSet())); + } + } else if (stateSetCC == null) { + if (containsMatchFound) { + if (mergeBuilder.getCodePointSet().matchesEverything(ast.getEncoding())) { + transitionsBuffer.add(createTransition(sourceState, finalState, ast.getEncoding().getFullSet())); + // Transitions dominated by a transition to a final state will never + // end up being used, so we can skip generating them and return the + // current list of transitions. + clearGroupBoundaries(); + return transitionsBuffer.toArray(new NFAStateTransition[transitionsBuffer.size()]); + } + // This case is only reachable when merging a lookbehind with an empty + // transition to the final state. + // The issue is that the priority between transitions after running the + // canonicalizer is lost, but that doesn't matter since + // they have disjoint code point sets. But when the transition reaches + // the final state, then its cps is not checked. + // In that case we use this special checkFinalTransitionState, which + // will still check that last code point set matches. + transitionsBuffer.add(createTransition(sourceState, getFinalCheckedTransitionState(), mergeBuilder.getCodePointSet())); + } + } else { + if (containsMatchFound && allCCInLookBehind) { + // possible when a not fully matched lookbehind is still being tracked + // when the main expression already reached a final state. 
+ transitionsBuffer.add(createTransition(sourceState, finalState, ast.getEncoding().getFullSet())); } - } else if (!containsPositionAssertion) { assert mergeBuilder.getCodePointSet().matchesSomething(); - NFAState targetState = registerMatcherState(stateSetCC, mergeBuilder.getCodePointSet(), finishedLookBehinds, containsPrefixStates, + NFAState targetState = registerMatcherState(stateSetCC, finishedLookBehinds, containsPrefixStates, sourceState.isMustAdvance() && !ast.getHardPrefixNodes().isDisjoint(stateSetCC), matchedConditionGroupsMap); - transitionsBuffer.add(createTransition(sourceState, targetState, mergeBuilder.getCodePointSet(), lastGroup)); + transitionsBuffer.add(createTransition(sourceState, targetState, mergeBuilder.getCodePointSet())); } } - transitionGBUpdateIndices.clear(); - transitionGBClearIndices.clear(); + clearGroupBoundaries(); } } return transitionsBuffer.toArray(new NFAStateTransition[transitionsBuffer.size()]); } + private void clearGroupBoundaries() { + transitionGBUpdateIndices.clear(); + transitionGBClearIndices.clear(); + lastGroup = -1; + } + private NFAState createFinalState(StateSet stateSet, boolean mustAdvance) { - NFAState state = new NFAState((short) stateID.inc(), stateSet, ast.getEncoding().getFullSet(), Collections.emptySet(), false, mustAdvance); + NFAState state = new NFAState((short) stateID.inc(), stateSet, Collections.emptySet(), false, mustAdvance); assert !nfaStates.containsKey(NFAStateID.create(state)); nfaStates.put(NFAStateID.create(state), state); return state; } - private NFAStateTransition createTransition(NFAState source, NFAState target, CodePointSet codePointSet, int lastGroup) { - return new NFAStateTransition((short) transitionID.inc(), source, target, codePointSet, ast.createGroupBoundaries(transitionGBUpdateIndices, transitionGBClearIndices, lastGroup)); + private NFAStateTransition createTransition(NFAState source, NFAState target, CodePointSet codePointSet) { + return new NFAStateTransition((short) 
transitionID.inc(), source, target, codePointSet, ast.createGroupBoundaries(transitionGBUpdateIndices, transitionGBClearIndices, -1, lastGroup)); + } + + private NFAStateTransition createNoCGTransition(NFAState source, NFAState target, CodePointSet codePointSet) { + return new NFAStateTransition((short) transitionID.inc(), source, target, codePointSet, GroupBoundaries.getEmptyInstance(ast.getLanguage())); } private NFAState registerMatcherState(StateSet stateSetCC, - CodePointSet matcherBuilder, StateSet finishedLookBehinds, boolean containsPrefixStates, boolean mustAdvance, @@ -338,7 +377,7 @@ private NFAState registerMatcherState(StateSet stateSe if (nfaStates.containsKey(nfaStateID)) { return nfaStates.get(nfaStateID); } else { - NFAState state = new NFAState((short) stateID.inc(), stateSetCC, matcherBuilder, finishedLookBehinds, containsPrefixStates, mustAdvance, matchedConditionGroupsMap); + NFAState state = new NFAState((short) stateID.inc(), stateSetCC, finishedLookBehinds, containsPrefixStates, mustAdvance, matchedConditionGroupsMap); expansionQueue.push(state); nfaStates.put(nfaStateID, state); return state; @@ -346,7 +385,7 @@ private NFAState registerMatcherState(StateSet stateSe } private void addNewLoopBackTransition(NFAState source, NFAState target) { - source.addLoopBackNext(createTransition(source, target, ast.getEncoding().getFullSet(), -1)); + source.addLoopBackNext(createTransition(source, target, ast.getEncoding().getFullSet())); if (ast.getHardPrefixNodes().isDisjoint(source.getStateSet()) || ast.getFlags().isSticky()) { target.incPredecessors(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAState.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAState.java index 2d3856c8c4d..2ee28a95ed0 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAState.java +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFAState.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -46,9 +46,10 @@ import java.util.Set; import java.util.stream.Collectors; +import org.graalvm.collections.EconomicMap; + import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.automaton.BasicState; import com.oracle.truffle.regex.tregex.automaton.StateSet; @@ -60,7 +61,6 @@ import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; import com.oracle.truffle.regex.tregex.util.json.JsonObject; import com.oracle.truffle.regex.util.TBitSet; -import org.graalvm.collections.EconomicMap; /** * Represents a single state in the NFA form of a regular expression. 
States may either be matcher @@ -85,27 +85,24 @@ public final class NFAState extends BasicState imp @CompilationFinal private short revTransitionToAnchoredFinalState = -1; @CompilationFinal private short revTransitionToUnAnchoredFinalState = -1; private TBitSet possibleResults; - private final CodePointSet matcherBuilder; private final Set finishedLookBehinds; private final EconomicMap matchedConditionGroupsMap; public NFAState(short id, StateSet stateSet, - CodePointSet matcherBuilder, Set finishedLookBehinds, boolean hasPrefixStates, boolean mustAdvance) { - this(id, stateSet, initFlags(hasPrefixStates, mustAdvance), null, matcherBuilder, finishedLookBehinds, initMatchedConditionGroupsMap(stateSet)); + this(id, stateSet, initFlags(hasPrefixStates, mustAdvance), null, finishedLookBehinds, initMatchedConditionGroupsMap(stateSet)); } public NFAState(short id, StateSet stateSet, - CodePointSet matcherBuilder, Set finishedLookBehinds, boolean hasPrefixStates, boolean mustAdvance, EconomicMap matchedConditionGroupsMap) { - this(id, stateSet, initFlags(hasPrefixStates, mustAdvance), null, matcherBuilder, finishedLookBehinds, matchedConditionGroupsMap); + this(id, stateSet, initFlags(hasPrefixStates, mustAdvance), null, finishedLookBehinds, matchedConditionGroupsMap); } private static EconomicMap initMatchedConditionGroupsMap(StateSet stateSet) { @@ -126,34 +123,27 @@ private static byte initFlags(boolean hasPrefixStates, boolean mustAdvance) { private NFAState(short id, StateSet stateSet, short flags, - CodePointSet matcherBuilder, Set finishedLookBehinds, EconomicMap matchedConditionGroupsMap) { - this(id, stateSet, flags, null, matcherBuilder, finishedLookBehinds, matchedConditionGroupsMap); + this(id, stateSet, flags, null, finishedLookBehinds, matchedConditionGroupsMap); } private NFAState(short id, StateSet stateSet, short flags, TBitSet possibleResults, - CodePointSet matcherBuilder, Set finishedLookBehinds, EconomicMap matchedConditionGroupsMap) { super(id, 
EMPTY_TRANSITIONS); setFlag(flags); this.stateSet = stateSet; this.possibleResults = possibleResults; - this.matcherBuilder = matcherBuilder; this.finishedLookBehinds = finishedLookBehinds; this.matchedConditionGroupsMap = matchedConditionGroupsMap; } public NFAState createTraceFinderCopy(short copyID) { - return new NFAState(copyID, getStateSet(), getFlags(), matcherBuilder, finishedLookBehinds, matchedConditionGroupsMap); - } - - public CodePointSet getCharSet() { - return matcherBuilder; + return new NFAState(copyID, getStateSet(), getFlags(), finishedLookBehinds, matchedConditionGroupsMap); } public Set getFinishedLookBehinds() { @@ -366,7 +356,6 @@ public NFAState(NFAState original) { this.revTransitionToAnchoredFinalState = original.revTransitionToAnchoredFinalState; this.revTransitionToUnAnchoredFinalState = original.revTransitionToUnAnchoredFinalState; this.possibleResults = original.possibleResults; - this.matcherBuilder = original.matcherBuilder; this.finishedLookBehinds = original.finishedLookBehinds; this.matchedConditionGroupsMap = original.matchedConditionGroupsMap; } @@ -409,7 +398,6 @@ public JsonObject toJson() { Json.prop("stateSet", getStateSet().stream().map(x -> Json.val(x.getId()))), Json.prop("mustAdvance", isMustAdvance()), Json.prop("sourceSections", sourceSectionsToJson()), - Json.prop("matcherBuilder", matcherBuilder.toString()), Json.prop("forwardAnchoredFinalState", isAnchoredFinalState()), Json.prop("forwardUnAnchoredFinalState", isUnAnchoredFinalState()), Json.prop("reverseAnchoredFinalState", isAnchoredInitialState()), @@ -422,9 +410,9 @@ public JsonObject toJson() { public JsonObject toJson(boolean forward) { return Json.obj(Json.prop("id", getId()), Json.prop("stateSet", getStateSet().stream().map(x -> Json.val(x.getId()))), + Json.prop("matcherBuilder", Arrays.stream(getPredecessors()).findFirst().map(t -> t.getCodePointSet().toString()).orElse("")), Json.prop("mustAdvance", isMustAdvance()), Json.prop("sourceSections", 
sourceSectionsToJson()), - Json.prop("matcherBuilder", matcherBuilder.toString()), Json.prop("anchoredFinalState", isAnchoredFinalState(forward)), Json.prop("unAnchoredFinalState", isUnAnchoredFinalState(forward)), Json.prop("transitions", Arrays.stream(getSuccessors(forward)).map(x -> Json.val(x.getId())))); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFATraceFinderGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFATraceFinderGenerator.java index 0ede6998d9c..c1e108ade93 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFATraceFinderGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/NFATraceFinderGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -241,21 +241,23 @@ private NFA run() { final NFAStateTransition pathTransition = graphPath.get(i).getTransition(); NFAState copy = copy(pathTransition.getTarget(), resultID); createTransition(lastCopied, copy, pathTransition, result, iResult); - iResult += getEncodedSize(copy); + iResult += getEncodedSize(pathTransition); lastCopied = copy; } // link the copied path to the existing tree createTransition(lastCopied, duplicate, curElement.getTransition(), result, iResult); // traverse the existing tree to the root to complete the pre-calculated // result. 
+ NFAStateTransition parentTransition = curElement.getTransition(); NFAState treeNode = duplicate; while (!treeNode.isFinalState()) { - iResult += getEncodedSize(treeNode); + iResult += getEncodedSize(parentTransition); assert treeNode.getSuccessors().length == 1; + parentTransition = treeNode.getSuccessors()[0]; treeNode.addPossibleResult(resultID); - GroupBoundaries groupBoundaries = treeNode.getSuccessors()[0].getGroupBoundaries(); + GroupBoundaries groupBoundaries = parentTransition.getGroupBoundaries(); groupBoundaries.applyToResultFactory(result, iResult, trackLastGroup); - treeNode = treeNode.getSuccessors()[0].getTarget(); + treeNode = parentTransition.getTarget(); } treeNode.addPossibleResult(resultID); result.setLength(iResult); @@ -331,9 +333,9 @@ private void registerCopy(NFAState original, NFAState copy) { assert states.get(copy.getId()) == copy; } - private int getEncodedSize(NFAState s) { + private int getEncodedSize(NFAStateTransition t) { Encoding encoding = originalNFA.getAst().getEncoding(); - assert encoding.isFixedCodePointWidth(s.getCharSet()); - return encoding.getEncodedSize(s.getCharSet().getMin()); + assert encoding.isFixedCodePointWidth(t.getCodePointSet()); + return encoding.getEncodedSize(t.getCodePointSet().getMin()); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFA.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFA.java index 3323ef3c518..c554ee4851e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFA.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFA.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -63,6 +63,7 @@ public final class PureNFA implements StateIndex { private static final PureNFA[] NO_SUBTREES = {}; private final int globalSubTreeId; private final int subTreeId; + private final int fixedWidth; @CompilationFinal(dimensions = 1) private final PureNFAState[] states; @CompilationFinal(dimensions = 1) private final PureNFATransition[] transitions; @CompilationFinal(dimensions = 1) private final PureNFA[] subtrees; @@ -73,9 +74,14 @@ public PureNFA(RegexASTSubtreeRootNode astSubRoot, Counter.ThresholdCounter transitionIDCounter) { this.globalSubTreeId = astSubRoot.getGlobalSubTreeId(); this.subTreeId = astSubRoot.getSubTreeId(); + if (astSubRoot.isFixedWidth()) { + this.fixedWidth = astSubRoot.getGroup().getMinPath(); + } else { + this.fixedWidth = -1; + } this.states = new PureNFAState[stateIDCounter.getCount()]; this.transitions = new PureNFATransition[transitionIDCounter.getCount()]; - this.subtrees = astSubRoot.getSubtrees().size() == 0 ? NO_SUBTREES : new PureNFA[astSubRoot.getSubtrees().size()]; + this.subtrees = astSubRoot.getSubtrees().isEmpty() ? NO_SUBTREES : new PureNFA[astSubRoot.getSubtrees().size()]; for (PureNFAState s : states) { if (s == null) { continue; @@ -109,6 +115,14 @@ public RegexASTSubtreeRootNode getASTSubtree(RegexAST ast) { return isRoot() ? ast.getRoot().getSubTreeParent() : ast.getSubtrees().get(globalSubTreeId); } + public boolean isFixedWidth() { + return fixedWidth >= 0; + } + + public int getFixedWidth() { + return fixedWidth; + } + /** * Get this NFA's "dummy initial state". 
Since {@link DFAGenerator} works on sets of NFA * transitions, we need pseudo-transitions to the NFA's initial states as entry points for the diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java index 5b82f191364..77e0322c32a 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFAGenerator.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -131,13 +131,14 @@ private PureNFA createNFA(RegexASTSubtreeRootNode root) { Arrays.fill(nfaStates, null); stateID.reset(); transitionID.reset(); - transitionGen.setReverse(root.isLookBehindAssertion()); + boolean createReverseNFA = root.isLookBehindAssertion() && !(ast.getFlavor().lookBehindsRunLeftToRight() && root.isFixedWidth()); + transitionGen.setReverse(createReverseNFA); PureNFAState dummyInitialState = new PureNFAState(stateID.inc(), ast.getWrappedRoot()); nfaStates[ast.getWrappedRoot().getId()] = dummyInitialState; assert dummyInitialState.getId() == 0; - if (root.isLookBehindAssertion()) { + if (createReverseNFA) { if (root.hasCaret()) { anchoredFinalState = createFinalState(root.getAnchoredInitialState(), false); anchoredFinalState.setAnchoredFinalState(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFATransition.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFATransition.java index ecb488f4ec9..e637490ecab 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFATransition.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/PureNFATransition.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -147,6 +147,6 @@ public JsonValue toJson(RegexAST ast) { Json.prop("target", target.getId()), Json.prop("groupBoundaries", groupBoundaries), Json.prop("sourceSections", groupBoundaries.indexUpdateSourceSectionsToJson(ast)), - Json.prop("guards", Arrays.stream(guards).mapToObj(TransitionGuard::toJson))); + Json.prop("guards", guards.length == 0 ? Json.array(Json.val("no guards")) : Json.array(Arrays.stream(guards).mapToObj(TransitionGuard::toJson)))); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/TransitionGuard.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/TransitionGuard.java index 4cfcb51369c..c943dde14d5 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/TransitionGuard.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nfa/TransitionGuard.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -59,26 +59,33 @@ public final class TransitionGuard { public enum Kind { /** - * Transition represents a back-edge in the quantifier loop. Check if the loop count is - * below {@link Quantifier#getMax()}, then increase loop count. + * Increment loop count. 
*/ - loop, + countInc, /** - * Transition represents either a first entry into a quantified expression, or a back-edge - * in a quantifier loop without upper bound, i.e. quantifiers where - * {@link Quantifier#isInfiniteLoop()} is {@code true}. Just increase the loop count. + * Set loop count to 1. */ - loopInc, + countSet1, /** - * Transition is leaving a quantified expression. Check if the loop count is above - * {@link Quantifier#getMin()}, then reset the loop count. + * Set loop count to the quantifier's minimum number of iterations + 1. The extra iteration + * is added because this guard is executed when entering the optional part of a split + * quantifier, i.e. this guard represents a counter value initialization to {@code min} with + * an immediate increment, analogous to how {@link #countSet1} represents a counter value + * initialization to 0 followed by an immediate increment. */ - exit, + countSetMin, /** - * Transition is leaving a quantified expression without lower bound, i.e. quantifiers where - * {@link Quantifier#getMin()} {@code == 0}. Just reset the loop count. + * Check if the loop count is less than {@link Quantifier#getMin()}. */ - exitReset, + countLtMin, + /** + * Check if the loop count is greater or equal to {@link Quantifier#getMin()}. + */ + countGeMin, + /** + * Check if the loop count is less than {@link Quantifier#getMax()}. + */ + countLtMax, /** * Transition is entering a quantified expression that may match the empty string. Save the * current index. @@ -122,32 +129,64 @@ public enum Kind { * {@link ConditionalBackReferenceGroup}. The capture group identified by * {@link #getGroupNumber(long)} must be *not* matched in order to proceed. 
*/ - checkGroupNotMatched + checkGroupNotMatched, } @CompilationFinal(dimensions = 1) private static final Kind[] KIND_VALUES = Arrays.copyOf(Kind.values(), Kind.values().length); - private static final EnumSet QUANTIFIER_GUARDS = EnumSet.of(Kind.loop, Kind.loopInc, Kind.exit, Kind.exitReset); + private static final EnumSet QUANTIFIER_GUARDS = EnumSet.of(Kind.countInc, Kind.countSet1, Kind.countSetMin, Kind.countLtMin, Kind.countGeMin, Kind.countLtMax); private static final EnumSet ZERO_WIDTH_QUANTIFIER_GUARDS = EnumSet.of(Kind.enterZeroWidth, Kind.exitZeroWidth, Kind.escapeZeroWidth); private static final EnumSet GROUP_NUMBER_GUARDS = EnumSet.of(Kind.updateRecursiveBackrefPointer, Kind.checkGroupMatched, Kind.checkGroupNotMatched); private static final EnumSet GROUP_BOUNDARY_INDEX_GUARDS = EnumSet.of(Kind.updateCG); public static final long[] NO_GUARDS = {}; - public static long createLoop(Quantifier quantifier) { - return create(Kind.loop, quantifier); + public static long createCountInc(Quantifier quantifier) { + return create(Kind.countInc, quantifier); + } + + public static long createCountInc(int quantifierIndex) { + return create(Kind.countInc, quantifierIndex); + } + + public static long createCountSet1(Quantifier quantifier) { + return create(Kind.countSet1, quantifier); + } + + public static long createCountSet1(int quantifierIndex) { + return create(Kind.countSet1, quantifierIndex); + } + + public static long createCountSetMin(Quantifier quantifier) { + return create(Kind.countSetMin, quantifier); + } + + public static long createCountSetMin(int quantifierIndex) { + return create(Kind.countSetMin, quantifierIndex); } - public static long createLoopInc(Quantifier quantifier) { - return create(Kind.loopInc, quantifier); + public static long createCountLtMin(Quantifier quantifier) { + return create(Kind.countLtMin, quantifier); } - public static long createExit(Quantifier quantifier) { - return create(Kind.exit, quantifier); + public static long 
createCountLtMin(int quantifierIndex) { + return create(Kind.countLtMin, quantifierIndex); } - public static long createExitReset(Quantifier quantifier) { - return create(Kind.exitReset, quantifier); + public static long createCountGeMin(Quantifier quantifier) { + return create(Kind.countGeMin, quantifier); + } + + public static long createCountGeMin(int quantifierIndex) { + return create(Kind.countGeMin, quantifierIndex); + } + + public static long createCountLtMax(Quantifier quantifier) { + return create(Kind.countLtMax, quantifier); + } + + public static long createCountLtMax(int quantifierIndex) { + return create(Kind.countLtMax, quantifierIndex); } public static long createEnterZeroWidth(Quantifier quantifier) { @@ -167,6 +206,11 @@ public static long createEscapeZeroWidth(Quantifier quantifier) { return createZeroWidth(Kind.escapeZeroWidth, quantifier); } + public static long createEscapeZeroWidthFromEnter(long guard) { + assert is(guard, Kind.enterZeroWidth); + return create(Kind.escapeZeroWidth, getZeroWidthQuantifierIndex(guard)); + } + public static long createUpdateCG(int index) { return create(Kind.updateCG, index); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecNode.java index 3b786c30271..f2bded8a84e 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/TRegexExecNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -64,6 +64,7 @@ import com.oracle.truffle.regex.tregex.TRegexCompiler; import com.oracle.truffle.regex.tregex.nfa.NFA; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexDFAExecutorNode; +import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexLazyBackwardSimpleCGRootNode; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexLazyCaptureGroupsRootNode; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexLazyFindStartRootNode; import com.oracle.truffle.regex.tregex.nodes.dfa.TRegexTraceFinderRootNode; @@ -441,6 +442,8 @@ public LazyCaptureGroupRegexSearchNode(RegexLanguage language, final RegexBodyNode bodyNode; if (preCalculatedResults != null) { bodyNode = new TRegexTraceFinderRootNode(language, source, preCalculatedResults, backwardNode); + } else if (getBackwardExecutor().isSimpleCG()) { + bodyNode = new TRegexLazyBackwardSimpleCGRootNode(language, source, backwardNode); } else { bodyNode = new TRegexLazyFindStartRootNode(language, source, backwardNode, captureGroupNode == null); } @@ -498,17 +501,18 @@ private RegexResult executeForward(VirtualFrame frame, TruffleString input, int return preCalculatedResults[0].createFromEnd((int) end); } if (preCalculatedResults == null && captureGroupEntryNode == null) { - if (end == fromIndex) { // zero-length match + if ((backwardCallTarget == null || getForwardExecutor().getNumberOfCaptureGroups() == 1) && end == fromIndex) { + // zero-length match return RegexResult.create((int) end, (int) end); } if (getForwardExecutor().isAnchored() || flags.isSticky()) { return RegexResult.create(fromIndex, (int) end); } - if (getForwardExecutor().canFindStart()) { + if (backwardCallTarget == null && getForwardExecutor().canFindStart()) { return RegexResult.create((int) (end >>> 32), (int) end); - } else { - return RegexResult.createLazy(input, fromIndex, regionFrom, regionTo, -1, (int) end, backwardCallTarget); } + assert backwardCallTarget != null; + return 
RegexResult.createLazy(input, fromIndex, regionFrom, regionTo, -1, (int) end, backwardCallTarget); } else { if (preCalculatedResults != null) { // traceFinder return RegexResult.createLazy(input, fromIndex, regionFrom, regionTo, -1, (int) end, backwardCallTarget); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAInitialStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAInitialStateNode.java index 40a6197b111..464ad3ab05b 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAInitialStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAInitialStateNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -59,11 +59,13 @@ public class DFAInitialStateNode extends DFAAbstractStateNode { @CompilationFinal(dimensions = 1) private final short[] cgLastTransition; private final boolean hasUnanchoredEntry; + private final DFASimpleCG simpleCG; - public DFAInitialStateNode(short[] successors, short[] cgLastTransition) { + public DFAInitialStateNode(short[] successors, short[] cgLastTransition, DFASimpleCG simpleCG) { super((short) 0, successors); this.cgLastTransition = cgLastTransition; this.hasUnanchoredEntry = initUnanchoredEntry(successors); + this.simpleCG = simpleCG; } private static boolean initUnanchoredEntry(short[] successors) { @@ -76,7 +78,7 @@ private static boolean initUnanchoredEntry(short[] successors) { } private DFAInitialStateNode(DFAInitialStateNode copy) { - this(Arrays.copyOf(copy.successors, copy.successors.length), copy.cgLastTransition); + this(Arrays.copyOf(copy.successors, copy.successors.length), 
copy.cgLastTransition, copy.simpleCG); } public short[] getCgLastTransition() { @@ -91,6 +93,10 @@ public boolean hasUnAnchoredEntry() { return hasUnanchoredEntry; } + public DFASimpleCG getSimpleCG() { + return simpleCG; + } + /** * Creates a node split copy of this initial state as described in {@link DFAAbstractStateNode}, * but ignores copyID, since having two initial states in a DFA is not supported. Therefore, diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFASimpleCGTransition.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFASimpleCGTransition.java index 70f4b8a32c8..d90a108c87a 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFASimpleCGTransition.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFASimpleCGTransition.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -85,7 +85,7 @@ public static DFASimpleCGTransition getEmptyInstance() { return EMPTY_INSTANCE; } - public void apply(int[] result, int currentIndex, boolean trackLastGroup) { + public void apply(int[] result, int currentIndex, boolean trackLastGroup, boolean forward) { CompilerAsserts.partialEvaluationConstant(this); if (indexClears == FULL_CLEAR_ARRAY) { Arrays.fill(result, -1); @@ -94,11 +94,11 @@ public void apply(int[] result, int currentIndex, boolean trackLastGroup) { } applyIndexUpdate(result, currentIndex); if (trackLastGroup && lastGroup != -1) { - result[result.length - 1] = lastGroup; + applyLastGroup(result, forward); } } - public void applyFinal(DFACaptureGroupTrackingData cgData, int currentIndex, boolean simpleCGMustCopy, boolean trackLastGroup) { + public void applyFinal(DFACaptureGroupTrackingData cgData, int currentIndex, boolean simpleCGMustCopy, boolean trackLastGroup, boolean forward) { CompilerAsserts.partialEvaluationConstant(this); int[] result = simpleCGMustCopy ? 
cgData.currentResult : cgData.results; if (indexClears == FULL_CLEAR_ARRAY) { @@ -109,13 +109,19 @@ public void applyFinal(DFACaptureGroupTrackingData cgData, int currentIndex, boo applyIndexUpdate(result, currentIndex); if (trackLastGroup && lastGroup != -1) { if (simpleCGMustCopy) { - cgData.currentResult[cgData.currentResult.length - 1] = lastGroup; + applyLastGroup(cgData.currentResult, forward); } else { - cgData.results[cgData.results.length - 1] = lastGroup; + applyLastGroup(cgData.results, forward); } } } + private void applyLastGroup(int[] result, boolean forward) { + if (forward || result[result.length - 1] == -1) { + result[result.length - 1] = lastGroup; + } + } + @ExplodeLoop private void applyIndexUpdate(int[] result, int currentIndex) { for (int i = 0; i < indexUpdates.length; i++) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java index 94afa97a9aa..995660e1675 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/DFAStateNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -263,11 +263,12 @@ void storeResult(TRegexDFAExecutorLocals locals, TRegexDFAExecutorNode executor, } void applySimpleCGTransition(DFASimpleCGTransition transition, TRegexDFAExecutorNode executor, TRegexDFAExecutorLocals locals) { - transition.apply(locals.getCGData().results, locals.getIndex(), executor.getProperties().tracksLastGroup()); + int index = executor.isForward() ? 
locals.getIndex() : locals.getNextIndex(); + transition.apply(locals.getCGData().results, index, executor.getProperties().tracksLastGroup(), executor.isForward()); } void applySimpleCGFinalTransition(DFASimpleCGTransition transition, TRegexDFAExecutorNode executor, TRegexDFAExecutorLocals locals) { - transition.applyFinal(locals.getCGData(), locals.getIndex(), executor.getProperties().isSimpleCGMustCopy(), executor.getProperties().tracksLastGroup()); + transition.applyFinal(locals.getCGData(), locals.getIndex(), executor.getProperties().isSimpleCGMustCopy(), executor.getProperties().tracksLastGroup(), executor.isForward()); } @TruffleBoundary diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorDebugRecorder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorDebugRecorder.java index d6c36f3e83e..eab65929aa0 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorDebugRecorder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorDebugRecorder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -41,6 +41,7 @@ package com.oracle.truffle.regex.tregex.nodes.dfa; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -51,7 +52,9 @@ import com.oracle.truffle.regex.tregex.dfa.DFAGenerator; import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorBaseNode; import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorLocals; +import com.oracle.truffle.regex.tregex.string.Encodings; import com.oracle.truffle.regex.tregex.util.json.Json; +import com.oracle.truffle.regex.tregex.util.json.JsonArray; import com.oracle.truffle.regex.tregex.util.json.JsonConvertible; import com.oracle.truffle.regex.tregex.util.json.JsonValue; @@ -68,18 +71,27 @@ public final class TRegexDFAExecutorDebugRecorder implements JsonConvertible { private static final class Recording implements JsonConvertible { - private final String input; + private final TruffleString input; + private final Encodings.Encoding encoding; private final int fromIndex; private int initialIndex; private final int maxIndex; - private final List transitions; + private final boolean forward; + private final int[] transitions; + private final int[] cgPartialTransitions; - private Recording(String input, int fromIndex, int initialIndex, int maxIndex) { + private Recording(TruffleString input, Encodings.Encoding encoding, int fromIndex, int initialIndex, int maxIndex, boolean forward) { this.input = input; + this.encoding = encoding; this.fromIndex = fromIndex; this.initialIndex = initialIndex; this.maxIndex = maxIndex; - transitions = new ArrayList<>(); + this.forward = forward; + int codepoints = input.codePointLengthUncached(encoding.getTStringEncoding()); + transitions = new int[codepoints]; + cgPartialTransitions = new int[codepoints]; + Arrays.fill(transitions, -1); + Arrays.fill(cgPartialTransitions, -1); } @TruffleBoundary @@ -87,76 +99,47 @@ public void 
setInitialIndex(int initialIndex) { this.initialIndex = initialIndex; } - @TruffleBoundary - private int getLowestIndex() { - return initialIndex < maxIndex ? initialIndex : maxIndex; - } - - @TruffleBoundary - private void initUpToIndex(int currentIndex) { - for (int i = transitions.size(); i <= currentIndex - getLowestIndex(); i++) { - transitions.add(new RecordedTransition(getLowestIndex() + i)); - } - } - - @TruffleBoundary - private RecordedTransition getTransition(int currentIndex) { - RecordedTransition transition = transitions.get(currentIndex - getLowestIndex()); - assert transition.currentIndex == currentIndex; - return transition; - } - @TruffleBoundary public void recordTransition(int currentIndex, int transitionID) { - initUpToIndex(currentIndex); - getTransition(currentIndex).setTransitionID(transitionID); + transitions[toCodePointIndex(currentIndex)] = transitionID; } @TruffleBoundary public void recordCGPartialTransition(int currentIndex, int cgPartialTransitionIndex) { - initUpToIndex(currentIndex); - getTransition(currentIndex).setCgPartialTransitionID(cgPartialTransitionIndex); + cgPartialTransitions[toCodePointIndex(currentIndex)] = cgPartialTransitionIndex; + } + + private int toCodePointIndex(int currentIndex) { + return input.byteIndexToCodePointIndexUncached(0, currentIndex << encoding.getStride(), encoding.getTStringEncoding()) - (forward ? 
0 : 1); } @TruffleBoundary @Override public JsonValue toJson() { - return Json.obj(Json.prop("input", input), + JsonArray jsonTransitions = Json.array(); + if (forward) { + for (int i = 0; i < transitions.length; i++) { + appendJsonTransition(i, jsonTransitions); + } + } else { + for (int i = transitions.length - 1; i >= 0; i--) { + appendJsonTransition(i, jsonTransitions); + } + } + return Json.obj(Json.prop("input", input.toJavaStringUncached()), Json.prop("fromIndex", fromIndex), Json.prop("initialIndex", initialIndex), Json.prop("maxIndex", maxIndex), - Json.prop("transitions", transitions)); + Json.prop("transitions", jsonTransitions)); } - } - - private static final class RecordedTransition implements JsonConvertible { - private final int currentIndex; - private int transitionID = -1; - private int cgPartialTransitionID = -1; - - @TruffleBoundary - private RecordedTransition(int currentIndex) { - this.currentIndex = currentIndex; - } - - @TruffleBoundary - public void setTransitionID(int transitionID) { - this.transitionID = transitionID; - } - - @TruffleBoundary - public void setCgPartialTransitionID(int cgPartialTransitionID) { - assert this.cgPartialTransitionID == -1 || this.cgPartialTransitionID == 0; - this.cgPartialTransitionID = cgPartialTransitionID; - } - - @TruffleBoundary - @Override - public JsonValue toJson() { - return Json.obj(Json.prop("currentIndex", currentIndex), - Json.prop("transitionID", transitionID), - Json.prop("cgPartialTransitionID", cgPartialTransitionID)); + private void appendJsonTransition(int i, JsonArray jsonTransitions) { + if (transitions[i] >= 0) { + jsonTransitions.append(Json.obj( + Json.prop("currentIndex", i), + Json.prop("transitionID", transitions[i]), + Json.prop("cgPartialTransitionID", cgPartialTransitions[i]))); + } } } @@ -170,11 +153,11 @@ private TRegexDFAExecutorDebugRecorder(DFAGenerator dfa) { } private final DFAGenerator dfa; - private List recordings = new ArrayList<>(); + private final List recordings = 
new ArrayList<>(); @TruffleBoundary public void startRecording(TRegexDFAExecutorLocals locals) { - recordings.add(new Recording(locals.getInput().toString(), locals.getFromIndex(), locals.getIndex(), locals.getMaxIndex())); + recordings.add(new Recording(locals.getInput(), dfa.getOptions().getEncoding(), locals.getFromIndex(), locals.getIndex(), locals.getMaxIndex(), dfa.isForward())); } @TruffleBoundary diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorNode.java index 9cb70edeaf5..cbe35fcc917 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexDFAExecutorNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -735,6 +735,11 @@ private short initialStateSuccessor(TRegexDFAExecutorLocals locals, DFAAbstractS if (lastTransition >= 0) { locals.setLastTransition(lastTransition); } + } else if (isSimpleCG()) { + DFASimpleCG simpleCG = ((DFAInitialStateNode) curState).getSimpleCG(); + if (simpleCG != null) { + simpleCG.getTransitions()[i].apply(locals.getCGData().results, locals.getIndex(), getProperties().tracksLastGroup(), isForward()); + } } return successors[i]; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexLazyBackwardSimpleCGRootNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexLazyBackwardSimpleCGRootNode.java new file mode 100644 index 00000000000..de93b8ff21e --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/dfa/TRegexLazyBackwardSimpleCGRootNode.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.truffle.regex.tregex.nodes.dfa; + +import com.oracle.truffle.api.frame.VirtualFrame; +import com.oracle.truffle.regex.RegexBodyNode; +import com.oracle.truffle.regex.RegexLanguage; +import com.oracle.truffle.regex.RegexSource; +import com.oracle.truffle.regex.result.RegexResult; +import com.oracle.truffle.regex.tregex.nodes.TRegexExecutorEntryNode; + +public class TRegexLazyBackwardSimpleCGRootNode extends RegexBodyNode { + + @Child private TRegexExecutorEntryNode entryNode; + + public TRegexLazyBackwardSimpleCGRootNode(RegexLanguage language, RegexSource source, TRegexExecutorEntryNode backwardNode) { + super(language, source); + this.entryNode = insert(backwardNode); + } + + @Override + public final Object execute(VirtualFrame frame) { + final Object[] args = frame.getArguments(); + assert args.length == 1; + final RegexResult receiver = (RegexResult) args[0]; + int[] result = (int[]) entryNode.execute(frame, receiver.getInput(), receiver.getFromIndex(), receiver.getEnd(), receiver.getRegionFrom(), receiver.getRegionTo(), receiver.getEnd()); + receiver.setResult(result); + return null; + } + + @Override + public String getEngineLabel() { + return "TRegex bck"; + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorLocals.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorLocals.java index 82531e1998b..f6b15ba3794 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorLocals.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorLocals.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -500,10 +500,6 @@ public void setQuantifierCount(int quantifierIndex, int count) { stack()[offsetQuantifierCount(quantifierIndex)] = count; } - public void resetQuantifierCount(int quantifierIndex) { - stack()[offsetQuantifierCount(quantifierIndex)] = 0; - } - public void incQuantifierCount(int quantifierIndex) { stack()[offsetQuantifierCount(quantifierIndex)]++; } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java index d672910fb7d..e7173aacc47 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/nodes/nfa/TRegexBacktrackingNFAExecutorNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -75,7 +75,6 @@ import com.oracle.truffle.regex.tregex.parser.Token.Quantifier; import com.oracle.truffle.regex.tregex.parser.ast.Group; import com.oracle.truffle.regex.tregex.parser.ast.InnerLiteral; -import com.oracle.truffle.regex.tregex.parser.ast.LookBehindAssertion; import com.oracle.truffle.regex.tregex.parser.ast.QuantifiableTerm; import com.oracle.truffle.regex.tregex.parser.ast.RegexAST; import com.oracle.truffle.regex.tregex.parser.ast.RegexASTSubtreeRootNode; @@ -87,21 +86,22 @@ */ public final class TRegexBacktrackingNFAExecutorNode extends TRegexBacktrackerSubExecutorNode { - private static final int FLAG_WRITES_CAPTURE_GROUPS = 1 << 0; - private static final int FLAG_FORWARD = 1 << 1; - private static final int FLAG_BACKREF_WITH_NULL_TARGET_FAILS = 1 << 2; - private static final int FLAG_MONITOR_CAPTURE_GROUPS_IN_EMPTY_CHECK = 1 << 3; - private static final int FLAG_TRANSITION_MATCHES_STEP_BY_STEP = 1 << 4; - private static final int FLAG_EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS = 1 << 5; - private static final int FLAG_TRACK_LAST_GROUP = 1 << 6; - private static final int FLAG_RETURNS_FIRST_GROUP = 1 << 7; + private static final int FLAG_BACKREF_IGNORE_CASE_MULTI_CHARACTER_EXPANSION = 1 << 0; + private static final int FLAG_BACKREF_WITH_NULL_TARGET_FAILS = 1 << 1; + private static final int FLAG_EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS = 1 << 2; + private static final int FLAG_FORWARD = 1 << 3; + private static final int FLAG_LONE_SURROGATES = 1 << 4; + private static final int FLAG_LOOPBACK_INITIAL_STATE = 1 << 5; + private static final int FLAG_MATCH_BOUNDARY_ASSERTIONS = 1 << 6; + private static final int FLAG_MONITOR_CAPTURE_GROUPS_IN_EMPTY_CHECK = 1 << 7; private static final int FLAG_MUST_ADVANCE = 1 << 8; - private static final int FLAG_LONE_SURROGATES = 1 << 9; - private static final int FLAG_LOOPBACK_INITIAL_STATE = 1 << 10; - private static final int FLAG_USE_MERGE_EXPLODE = 1 << 
11; - private static final int FLAG_RECURSIVE_BACK_REFERENCES = 1 << 12; - private static final int FLAG_BACKREF_IGNORE_CASE_MULTI_CHARACTER_EXPANSION = 1 << 13; - private static final int FLAG_MATCH_BOUNDARY_ASSERTIONS = 1 << 14; + private static final int FLAG_RECURSIVE_BACK_REFERENCES = 1 << 9; + private static final int FLAG_RETURNS_FIRST_GROUP = 1 << 10; + private static final int FLAG_REWIND_FIXED_WIDTH_LOOK_BEHIND = 1 << 11; + private static final int FLAG_TRACK_LAST_GROUP = 1 << 12; + private static final int FLAG_TRANSITION_MATCHES_STEP_BY_STEP = 1 << 13; + private static final int FLAG_USE_MERGE_EXPLODE = 1 << 14; + private static final int FLAG_WRITES_CAPTURE_GROUPS = 1 << 15; private final PureNFA nfa; private final int numberOfStates; @@ -141,8 +141,8 @@ public TRegexBacktrackingNFAExecutorNode(RegexAST ast, PureNFA nfa, int numberOf QuantifiableTerm quantifiable = zeroWidthQuantifiables.get(i); if (quantifiable.isGroup()) { Group group = quantifiable.asGroup(); - this.zeroWidthTermEnclosedCGLow[i] = group.getCaptureGroupsLow(); - offset += 2 * (group.getCaptureGroupsHigh() - group.getCaptureGroupsLow()); + this.zeroWidthTermEnclosedCGLow[i] = group.getCaptureGroupsLo(); + offset += 2 * (group.getCaptureGroupsHi() - group.getCaptureGroupsLo()); } this.zeroWidthQuantifierCGOffsets[i + 1] = offset; this.zeroWidthQuantifiers[quantifiable.getQuantifier().getZeroWidthIndex()] = quantifiable.getQuantifier(); @@ -152,9 +152,9 @@ public TRegexBacktrackingNFAExecutorNode(RegexAST ast, PureNFA nfa, int numberOf } else { this.innerLiteral = null; } - this.equalsIgnoreCase = ast.getOptions().getFlavor().getEqualsIgnoreCasePredicate(ast); + this.equalsIgnoreCase = ast.getFlavor().getEqualsIgnoreCasePredicate(ast); if (isBackreferenceIgnoreCaseMultiCharExpansion() && ast.getProperties().hasBackReferences()) { - this.multiCharacterExpansionCaseFoldAlgorithm = ast.getOptions().getFlavor().getCaseFoldAlgorithm(ast); + this.multiCharacterExpansionCaseFoldAlgorithm = 
ast.getFlavor().getCaseFoldAlgorithm(ast); } else { this.multiCharacterExpansionCaseFoldAlgorithm = null; } @@ -213,21 +213,23 @@ public int getNumberOfStates() { } private static int createFlags(RegexAST ast, PureNFA nfa, boolean mustAdvance, RegexASTSubtreeRootNode subtree, int nStates, int nTransitions) { + RegexFlavor flavor = ast.getFlavor(); int flags = 0; flags = setFlag(flags, FLAG_WRITES_CAPTURE_GROUPS, subtree.hasCaptureGroups()); - flags = setFlag(flags, FLAG_FORWARD, !(subtree instanceof LookBehindAssertion)); - flags = setFlag(flags, FLAG_BACKREF_WITH_NULL_TARGET_FAILS, ast.getOptions().getFlavor().backreferencesToUnmatchedGroupsFail()); - flags = setFlag(flags, FLAG_MONITOR_CAPTURE_GROUPS_IN_EMPTY_CHECK, ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups()); - flags = setFlag(flags, FLAG_TRANSITION_MATCHES_STEP_BY_STEP, ast.getOptions().getFlavor().matchesTransitionsStepByStep()); - flags = setFlag(flags, FLAG_EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS, ast.getOptions().getFlavor().emptyChecksOnMandatoryLoopIterations()); - flags = setFlag(flags, FLAG_TRACK_LAST_GROUP, ast.getOptions().getFlavor().usesLastGroupResultField()); - flags = setFlag(flags, FLAG_RETURNS_FIRST_GROUP, !isFlagSet(flags, FLAG_FORWARD) && ast.getOptions().getFlavor().lookBehindsRunLeftToRight()); + flags = setFlag(flags, FLAG_REWIND_FIXED_WIDTH_LOOK_BEHIND, subtree.isLookBehindAssertion() && flavor.lookBehindsRunLeftToRight() && nfa.isFixedWidth()); + flags = setFlag(flags, FLAG_FORWARD, !subtree.isLookBehindAssertion() || isFlagSet(flags, FLAG_REWIND_FIXED_WIDTH_LOOK_BEHIND)); + flags = setFlag(flags, FLAG_BACKREF_WITH_NULL_TARGET_FAILS, flavor.backreferencesToUnmatchedGroupsFail()); + flags = setFlag(flags, FLAG_MONITOR_CAPTURE_GROUPS_IN_EMPTY_CHECK, flavor.emptyChecksMonitorCaptureGroups()); + flags = setFlag(flags, FLAG_TRANSITION_MATCHES_STEP_BY_STEP, flavor.matchesTransitionsStepByStep()); + flags = setFlag(flags, FLAG_EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS, 
flavor.emptyChecksOnMandatoryLoopIterations()); + flags = setFlag(flags, FLAG_TRACK_LAST_GROUP, flavor.usesLastGroupResultField()); + flags = setFlag(flags, FLAG_RETURNS_FIRST_GROUP, !isFlagSet(flags, FLAG_FORWARD) && flavor.lookBehindsRunLeftToRight()); flags = setFlag(flags, FLAG_MUST_ADVANCE, mustAdvance); flags = setFlag(flags, FLAG_LONE_SURROGATES, ast.getProperties().hasLoneSurrogates()); flags = setFlag(flags, FLAG_LOOPBACK_INITIAL_STATE, nfa.isRoot() && !ast.getFlags().isSticky() && !ast.getRoot().startsWithCaret()); flags = setFlag(flags, FLAG_USE_MERGE_EXPLODE, nStates <= ast.getOptions().getMaxBackTrackerCompileSize() && nTransitions <= ast.getOptions().getMaxBackTrackerCompileSize()); flags = setFlag(flags, FLAG_RECURSIVE_BACK_REFERENCES, ast.getProperties().hasRecursiveBackReferences()); - flags = setFlag(flags, FLAG_BACKREF_IGNORE_CASE_MULTI_CHARACTER_EXPANSION, ast.getOptions().getFlavor().backreferenceIgnoreCaseMultiCharExpansion() && ast.getProperties().hasBackReferences()); + flags = setFlag(flags, FLAG_BACKREF_IGNORE_CASE_MULTI_CHARACTER_EXPANSION, flavor.backreferenceIgnoreCaseMultiCharExpansion() && ast.getProperties().hasBackReferences()); flags = setFlag(flags, FLAG_MATCH_BOUNDARY_ASSERTIONS, ast.getProperties().hasMatchBoundaryAssertions()); return flags; } @@ -309,6 +311,10 @@ public boolean isMatchBoundaryAssertions() { return isFlagSet(FLAG_MATCH_BOUNDARY_ASSERTIONS); } + public boolean isRewindFixedWidthLookBehind() { + return isFlagSet(FLAG_REWIND_FIXED_WIDTH_LOOK_BEHIND); + } + private boolean isFlagSet(int flag) { return isFlagSet(flags, flag); } @@ -346,6 +352,12 @@ public TRegexExecutorLocals createLocals(TruffleString input, int fromIndex, int @Override public Object execute(VirtualFrame frame, TRegexExecutorLocals abstractLocals, TruffleString.CodeRange codeRange) { TRegexBacktrackingNFAExecutorLocals locals = (TRegexBacktrackingNFAExecutorLocals) abstractLocals; + if (isRewindFixedWidthLookBehind()) { + assert isForward(); + if 
(rewindUpTo(locals, 0, nfa.getFixedWidth(), codeRange) != nfa.getFixedWidth()) { + return null; + } + } if (innerLiteral != null) { locals.setIndex(locals.getFromIndex()); int innerLiteralIndex = findInnerLiteral(locals); @@ -803,26 +815,29 @@ protected boolean transitionMatches(VirtualFrame frame, TRegexBacktrackingNFAExe TransitionGuard.Kind kind = TransitionGuard.getKind(guard); CompilerAsserts.partialEvaluationConstant(kind); switch (kind) { - case loop -> { - // retreat if quantifier count is at maximum - if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) == getQuantifier(guard).getMax()) { + case countLtMin -> { + // retreat if quantifier count is greater or equal to minimum + if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) >= getQuantifier(guard).getMin()) { return false; } } - case exit -> { + case countGeMin -> { // retreat if quantifier count is less than minimum if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) < getQuantifier(guard).getMin()) { return false; } } + case countLtMax -> { + // retreat if quantifier count is at maximum + if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) >= getQuantifier(guard).getMax()) { + return false; + } + } case exitZeroWidth -> { Quantifier q = getZeroWidthQuantifier(guard); CompilerAsserts.partialEvaluationConstant(q); if (locals.getZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)) == index && (!isMonitorCaptureGroupsInEmptyCheck() || locals.isResultUnmodifiedByZeroWidthQuantifier(TransitionGuard.getZeroWidthQuantifierIndex(guard))) && - // In JS, we allow this guard to pass if we are still in the - // optional part of the quantifier. This allows JS to fast- - // forward past all the empty mandatory iterations. 
(isEmptyChecksOnMandatoryLoopIterations() || !q.hasIndex() || locals.getQuantifierCount(q.getIndex()) > q.getMin())) { return false; } @@ -833,16 +848,20 @@ protected boolean transitionMatches(VirtualFrame frame, TRegexBacktrackingNFAExe return false; } } - case checkGroupMatched -> { - if (getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexStart(TransitionGuard.getGroupNumber(guard)), index) == -1 || - getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexEnd(TransitionGuard.getGroupNumber(guard)), index) == -1) { - return false; - } - } - case checkGroupNotMatched -> { - if (getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexStart(TransitionGuard.getGroupNumber(guard)), index) != -1 && - getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexEnd(TransitionGuard.getGroupNumber(guard)), index) != -1) { - return false; + case checkGroupMatched, checkGroupNotMatched -> { + int start = getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexStart(TransitionGuard.getGroupNumber(guard)), index); + int end = getBackRefBoundary(locals, transition, Group.groupNumberToBoundaryIndexEnd(TransitionGuard.getGroupNumber(guard)), index); + switch (kind) { + case checkGroupMatched -> { + if (start == -1 || end == -1) { + return false; + } + } + case checkGroupNotMatched -> { + if (start != -1 && end != -1) { + return false; + } + } } } default -> { @@ -891,41 +910,19 @@ protected void updateState(TRegexBacktrackingNFAExecutorLocals locals, PureNFATr for (long guard : transition.getGuards()) { CompilerAsserts.partialEvaluationConstant(guard); switch (TransitionGuard.getKind(guard)) { - case loop, loopInc -> { + case countInc -> { locals.incQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); } - case exit, exitReset -> { - locals.resetQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); + case countSet1 -> { + locals.setQuantifierCount(TransitionGuard.getQuantifierIndex(guard), 1); 
+ } + case countSetMin -> { + locals.setQuantifierCount(TransitionGuard.getQuantifierIndex(guard), getQuantifier(guard).getMin() + 1); } case enterZeroWidth -> { locals.setZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)); locals.setZeroWidthQuantifierResults(TransitionGuard.getZeroWidthQuantifierIndex(guard)); } - case exitZeroWidth -> { - Quantifier q = getZeroWidthQuantifier(guard); - CompilerAsserts.partialEvaluationConstant(q); - boolean emptyCheckFailed = locals.getZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)) == index && - (!isMonitorCaptureGroupsInEmptyCheck() || locals.isResultUnmodifiedByZeroWidthQuantifier(TransitionGuard.getZeroWidthQuantifierIndex(guard))); - boolean advancePastOptionalIterations = !isEmptyChecksOnMandatoryLoopIterations() && q.hasIndex() && locals.getQuantifierCount(q.getIndex()) < q.getMin(); - if (emptyCheckFailed && advancePastOptionalIterations && !transition.hasCaretGuard() && !transition.hasDollarGuard()) { - // We advance the counter to min - 1 to skip past all but one mandatory - // iteration. We do not skip the last mandatory iteration and set the - // counter to min, because of the way JavaScript regexes are executed. The - // JavaScript flavor does not set matchesTransitionStepByStep and therefore - // all guards are tested against the same original state. In the case of the - // last mandatory iteration, we would like it to be possible to match the - // exitZeroWidth guard followed by the exit guard, so that it is possible to - // hit the exact minimum number of iterations. However, this relies on first - // updating the state with exitZeroWidth and then testing this new state - // with the exit guard. This would mean having to enable - // matchesTransitionStepByStep for JavaScript and implementing this logic in - // tryUpdateState instead, which would lead to degraded performance for JS - // regexps. 
Instead, we choose to advance the counter to just before the - // last mandatory iteration so that this fast-forwarding behavior does not - // coincide with an exit guard that should pass. - locals.setQuantifierCount(q.getIndex(), q.getMin() - 1); - } - } default -> { } } @@ -1011,64 +1008,81 @@ protected boolean tryUpdateState(VirtualFrame frame, TRegexBacktrackingNFAExecut locals.setMatchEndAssertionTraversed(); } for (long guard : transition.getGuards()) { - switch (TransitionGuard.getKind(guard)) { - case loopInc: + TransitionGuard.Kind kind = TransitionGuard.getKind(guard); + switch (kind) { + case countInc -> { locals.incQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); - break; - case loop: - // retreat if quantifier count is at maximum - if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) == getQuantifier(guard).getMax()) { + } + case countSet1 -> { + locals.setQuantifierCount(TransitionGuard.getQuantifierIndex(guard), 1); + } + case countSetMin -> { + locals.setQuantifierCount(TransitionGuard.getQuantifierIndex(guard), getQuantifier(guard).getMin() + 1); + } + case countLtMin -> { + // retreat if quantifier count is greater or equal to minimum + if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) >= getQuantifier(guard).getMin()) { return false; } - locals.incQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); - break; - case exit: + } + case countGeMin -> { // retreat if quantifier count is less than minimum if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) < getQuantifier(guard).getMin()) { return false; } - locals.resetQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); - break; - case exitReset: - locals.resetQuantifierCount(TransitionGuard.getQuantifierIndex(guard)); - break; - case updateCG: + } + case countLtMax -> { + // retreat if quantifier count is at maximum + if (locals.getQuantifierCount(TransitionGuard.getQuantifierIndex(guard)) >= 
getQuantifier(guard).getMax()) { + return false; + } + } + case updateCG -> { locals.setCaptureGroupBoundary(TransitionGuard.getGroupBoundaryIndex(guard), index); if (isTrackLastGroup() && TransitionGuard.getGroupBoundaryIndex(guard) % 2 != 0 && TransitionGuard.getGroupBoundaryIndex(guard) > 1) { locals.setLastGroup(TransitionGuard.getGroupBoundaryIndex(guard) / 2); } - break; - case updateRecursiveBackrefPointer: + } + case updateRecursiveBackrefPointer -> { locals.saveRecursiveBackrefGroupStart(TransitionGuard.getGroupNumber(guard)); - break; - case enterZeroWidth: + } + case enterZeroWidth -> { locals.setZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)); locals.setZeroWidthQuantifierResults(TransitionGuard.getZeroWidthQuantifierIndex(guard)); - break; - case exitZeroWidth: + } + case exitZeroWidth -> { + Quantifier q = getZeroWidthQuantifier(guard); + CompilerAsserts.partialEvaluationConstant(q); if (locals.getZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)) == index && - (!isMonitorCaptureGroupsInEmptyCheck() || locals.isResultUnmodifiedByZeroWidthQuantifier(TransitionGuard.getZeroWidthQuantifierIndex(guard)))) { + (!isMonitorCaptureGroupsInEmptyCheck() || locals.isResultUnmodifiedByZeroWidthQuantifier(TransitionGuard.getZeroWidthQuantifierIndex(guard))) && + (isEmptyChecksOnMandatoryLoopIterations() || !q.hasIndex() || locals.getQuantifierCount(q.getIndex()) > q.getMin())) { return false; } - break; - case escapeZeroWidth: + } + case escapeZeroWidth -> { if (locals.getZeroWidthQuantifierGuardIndex(TransitionGuard.getZeroWidthQuantifierIndex(guard)) != index || (isMonitorCaptureGroupsInEmptyCheck() && !locals.isResultUnmodifiedByZeroWidthQuantifier(TransitionGuard.getZeroWidthQuantifierIndex(guard)))) { return false; } - break; - case checkGroupMatched: - if (locals.getCaptureGroupStart(TransitionGuard.getGroupNumber(guard)) == -1 || locals.getCaptureGroupEnd(TransitionGuard.getGroupNumber(guard)) 
== -1) { - return false; - } - break; - case checkGroupNotMatched: - if (locals.getCaptureGroupStart(TransitionGuard.getGroupNumber(guard)) != -1 && locals.getCaptureGroupEnd(TransitionGuard.getGroupNumber(guard)) != -1) { - return false; + } + case checkGroupMatched, checkGroupNotMatched -> { + int start = locals.getCaptureGroupStart(TransitionGuard.getGroupNumber(guard)); + int end = locals.getCaptureGroupEnd(TransitionGuard.getGroupNumber(guard)); + switch (kind) { + case checkGroupMatched -> { + if (start == -1 || end == -1) { + return false; + } + } + case checkGroupNotMatched -> { + if (start != -1 && end != -1) { + return false; + } + } } - break; - default: - break; + } + default -> { + } } } locals.saveIndex(getNewIndex(locals, target, index)); @@ -1156,6 +1170,7 @@ private boolean canInlineBackReferenceIntoTransition(PureNFAState backRef) { private boolean matchBackReferenceSimple(TRegexBacktrackingNFAExecutorLocals locals, PureNFAState backReference, PureNFATransition transition, int index) { assert backReference.isBackReference(); assert canInlineBackReferenceIntoTransition(backReference); + assert !isRecursiveBackreferences(); Pair backRefBounds = getBackRefBounds(locals, backReference, transition, index); final int backrefStart = backRefBounds.getLeft(); final int backrefEnd = backRefBounds.getRight(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexLexer.java index 039b14dcf04..830b750dfab 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexLexer.java @@ -49,6 +49,7 @@ import com.oracle.truffle.regex.RegexFlags; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import 
com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.charset.ClassSetContents; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; @@ -209,6 +210,15 @@ protected int getMaxBackReferenceDigits() { @Override protected CodePointSet getPredefinedCharClass(char c) { + CodePointSet predefinedCharClass = getPredefinedCharClassCPS(c); + if (featureEnabledIgnoreCase()) { + return caseFoldUnfold(predefinedCharClass); + } else { + return predefinedCharClass; + } + } + + private CodePointSet getPredefinedCharClassCPS(char c) { switch (c) { case 's': if (source.getOptions().isU180EWhitespace()) { @@ -250,12 +260,12 @@ protected CodePointSet getPredefinedCharClass(char c) { @Override protected void checkClassSetCharacter(int codePoint) throws RegexSyntaxException { if (CLASS_SET_SYNTAX_CHARS.get(codePoint)) { - throw syntaxError(JsErrorMessages.unexpectedCharacterInClassSet(codePoint)); + throw syntaxError(JsErrorMessages.unexpectedCharacterInClassSet(codePoint), ErrorCode.InvalidCharacterClass); } if (CLASS_SET_RESERVED_DOUBLE_PUNCTUATORS.get(codePoint)) { String punctuator = Character.toString(codePoint); if (lookahead(punctuator)) { - throw syntaxError(JsErrorMessages.unexpectedDoublePunctuatorInClassSet(punctuator)); + throw syntaxError(JsErrorMessages.unexpectedDoublePunctuatorInClassSet(punctuator), ErrorCode.InvalidCharacterClass); } } } @@ -267,13 +277,13 @@ protected long boundedQuantifierMaxValue() { @Override protected RegexSyntaxException handleBoundedQuantifierOutOfOrder() { - return syntaxError(JsErrorMessages.QUANTIFIER_OUT_OF_ORDER); + return syntaxError(JsErrorMessages.QUANTIFIER_OUT_OF_ORDER, ErrorCode.InvalidQuantifier); } @Override protected Token handleBoundedQuantifierEmptyOrMissingMin() throws RegexSyntaxException { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.INCOMPLETE_QUANTIFIER); + throw 
syntaxError(JsErrorMessages.INCOMPLETE_QUANTIFIER, ErrorCode.InvalidQuantifier); } position = getLastTokenPosition() + 1; return literalChar('{'); @@ -296,13 +306,13 @@ protected Token handleBoundedQuantifierOverflowMin(long min, long max) { @Override protected RegexSyntaxException handleCCRangeOutOfOrder(int startPos) { - return syntaxError(JsErrorMessages.CHAR_CLASS_RANGE_OUT_OF_ORDER); + return syntaxError(JsErrorMessages.CHAR_CLASS_RANGE_OUT_OF_ORDER, ErrorCode.InvalidCharacterClass); } @Override protected void handleCCRangeWithPredefCharClass(int startPos, ClassSetContents firstAtom, ClassSetContents secondAtom) { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.INVALID_CHARACTER_CLASS); + throw syntaxError(JsErrorMessages.INVALID_CHARACTER_CLASS, ErrorCode.InvalidCharacterClass); } } @@ -323,7 +333,7 @@ protected void validatePOSIXEquivalenceClass(String sequence) { @Override protected RegexSyntaxException handleComplementOfStringSet() { - return syntaxError(JsErrorMessages.invalidRegularExpression(source, JsErrorMessages.COMPLEMENT_OF_STRING_SET)); + return syntaxError(JsErrorMessages.invalidRegularExpression(source, JsErrorMessages.COMPLEMENT_OF_STRING_SET), ErrorCode.InvalidCharacterClass); } @Override @@ -334,43 +344,43 @@ protected void handleGroupRedefinition(String name, int newId, int oldId) { @Override protected void handleIncompleteEscapeX() { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.INVALID_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_ESCAPE, ErrorCode.InvalidEscape); } } @Override protected Token handleInvalidBackReference(int reference) { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE); + throw syntaxError(JsErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE, ErrorCode.InvalidBackReference); } return null; } @Override protected RegexSyntaxException handleInvalidCharInCharClass() { - return 
syntaxError(JsErrorMessages.INVALID_CHARACTER_IN_CHARACTER_CLASS); + return syntaxError(JsErrorMessages.INVALID_CHARACTER_IN_CHARACTER_CLASS, ErrorCode.InvalidCharacterClass); } private int handleInvalidEscape(int c) { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.INVALID_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_ESCAPE, ErrorCode.InvalidEscape); } return c; } @Override protected RegexSyntaxException handleInvalidGroupBeginQ() { - return syntaxError(JsErrorMessages.INVALID_GROUP); + return syntaxError(JsErrorMessages.INVALID_GROUP, ErrorCode.InvalidGroup); } @Override protected RegexSyntaxException handleMixedClassSetOperators(ClassSetOperator leftOperator, ClassSetOperator rightOperator) { - return syntaxError(JsErrorMessages.mixedOperatorsInClassSet(leftOperator, rightOperator)); + return syntaxError(JsErrorMessages.mixedOperatorsInClassSet(leftOperator, rightOperator), ErrorCode.InvalidCharacterClass); } @Override protected RegexSyntaxException handleMissingClassSetOperand(ClassSetOperator operator) { - return syntaxError(JsErrorMessages.missingClassSetOperand(operator)); + return syntaxError(JsErrorMessages.missingClassSetOperand(operator), ErrorCode.InvalidCharacterClass); } @Override @@ -379,12 +389,12 @@ protected void handleOctalOutOfRange() { @Override protected RegexSyntaxException handleRangeAsClassSetOperand(ClassSetOperator operator) { - return syntaxError(JsErrorMessages.rangeAsClassSetOperand(operator)); + return syntaxError(JsErrorMessages.rangeAsClassSetOperand(operator), ErrorCode.InvalidCharacterClass); } @Override protected void handleUnfinishedEscape() { - throw syntaxError(JsErrorMessages.ENDS_WITH_UNFINISHED_ESCAPE_SEQUENCE); + throw syntaxError(JsErrorMessages.ENDS_WITH_UNFINISHED_ESCAPE_SEQUENCE, ErrorCode.InvalidEscape); } @Override @@ -393,12 +403,12 @@ protected void handleUnfinishedGroupComment() { @Override protected RegexSyntaxException handleUnfinishedGroupQ() { - return 
syntaxError(JsErrorMessages.INVALID_GROUP); + return syntaxError(JsErrorMessages.INVALID_GROUP, ErrorCode.InvalidGroup); } @Override protected RegexSyntaxException handleUnfinishedRangeInClassSet() { - return syntaxError(JsErrorMessages.UNTERMINATED_CHARACTER_RANGE); + return syntaxError(JsErrorMessages.UNTERMINATED_CHARACTER_RANGE, ErrorCode.InvalidCharacterClass); } @Override @@ -408,19 +418,19 @@ protected void handleUnmatchedRightBrace() { // cannot be used as atomic patterns. However, Annex B relaxes this condition // and allows the use of unmatched '}' and ']', which then match themselves. // Nevertheless, in Unicode mode, we should still be strict. - throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_BRACE); + throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_BRACE, ErrorCode.InvalidQuantifier); } } @Override protected RegexSyntaxException handleUnmatchedLeftBracket() { - return syntaxError(JsErrorMessages.UNMATCHED_LEFT_BRACKET); + return syntaxError(JsErrorMessages.UNMATCHED_LEFT_BRACKET, ErrorCode.UnmatchedBracket); } @Override protected void handleUnmatchedRightBracket() { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_BRACKET); + throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_BRACKET, ErrorCode.UnmatchedBracket); } } @@ -429,7 +439,7 @@ protected int parseCodePointInGroupName() throws RegexSyntaxException { if (consumingLookahead("\\u")) { final int unicodeEscape = parseUnicodeEscapeChar(true); if (unicodeEscape < 0) { - throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE, ErrorCode.InvalidEscape); } else { return unicodeEscape; } @@ -442,13 +452,13 @@ private String jsParseGroupName() { ParseGroupNameResult result = parseGroupName('>'); switch (result.state) { case empty: - throw syntaxError(JsErrorMessages.EMPTY_GROUP_NAME); + throw syntaxError(JsErrorMessages.EMPTY_GROUP_NAME, ErrorCode.InvalidNamedGroup); case unterminated: - throw 
syntaxError(JsErrorMessages.UNTERMINATED_GROUP_NAME); + throw syntaxError(JsErrorMessages.UNTERMINATED_GROUP_NAME, ErrorCode.InvalidNamedGroup); case invalidStart: - throw syntaxError(JsErrorMessages.INVALID_GROUP_NAME_START); + throw syntaxError(JsErrorMessages.INVALID_GROUP_NAME_START, ErrorCode.InvalidNamedGroup); case invalidRest: - throw syntaxError(JsErrorMessages.INVALID_GROUP_NAME_PART); + throw syntaxError(JsErrorMessages.INVALID_GROUP_NAME_PART, ErrorCode.InvalidNamedGroup); case valid: return result.groupName; default: @@ -468,7 +478,7 @@ protected Token parseCustomEscape(char c) { handleUnfinishedEscape(); } if (consumeChar() != '<') { - throw syntaxError(JsErrorMessages.MISSING_GROUP_NAME); + throw syntaxError(JsErrorMessages.MISSING_GROUP_NAME, ErrorCode.InvalidNamedGroup); } String groupName = jsParseGroupName(); // backward reference @@ -480,7 +490,7 @@ protected Token parseCustomEscape(char c) { if (allNamedCaptureGroups != null && allNamedCaptureGroups.containsKey(groupName)) { return Token.createBackReference(allNamedCaptureGroups.get(groupName).stream().mapToInt(x -> x).toArray(), false); } - throw syntaxError(JsErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE); + throw syntaxError(JsErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE, ErrorCode.InvalidBackReference); } else { return literalChar(c); } @@ -493,7 +503,7 @@ protected int parseCustomEscapeChar(char c, boolean inCharClass) { switch (c) { case '0': if (flags.isEitherUnicode() && lookahead(RegexLexer::isDecimalDigit, 1)) { - throw syntaxError(JsErrorMessages.INVALID_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_ESCAPE, ErrorCode.InvalidEscape); } if (!flags.isEitherUnicode() && lookahead(RegexLexer::isOctalDigit, 1)) { return parseOctal(0, 2); @@ -546,7 +556,7 @@ protected int parseCustomEscapeCharFallback(int c, boolean inCharClass) { private char handleInvalidControlEscape() throws RegexSyntaxException { if (flags.isEitherUnicode()) { - throw 
syntaxError(JsErrorMessages.INVALID_CONTROL_CHAR_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_CONTROL_CHAR_ESCAPE, ErrorCode.InvalidEscape); } return '\\'; } @@ -575,7 +585,7 @@ private int parseUnicodeEscapeChar(boolean unicodeMode) throws RegexSyntaxExcept if (unicodeMode && consumingLookahead("{")) { final int value = parseHexUnicode(1, Integer.MAX_VALUE, 0x10ffff); if (!consumingLookahead("}")) { - throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE, ErrorCode.InvalidEscape); } return value; } else { @@ -601,10 +611,10 @@ private int parseUnicodeEscapeChar(boolean unicodeMode) throws RegexSyntaxExcept private int parseHexUnicode(int minDigits, int maxDigits, int maxValue) { return parseHex(minDigits, maxDigits, maxValue, () -> { if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE, ErrorCode.InvalidEscape); } }, () -> { - throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE); + throw syntaxError(JsErrorMessages.INVALID_UNICODE_ESCAPE, ErrorCode.InvalidEscape); }); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexParser.java index 552ce68283f..9bee7bd289d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexParser.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexParser.java @@ -44,6 +44,7 @@ import java.util.List; import java.util.Map; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import org.graalvm.collections.EconomicMap; import org.graalvm.collections.Equivalence; @@ -175,16 +176,16 @@ private RegexAST parse(boolean rootCapture) throws RegexSyntaxException { break; case quantifier: if (astBuilder.getCurTerm() == null || 
!QUANTIFIER_PREV.contains(prevKind)) { - throw syntaxError(JsErrorMessages.QUANTIFIER_WITHOUT_TARGET); + throw syntaxError(JsErrorMessages.QUANTIFIER_WITHOUT_TARGET, ErrorCode.InvalidQuantifier); } if (prevKind == Token.Kind.quantifier) { - throw syntaxError(JsErrorMessages.QUANTIFIER_ON_QUANTIFIER); + throw syntaxError(JsErrorMessages.QUANTIFIER_ON_QUANTIFIER, ErrorCode.InvalidQuantifier); } if (flags.isEitherUnicode() && astBuilder.getCurTerm().isLookAheadAssertion()) { - throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKAHEAD_ASSERTION); + throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKAHEAD_ASSERTION, ErrorCode.InvalidQuantifier); } if (astBuilder.getCurTerm().isLookBehindAssertion()) { - throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKBEHIND_ASSERTION); + throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKBEHIND_ASSERTION, ErrorCode.InvalidQuantifier); } astBuilder.addQuantifier((Token.Quantifier) token); break; @@ -205,12 +206,12 @@ private RegexAST parse(boolean rootCapture) throws RegexSyntaxException { break; case groupEnd: if (astBuilder.getCurGroup().getParent() instanceof RegexASTRootNode) { - throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS); + throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } astBuilder.popGroup(token); break; case literalChar: - literalChar(((Token.LiteralCharacter) token).getCodePoint()); + literalChar((Token.LiteralCharacter) token); break; case charClass: astBuilder.addCharClass((Token.CharacterClass) token); @@ -227,7 +228,7 @@ private RegexAST parse(boolean rootCapture) throws RegexSyntaxException { lexer.caseFoldUnfold(curCharClass); } CodePointSet cps = curCharClass.toCodePointSet(); - astBuilder.addCharClass(lexer.isCurCharClassInverted() ? cps.createInverse(source.getEncoding()) : cps, wasSingleChar); + astBuilder.addCharClass(lexer.isCurCharClassInverted() ? 
cps.createInverse(source.getEncoding()) : cps, wasSingleChar, token.getSourceSection()); break; case classSet: astBuilder.addClassSet((Token.ClassSet) token, flags.isIgnoreCase() ? CaseFoldData.CaseFoldUnfoldAlgorithm.ECMAScriptUnicode : null); @@ -237,21 +238,21 @@ private RegexAST parse(boolean rootCapture) throws RegexSyntaxException { } } if (!astBuilder.curGroupIsRoot()) { - throw syntaxError(JsErrorMessages.UNTERMINATED_GROUP); + throw syntaxError(JsErrorMessages.UNTERMINATED_GROUP, ErrorCode.UnmatchedParenthesis); } RegexAST ast = astBuilder.popRootGroup(); checkNamedCaptureGroups(ast); return ast; } - private void literalChar(int codePoint) { + private void literalChar(Token.LiteralCharacter literalCharacter) { if (flags.isIgnoreCase()) { curCharClass.clear(); - curCharClass.addCodePoint(codePoint); + curCharClass.addCodePoint(literalCharacter.getCodePoint()); lexer.caseFoldUnfold(curCharClass); - astBuilder.addCharClass(curCharClass.toCodePointSet(), true); + astBuilder.addCharClass(curCharClass.toCodePointSet(), true, literalCharacter.getSourceSection()); } else { - astBuilder.addCharClass(CodePointSet.create(codePoint)); + astBuilder.addLiteralChar(literalCharacter); } } @@ -271,8 +272,8 @@ private void checkNamedCaptureGroups(RegexAST ast) { for (Map.Entry> entry : lexer.getNamedCaptureGroups().entrySet()) { for (int i = 0; i < entry.getValue().size() - 1; i++) { for (int j = i + 1; j < entry.getValue().size(); j++) { - if (canBothParticipate(ast.getGroup(entry.getValue().get(i)), ast.getGroup(entry.getValue().get(j)))) { - throw syntaxError(JsErrorMessages.MULTIPLE_GROUPS_SAME_NAME); + if (canBothParticipate(ast.getGroup(entry.getValue().get(i)).get(0), ast.getGroup(entry.getValue().get(j)).get(0))) { + throw syntaxError(JsErrorMessages.MULTIPLE_GROUPS_SAME_NAME, ErrorCode.InvalidNamedGroup); } } } @@ -306,7 +307,7 @@ private static boolean canBothParticipate(Group a, Group b) { throw CompilerDirectives.shouldNotReachHere("no common ancestor found for 
named capture groups in regexp"); } - private RegexSyntaxException syntaxError(String msg) { - return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition()); + private RegexSyntaxException syntaxError(String msg, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition(), errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexValidator.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexValidator.java index 99770375d5a..28ee901204c 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexValidator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/JSRegexValidator.java @@ -50,6 +50,7 @@ import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.errors.JsErrorMessages; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; @@ -134,14 +135,14 @@ private void parseDryRun() throws RegexSyntaxException { case quantifier: switch (curTermState) { case Null: - throw syntaxError(JsErrorMessages.QUANTIFIER_WITHOUT_TARGET); + throw syntaxError(JsErrorMessages.QUANTIFIER_WITHOUT_TARGET, ErrorCode.InvalidQuantifier); case LookAheadAssertion: if (flags.isEitherUnicode()) { - throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKAHEAD_ASSERTION); + throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKAHEAD_ASSERTION, ErrorCode.InvalidQuantifier); } break; case LookBehindAssertion: - throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKBEHIND_ASSERTION); + throw syntaxError(JsErrorMessages.QUANTIFIER_ON_LOOKBEHIND_ASSERTION, ErrorCode.InvalidQuantifier); case Other: break; } @@ -165,7 
+166,7 @@ private void parseDryRun() throws RegexSyntaxException { break; case groupEnd: if (syntaxStack.isEmpty()) { - throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS); + throw syntaxError(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } RegexStackElem poppedElem = syntaxStack.remove(syntaxStack.size() - 1); switch (poppedElem) { @@ -185,10 +186,10 @@ private void parseDryRun() throws RegexSyntaxException { } } if (lexer.inCharacterClass()) { - throw syntaxError(JsErrorMessages.UNMATCHED_LEFT_BRACKET); + throw syntaxError(JsErrorMessages.UNMATCHED_LEFT_BRACKET, ErrorCode.UnmatchedBracket); } if (!syntaxStack.isEmpty()) { - throw syntaxError(JsErrorMessages.UNTERMINATED_GROUP); + throw syntaxError(JsErrorMessages.UNTERMINATED_GROUP, ErrorCode.UnmatchedParenthesis); } checkNamedCaptureGroups(); } @@ -207,7 +208,7 @@ private void checkNamedCaptureGroups() { } } - private RegexSyntaxException syntaxError(String msg) { - return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition()); + private RegexSyntaxException syntaxError(String msg, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition(), errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTBuilder.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTBuilder.java index 39083fd466e..7c751c1d506 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTBuilder.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTBuilder.java @@ -44,6 +44,7 @@ import java.util.Arrays; import java.util.Comparator; +import com.oracle.truffle.regex.tregex.parser.ast.visitors.MarkAsDeadVisitor; import org.graalvm.collections.EconomicMap; import org.graalvm.collections.Equivalence; @@ -216,7 +217,7 @@ public void pushRootGroup(boolean 
rootCapture) { */ public RegexAST popRootGroup() { optimizeGroup(curGroup); - ast.getRoot().setEnclosedCaptureGroupsHigh(groupCount.getCount()); + ast.getRoot().setEnclosedCaptureGroupsHi(groupCount.getCount()); return ast; } @@ -332,7 +333,7 @@ private Group pushGroup(Token token, Group group, RegexASTSubtreeRootNode parent addSourceSection(group, token); setGroupStartPosition(group, token); curGroup = group; - curGroup.setEnclosedCaptureGroupsLow(groupCount.getCount()); + curGroup.setEnclosedCaptureGroupsLo(groupCount.getCount()); if (openFirstSequence) { nextSequence(); } else { @@ -361,7 +362,7 @@ public void popGroup(Token token) { ast.getNodeCount().dec(); } optimizeGroup(curGroup); - curGroup.setEnclosedCaptureGroupsHigh(groupCount.getCount()); + curGroup.setEnclosedCaptureGroupsHi(groupCount.getCount()); addSourceSection(curGroup, token); if (curGroup.getParent().isSubtreeRoot()) { addSourceSection(curGroup.getParent(), token); @@ -422,10 +423,20 @@ public void addCharClass(CodePointSet charSet, boolean wasSingleChar) { addCharClass(Token.createCharClass(charSet, wasSingleChar)); } + public void addCharClass(CodePointSet charSet, boolean wasSingleChar, SourceSection sourceSection) { + Token.CharacterClass charClass = Token.createCharClass(charSet, wasSingleChar); + charClass.setSourceSection(sourceSection); + addCharClass(charClass); + } + public void addCharClass(CodePointSet charSet) { addCharClass(charSet, charSet.matchesSingleChar()); } + public void addLiteralChar(Token.LiteralCharacter literalCharacter) { + addCharClass(CodePointSet.create(literalCharacter.getCodePoint()), true, literalCharacter.getSourceSection()); + } + private CodePointSet pruneCharClass(CodePointSet cps) { return encoding.getFullSet().createIntersection(cps, compilationBuffer); } @@ -448,8 +459,8 @@ private Term translateUnicodeCharClass(CodePointSet codePointSet, Token token, b return createCharClass(codePointSet, token, wasSingleChar); } Group group = ast.createGroup(); - 
group.setEnclosedCaptureGroupsLow(groupCount.getCount()); - group.setEnclosedCaptureGroupsHigh(groupCount.getCount()); + group.setEnclosedCaptureGroupsLo(groupCount.getCount()); + group.setEnclosedCaptureGroupsHi(groupCount.getCount()); IntRangesBuffer tmp = compilationBuffer.getIntRangesBuffer1(); CodePointSet bmpRanges = codePointSet.createIntersection(Constants.BMP_WITHOUT_SURROGATES, tmp); CodePointSet astralRanges = codePointSet.createIntersection(Constants.ASTRAL_SYMBOLS, tmp); @@ -556,6 +567,10 @@ public void addClassSet(Token.ClassSet token, CaseFoldData.CaseFoldUnfoldAlgorit CodePointSetAccumulator buf = compilationBuffer.getCodePointSetAccumulator1(); ClassSetContents contents = token.getContents(); + if (contents.isEmpty()) { + addCharClass(CodePointSet.getEmpty()); + return; + } pushGroup(false); String[] sortedStrings = new String[contents.getStrings().size()]; @@ -812,11 +827,11 @@ private void setQuantifier(QuantifiableTerm term, Token.Quantifier quantifier) { private Group wrapTermInGroup(Term term) { Group wrapperGroup = ast.createGroup(); if (term.isGroup()) { - wrapperGroup.setEnclosedCaptureGroupsLow(term.asGroup().getCaptureGroupsLow()); - wrapperGroup.setEnclosedCaptureGroupsHigh(term.asGroup().getCaptureGroupsHigh()); + wrapperGroup.setEnclosedCaptureGroupsLo(term.asGroup().getCaptureGroupsLo()); + wrapperGroup.setEnclosedCaptureGroupsHi(term.asGroup().getCaptureGroupsHi()); } else if (term.isAtomicGroup()) { - wrapperGroup.setEnclosedCaptureGroupsLow(term.asAtomicGroup().getEnclosedCaptureGroupsLow()); - wrapperGroup.setEnclosedCaptureGroupsHigh(term.asAtomicGroup().getEnclosedCaptureGroupsHigh()); + wrapperGroup.setEnclosedCaptureGroupsLo(term.asAtomicGroup().getEnclosedCaptureGroupsLow()); + wrapperGroup.setEnclosedCaptureGroupsHi(term.asAtomicGroup().getEnclosedCaptureGroupsHigh()); } Sequence wrapperSequence = wrapperGroup.addSequence(ast); term.getParent().asSequence().replace(term.getSeqIndex(), wrapperGroup); @@ -844,6 +859,7 @@ 
public void addCopy(Token token, Group sourceGroup) { */ public void removeCurTerm() { ast.getNodeCount().dec(countVisitor.count(curSequence.getLastTerm())); + MarkAsDeadVisitor.markAsDead(curSequence.getLastTerm()); curSequence.removeLastTerm(); curTerm = curSequence.isEmpty() ? null : curSequence.getLastTerm(); } @@ -968,7 +984,7 @@ public void addWordNonBoundaryAssertionPython(CodePointSet wordChars, CodePointS /* optimizations */ private void optimizeGroup(Group group) { - if (group.isConditionalBackReferenceGroup()) { + if (group.isConditionalBackReferenceGroup() || group.isInLookBehindAssertion()) { return; } sortAlternatives(group); @@ -1103,9 +1119,9 @@ private void mergeCommonPrefixes(Group group) { copy.add(t); if (t.isGroup()) { Group g = t.asGroup(); - if (g.getEnclosedCaptureGroupsLow() != g.getEnclosedCaptureGroupsHigh()) { - enclosedCGLo = Math.min(enclosedCGLo, g.getEnclosedCaptureGroupsLow()); - enclosedCGHi = Math.max(enclosedCGHi, g.getEnclosedCaptureGroupsHigh()); + if (g.getEnclosedCaptureGroupsLo() != g.getEnclosedCaptureGroupsHi()) { + enclosedCGLo = Math.min(enclosedCGLo, g.getEnclosedCaptureGroupsLo()); + enclosedCGHi = Math.max(enclosedCGHi, g.getEnclosedCaptureGroupsHi()); } if (g.isCapturing()) { enclosedCGLo = Math.min(enclosedCGLo, g.getGroupNumber()); @@ -1117,8 +1133,8 @@ private void mergeCommonPrefixes(Group group) { } } if (enclosedCGLo != Integer.MAX_VALUE) { - innerGroup.setEnclosedCaptureGroupsLow(enclosedCGLo); - innerGroup.setEnclosedCaptureGroupsHigh(enclosedCGHi); + innerGroup.setEnclosedCaptureGroupsLo(enclosedCGLo); + innerGroup.setEnclosedCaptureGroupsHi(enclosedCGHi); } if (!innerGroup.isEmpty() && !(innerGroup.size() == 1 && innerGroup.getFirstAlternative().isEmpty())) { optimizeGroup(innerGroup); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTPostProcessor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTPostProcessor.java 
index ddb56803a78..be8605c65b9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTPostProcessor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexASTPostProcessor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -49,6 +49,7 @@ import com.oracle.truffle.regex.tregex.TRegexOptions; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.BackReference; +import com.oracle.truffle.regex.tregex.parser.ast.CalcASTFlagsVisitor; import com.oracle.truffle.regex.tregex.parser.ast.CalcASTPropsVisitor; import com.oracle.truffle.regex.tregex.parser.ast.CharacterClass; import com.oracle.truffle.regex.tregex.parser.ast.Group; @@ -56,6 +57,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.PositionAssertion; import com.oracle.truffle.regex.tregex.parser.ast.QuantifiableTerm; import com.oracle.truffle.regex.tregex.parser.ast.RegexAST; +import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; import com.oracle.truffle.regex.tregex.parser.ast.Sequence; import com.oracle.truffle.regex.tregex.parser.ast.SubexpressionCall; import com.oracle.truffle.regex.tregex.parser.ast.Term; @@ -64,6 +66,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.visitors.InitIDVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.MarkLookBehindEntriesVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.NodeCountVisitor; +import com.oracle.truffle.regex.tregex.parser.ast.visitors.PropagateDeadFlagVisitor; import com.oracle.truffle.regex.tregex.string.Encodings; public class RegexASTPostProcessor { @@ -81,6 +84,7 @@ public RegexASTPostProcessor(RegexAST ast, 
CompilationBuffer compilationBuffer) } public void prepareForDFA() { + CalcASTFlagsVisitor.run(ast); if (ast.getOptions().isBooleanMatch()) { DisableCaptureGroupsVisitor.disableCaptureGroups(ast); } @@ -89,6 +93,7 @@ public void prepareForDFA() { UnrollQuantifiersVisitor.unrollQuantifiers(ast); } CalcASTPropsVisitor.run(ast, compilationBuffer); + PropagateDeadFlagVisitor.propagateDeadFlag(ast.getRoot()); ast.createPrefix(); InitIDVisitor.init(ast); if (ast.canTransformToDFA()) { @@ -160,7 +165,7 @@ protected void visit(CharacterClass characterClass) { @Override protected void leave(Group group) { if (group.hasQuantifier()) { - quantifierExpander.expandQuantifier(group, shouldUnroll(group) && shouldUnrollVisitor.shouldUnroll(group)); + quantifierExpander.expandQuantifier(group, group.getQuantifier().isUnrollTrivial() || shouldUnroll(group) && shouldUnrollVisitor.shouldUnroll(group)); } } @@ -204,6 +209,7 @@ private static final class QuantifierExpander { private final RegexAST ast; private final CopyVisitor copyVisitor; + private final ClearRegisteredCaptureGroupsVisitor clearRegisteredCaptureGroupsVisitor; private Group curGroup; private Sequence curSequence; private Term curTerm; @@ -211,6 +217,7 @@ private static final class QuantifierExpander { QuantifierExpander(RegexAST ast) { this.ast = ast; this.copyVisitor = new CopyVisitor(ast); + this.clearRegisteredCaptureGroupsVisitor = new ClearRegisteredCaptureGroupsVisitor(ast); } private void pushGroup() { @@ -249,40 +256,49 @@ private void addTermCopyAsGroup(Term term) { addTerm(copyVisitor.copy(term)); popGroup(); if (term.isGroup()) { - curTerm.asGroup().setEnclosedCaptureGroupsLow(term.asGroup().getCaptureGroupsLow()); - curTerm.asGroup().setEnclosedCaptureGroupsHigh(term.asGroup().getCaptureGroupsHigh()); + curTerm.asGroup().setEnclosedCaptureGroupsLo(term.asGroup().getCaptureGroupsLo()); + curTerm.asGroup().setEnclosedCaptureGroupsHi(term.asGroup().getCaptureGroupsHi()); } } } - private void 
createOptionalBranch(QuantifiableTerm term, Token.Quantifier quantifier, boolean unroll, int recurse) { + private void createOptionalBranch(QuantifiableTerm term, Token.Quantifier quantifier, boolean unroll, boolean mandatory, boolean optional, int recurse) { // We wrap the quantified term in a group, as NFATraversalRegexASTVisitor is set up // to expect quantifier guards only on group boundaries. + if (term.isInLookBehindAssertion()) { + createOptional(term, quantifier, unroll, mandatory, optional, recurse - 1); + } addTermCopyAsGroup(term); curTerm.asGroup().setQuantifier(quantifier); curTerm.setExpandedQuantifier(unroll); - curTerm.setMandatoryUnrolledQuantifier(false); + curTerm.setMandatoryQuantifier(mandatory); + curTerm.setOptionalQuantifier(optional); curTerm.setEmptyGuard(true); - createOptional(term, quantifier, unroll, recurse - 1); + if (!term.isInLookBehindAssertion()) { + createOptional(term, quantifier, unroll, mandatory, optional, recurse - 1); + } } - private void createOptional(QuantifiableTerm term, Token.Quantifier quantifier, boolean unroll, int recurse) { + private void createOptional(QuantifiableTerm term, Token.Quantifier quantifier, boolean unroll, boolean mandatory, boolean optional, int recurse) { if (recurse < 0) { return; } pushGroup(); if (term.isGroup()) { - curGroup.setEnclosedCaptureGroupsLow(term.asGroup().getCaptureGroupsLow()); - curGroup.setEnclosedCaptureGroupsHigh(term.asGroup().getCaptureGroupsHigh()); + curGroup.setEnclosedCaptureGroupsLo(term.asGroup().getCaptureGroupsLo()); + curGroup.setEnclosedCaptureGroupsHi(term.asGroup().getCaptureGroupsHi()); } - if (quantifier.isGreedy()) { - createOptionalBranch(term, quantifier, unroll, recurse); + if (quantifier.isGreedy() || mandatory) { + createOptionalBranch(term, quantifier, unroll, mandatory, optional, recurse); nextSequence(); curSequence.setQuantifierPassThroughSequence(true); } else { curSequence.setQuantifierPassThroughSequence(true); nextSequence(); - 
createOptionalBranch(term, quantifier, unroll, recurse); + createOptionalBranch(term, quantifier, unroll, false, optional, recurse); + } + if (!unroll && !mandatory && recurse == 0) { + curGroup.setLoop(true); } popGroup(); } @@ -290,6 +306,7 @@ private void createOptional(QuantifiableTerm term, Token.Quantifier quantifier, private void expandQuantifier(QuantifiableTerm toExpand, boolean unroll) { assert toExpand.hasQuantifier(); assert !unroll || toExpand.isUnrollingCandidate(); + clearRegisteredCaptureGroupsVisitor.clear(toExpand); Token.Quantifier quantifier = toExpand.getQuantifier(); toExpand.setQuantifier(null); @@ -300,17 +317,33 @@ private void expandQuantifier(QuantifiableTerm toExpand, boolean unroll) { // replace the term to expand with a new wrapper group replaceCurTermWithNewGroup(); + boolean mandatoryOptionalSplit = !unroll && !ast.getFlavor().emptyChecksOnMandatoryLoopIterations() && quantifier.getMin() > 0 && toExpand.mayMatchEmptyString(); + + if (toExpand.isInLookBehindAssertion()) { + unrollOptional(toExpand, quantifier, unroll, mandatoryOptionalSplit); + unrollMandatory(toExpand, quantifier, unroll, mandatoryOptionalSplit); + } else { + unrollMandatory(toExpand, quantifier, unroll, mandatoryOptionalSplit); + unrollOptional(toExpand, quantifier, unroll, mandatoryOptionalSplit); + } + } + + private void unrollMandatory(QuantifiableTerm toExpand, Token.Quantifier quantifier, boolean unroll, boolean mandatoryOptionalSplit) { // unroll mandatory part ( x{3} -> xxx ) if (unroll) { - // unroll non-optional part ( x{3} -> xxx ) for (int i = 0; i < quantifier.getMin(); i++) { addTermCopyAsGroup(toExpand); curTerm.asGroup().setQuantifier(quantifier); curTerm.setExpandedQuantifier(true); - curTerm.setMandatoryUnrolledQuantifier(true); + curTerm.setMandatoryQuantifier(true); } + } else if (mandatoryOptionalSplit) { + createOptional(toExpand, quantifier, false, true, false, 0); + ((Group) curTerm).setLoop(true); } + } + private void 
unrollOptional(QuantifiableTerm toExpand, Token.Quantifier quantifier, boolean unroll, boolean mandatoryOptionalSplit) { // unroll optional part ( x{0,3} -> (x(x(x|)|)|) ) // In flavors like Python or Ruby, loops can be repeated past the point where the // position in the string keeps advancing (i.e. we are matching at least one @@ -319,9 +352,13 @@ private void expandQuantifier(QuantifiableTerm toExpand, boolean unroll) { // iteration is run because there is no backtracking after failing the empty check. // We can emulate this behavior by dropping empty guards in small bounded loops, // such as is the case for unrolled loops. - createOptional(toExpand, quantifier, unroll, !unroll || quantifier.isInfiniteLoop() ? 0 : (quantifier.getMax() - quantifier.getMin()) - 1); - if (!unroll || quantifier.isInfiniteLoop()) { - ((Group) curTerm).setLoop(true); + if (unroll) { + createOptional(toExpand, quantifier, true, false, false, quantifier.isInfiniteLoop() ? 0 : quantifier.getMax() - quantifier.getMin() - 1); + if (quantifier.isInfiniteLoop()) { + ((Group) curTerm).setLoop(true); + } + } else if (quantifier.isInfiniteLoop() || quantifier.getMax() > quantifier.getMin() || !mandatoryOptionalSplit) { + createOptional(toExpand, quantifier, false, false, mandatoryOptionalSplit, 0); } } } @@ -395,25 +432,39 @@ private static LookAroundOptimization replace(Term replacement) { private LookAroundOptimization optimizeLookAround(LookAroundAssertion lookaround) { Group group = lookaround.getGroup(); - // Drop empty lookarounds: - // * (?=) -> NOP - // * (?<=) -> NOP - // * (?!) -> DEAD - // * (? 
DEAD - if (group.size() == 1 && group.getFirstAlternative().isEmpty()) { - if (lookaround.isNegated()) { - // empty negative lookarounds never match - ast.getNodeCount().dec(countVisitor.count(lookaround)); - return LookAroundOptimization.replace(ast.createCharacterClass(CodePointSet.getEmpty())); - } else { - // empty positive lookarounds are no-ops - ast.getNodeCount().dec(countVisitor.count(lookaround)); - return LookAroundOptimization.NO_OP; + // Simplify lookarounds with empty branches: + boolean hasCaptureGroups = false; + for (int i = 0; i < group.size(); i++) { + Sequence s = group.getAlternatives().get(i); + // we also check for s.isEmpty here, because a previous lookaround optimization may + // already have removed the capture groups, and we don't re-run CalcAstPropsVisitor + // between these optimizations. + // Example: in /(?<=(?=|()))/, we first remove the inner lookahead, so the outer + // lookbehind is empty but still has the hasGroups flag set. + hasCaptureGroups |= s.hasCaptureGroups() && !s.isEmpty(); + if (s.isEmpty()) { + if (lookaround.isNegated()) { + // negative lookarounds with empty branches never match + ast.getNodeCount().dec(countVisitor.count(lookaround)); + return LookAroundOptimization.replace(ast.createCharacterClass(CodePointSet.getEmpty())); + } else { + // positive lookarounds with empty branches are no-ops, but we still have to + // keep higher priority branches if they have capture groups + if (hasCaptureGroups) { + if (group.size() > i + 1) { + group.getAlternatives().subList(i + 1, group.size()).clear(); + } + break; + } else { + ast.getNodeCount().dec(countVisitor.count(lookaround)); + return LookAroundOptimization.NO_OP; + } + } } } // Extract position assertions from positive lookarounds - if (!lookaround.isNegated()) { + if (!lookaround.isNegated() && !lookaround.hasCaptureGroups()) { if (group.size() == 1 && group.getFirstAlternative().size() == 1 && group.getFirstAlternative().getFirstTerm().isPositionAssertion()) { // 
unwrap positive lookarounds containing only a position assertion // * (?=$) -> $ @@ -436,10 +487,11 @@ private LookAroundOptimization optimizeLookAround(LookAroundAssertion lookaround if (innerPositionAssertion >= 0) { Sequence removed = group.getAlternatives().remove(innerPositionAssertion); Group wrapGroup = ast.createGroup(); - wrapGroup.setEnclosedCaptureGroupsLow(group.getCaptureGroupsLow()); - wrapGroup.setEnclosedCaptureGroupsHigh(group.getCaptureGroupsHigh()); + wrapGroup.setEnclosedCaptureGroupsLo(group.getCaptureGroupsLo()); + wrapGroup.setEnclosedCaptureGroupsHi(group.getCaptureGroupsHi()); wrapGroup.add(removed); Sequence wrapSeq = wrapGroup.addSequence(ast); + assert !group.isEmpty(); wrapSeq.add(lookaround); return LookAroundOptimization.replace(wrapGroup); } @@ -488,9 +540,30 @@ public static void disableCaptureGroups(RegexAST ast) { @Override protected void visit(Group group) { - if (group.isCapturing() && !ast.isGroupReferenced(group.getGroupNumber())) { + if (group.isCapturing() && !ast.isGroupReferenced(group.getGroupNumber()) && + !(group.getGroupNumber() == 0 && (ast.getProperties().hasMatchBoundaryAssertions() || ast.getOptions().isMustAdvance()))) { group.clearGroupNumber(); } } } + + private static final class ClearRegisteredCaptureGroupsVisitor extends DepthFirstTraversalRegexASTVisitor { + + private final RegexAST ast; + + private ClearRegisteredCaptureGroupsVisitor(RegexAST ast) { + this.ast = ast; + } + + public void clear(RegexASTNode root) { + run(root); + } + + @Override + protected void visit(Group group) { + if (group.isCapturing()) { + ast.clearRegisteredCaptureGroups(group.getGroupNumber()); + } + } + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java index 75d82d3219d..d60f5f036a9 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexLexer.java @@ -46,6 +46,7 @@ import java.util.Map; import java.util.function.IntPredicate; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import org.graalvm.collections.EconomicSet; import com.oracle.truffle.api.ArrayUtils; @@ -760,16 +761,19 @@ protected Token literalChar(int codePoint) { private Token charClass(CodePointSet codePointSet) { if (featureEnabledIgnoreCase()) { - curCharClass.clear(); - curCharClass.addSet(codePointSet); - boolean wasSingleChar = curCharClass.matchesSingleChar(); - caseFoldUnfold(curCharClass); - return Token.createCharClass(curCharClass.toCodePointSet(), wasSingleChar); + return Token.createCharClass(caseFoldUnfold(codePointSet), codePointSet.matchesSingleChar()); } else { return Token.createCharClass(codePointSet); } } + protected CodePointSet caseFoldUnfold(CodePointSet codePointSet) { + curCharClass.clear(); + curCharClass.addSet(codePointSet); + caseFoldUnfold(curCharClass); + return curCharClass.toCodePointSet(); + } + /* lexer */ private Token getNext() throws RegexSyntaxException { @@ -1253,8 +1257,9 @@ protected ClassSetContents parseClassSetExpression() throws RegexSyntaxException } } + boolean atStart = position == startPos; ClassSetOperator newOperator = parseClassSetOperator(); - if (position == startPos) { + if (atStart) { if (newOperator != ClassSetOperator.Union) { throw handleMissingClassSetOperand(newOperator); } @@ -1356,11 +1361,11 @@ private ClassSetContents parseClassSetStrings(char c) { strings.add(string); } if (atEnd()) { - throw syntaxError(JsErrorMessages.UNTERMINATED_STRING_SET); + throw syntaxError(JsErrorMessages.UNTERMINATED_STRING_SET, ErrorCode.InvalidCharacterClass); } } while (consumingLookahead('|')); if (atEnd()) { - throw syntaxError(JsErrorMessages.UNTERMINATED_STRING_SET); + throw 
syntaxError(JsErrorMessages.UNTERMINATED_STRING_SET, ErrorCode.InvalidCharacterClass); } assert curChar() == '}'; advance(); @@ -1387,14 +1392,14 @@ private String parseClassSetString() { protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws RegexSyntaxException { if (!consumingLookahead("{")) { - throw syntaxError(JsErrorMessages.INVALID_UNICODE_PROPERTY); + throw syntaxError(JsErrorMessages.INVALID_UNICODE_PROPERTY, ErrorCode.InvalidCharacterClass); } int namePos = position; while (!atEnd() && curChar() != '}') { advance(); } if (!consumingLookahead("}")) { - throw syntaxError(JsErrorMessages.ENDS_WITH_UNFINISHED_UNICODE_PROPERTY); + throw syntaxError(JsErrorMessages.ENDS_WITH_UNFINISHED_UNICODE_PROPERTY, ErrorCode.InvalidCharacterClass); } try { String propertyName = pattern.substring(namePos, position - 1); @@ -1415,7 +1420,7 @@ protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws return ClassSetContents.createCharacterClass(invert ? 
propertySet.createInverse(encoding) : propertySet); } } catch (IllegalArgumentException e) { - throw syntaxError(e.getMessage()); + throw syntaxError(e.getMessage(), ErrorCode.InvalidCharacterClass); } } @@ -1512,8 +1517,8 @@ private boolean isEscapeCharClass(char c) { return isPredefCharClass(c) || (featureEnabledUnicodePropertyEscapes() && (c == 'p' || c == 'P')); } - public RegexSyntaxException syntaxError(String msg) { - return RegexSyntaxException.createPattern(source, msg, getLastAtomPosition()); + public RegexSyntaxException syntaxError(String msg, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, getLastAtomPosition(), errorCode); } public static boolean isDecimalDigit(int c) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexProperties.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexProperties.java index ab87144fab5..7f1e4769015 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexProperties.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/RegexProperties.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -63,10 +63,10 @@ public class RegexProperties implements JsonConvertible { private static final int FLAG_FIXED_CODEPOINT_WIDTH = 1 << 12; private static final int FLAG_CAPTURE_GROUPS_IN_LOOK_AROUND_ASSERTIONS = 1 << 13; private static final int FLAG_EMPTY_CAPTURE_GROUPS = 1 << 14; - private static final int FLAG_ATOMIC_GROUPS = 1 << 15; - private static final int FLAG_BACK_REFERENCES = 1 << 16; - private static final int FLAG_RECURSIVE_BACK_REFERENCES = 1 << 17; - private static final int FLAG_NESTED_LOOK_BEHIND_ASSERTIONS = 1 << 18; + private static final int FLAG_BACK_REFERENCES = 1 << 15; + private static final int FLAG_RECURSIVE_BACK_REFERENCES = 1 << 16; + private static final int FLAG_NESTED_LOOK_BEHIND_ASSERTIONS = 1 << 17; + private static final int FLAG_LOOK_AROUND_WITH_CAPTURE_GROUPS_NESTED_IN_QUANTIFIER = 1 << 18; private static final int FLAG_CONDITIONAL_BACKREFERENCES = 1 << 19; private static final int FLAG_CONDITIONAL_REFERENCES_INTO_LOOK_AHEADS = 1 << 20; private static final int FLAG_MATCH_BOUNDARY_ASSERTIONS = 1 << 21; @@ -111,14 +111,6 @@ public void setEmptyCaptureGroups() { setFlag(FLAG_EMPTY_CAPTURE_GROUPS); } - public boolean hasAtomicGroups() { - return getFlag(FLAG_ATOMIC_GROUPS); - } - - public void setAtomicGroups() { - setFlag(FLAG_ATOMIC_GROUPS); - } - public boolean hasCharClasses() { return getFlag(FLAG_CHAR_CLASSES); } @@ -264,6 +256,14 @@ public void setNestedLookBehindAssertions() { setFlag(FLAG_NESTED_LOOK_BEHIND_ASSERTIONS); } + public boolean hasLookAroundWithCaptureGroupsNestedInQuantifier() { + return getFlag(FLAG_LOOK_AROUND_WITH_CAPTURE_GROUPS_NESTED_IN_QUANTIFIER); + } + + public void setLookAroundWithCaptureGroupsNestedInQuantifier() { + setFlag(FLAG_LOOK_AROUND_WITH_CAPTURE_GROUPS_NESTED_IN_QUANTIFIER); + } + public boolean hasConditionalBackReferences() { return getFlag(FLAG_CONDITIONAL_BACKREFERENCES); } @@ -303,6 +303,7 @@ public JsonValue toJson() { 
Json.prop("captureGroupsInLookAroundAssertions", hasCaptureGroupsInLookAroundAssertions()), Json.prop("backReferences", hasBackReferences()), Json.prop("nestedLookBehindAssertions", hasNestedLookBehindAssertions()), + Json.prop("lookAroundWithCaptureGroupsNestedInQuantifier", hasLookAroundWithCaptureGroupsNestedInQuantifier()), Json.prop("conditionalBackReferences", hasConditionalBackReferences()), Json.prop("conditionalReferencesIntoLookAheads", hasConditionalReferencesIntoLookAheads())); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java index 0c0fba7ae4e..7e9318e1560 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/Token.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -297,6 +297,10 @@ public boolean isGreedy() { return greedy; } + public boolean isLazy() { + return !greedy; + } + public boolean isPossessive() { return possessive; } @@ -333,6 +337,14 @@ public void setZeroWidthIndex(int zeroWidthIndex) { this.zeroWidthIndex = zeroWidthIndex; } + /** + * Returns {@code true} if {@link #getMax()} is infinite or greater than the given + * threshold. + */ + public boolean isMaxGreaterThan(int threshold) { + return Integer.compareUnsigned(max, threshold) > 0; + } + /** * Returns {@code true} iff both {@link #getMin()} and {@link #getMax()} are less or equal * to the given threshold, or infinite {@link #isInfiniteLoop()}. 
diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/AtomicGroup.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/AtomicGroup.java index ff5db6105c0..449359ab683 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/AtomicGroup.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/AtomicGroup.java @@ -83,14 +83,14 @@ public Term copyRecursive(RegexAST ast, CompilationBuffer compilationBuffer) { * Gets the (inclusive) lower bound of the range of capture groups contained within this group. */ public int getEnclosedCaptureGroupsLow() { - return getGroup().getEnclosedCaptureGroupsLow(); + return getGroup().getEnclosedCaptureGroupsLo(); } /** * Gets the (exclusive) upper bound of the range of capture groups contained within this group. */ public int getEnclosedCaptureGroupsHigh() { - return getGroup().getEnclosedCaptureGroupsHigh(); + return getGroup().getEnclosedCaptureGroupsHi(); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java index 06c7bf95ab8..e3aa903f008 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/BackReference.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -173,6 +173,6 @@ public String toString() { @TruffleBoundary @Override public JsonValue toJson() { - return toJson("BackReference").append(Json.prop("groupNumbers", Arrays.stream(groupNumbers).mapToObj(x -> Json.val(x)))); + return toJson("BackReference").append(Json.prop("groupNumbers", Arrays.stream(groupNumbers).mapToObj(Json::val))); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTFlagsVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTFlagsVisitor.java new file mode 100644 index 00000000000..4285ec01c5d --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTFlagsVisitor.java @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, 
import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package com.oracle.truffle.regex.tregex.parser.ast; + +import static com.oracle.truffle.regex.tregex.parser.ast.CalcASTPropsVisitor.OR_FLAGS; +import static com.oracle.truffle.regex.tregex.parser.ast.CalcASTPropsVisitor.setFlagsLookAroundAssertion; + +import com.oracle.truffle.regex.tregex.parser.ast.visitors.DepthFirstTraversalRegexASTVisitor; + +/** + * Reduced version of {@link CalcASTPropsVisitor}. This version calculates + * {@link CalcASTPropsVisitor#OR_FLAGS} and {@link RegexASTNode#mayMatchEmptyString()} only. 
+ */ +public class CalcASTFlagsVisitor extends DepthFirstTraversalRegexASTVisitor { + + private static final int OR_FLAGS_GROUP = OR_FLAGS | RegexASTNode.FLAG_MAY_MATCH_EMPTY_STRING; + private final RegexAST ast; + + public CalcASTFlagsVisitor(RegexAST ast) { + this.ast = ast; + } + + public static void run(RegexAST ast) { + CalcASTFlagsVisitor visitor = new CalcASTFlagsVisitor(ast); + visitor.run(ast.getRoot()); + } + + @Override + protected void visit(BackReference backReference) { + backReference.setHasBackReferences(); + backReference.getParent().setHasBackReferences(); + backReference.setMayMatchEmptyString(true); + } + + @Override + protected void visit(Sequence sequence) { + sequence.setMayMatchEmptyString(true); + } + + @Override + protected void leave(Group group) { + int flags = 0; + for (Sequence s : group.getAlternatives()) { + flags |= s.getFlags(OR_FLAGS_GROUP); + } + if (group.isLoop()) { + flags |= RegexASTNode.FLAG_HAS_LOOPS; + } + if (group.isCapturing()) { + flags |= RegexASTNode.FLAG_HAS_CAPTURE_GROUPS; + } + group.setFlags(flags, OR_FLAGS_GROUP); + if (group.getParent() != null) { + if (!group.mayMatchEmptyString() && !group.hasMin0Quantifier()) { + group.getParent().setMayMatchEmptyString(false); + } + group.getParent().setFlags(group.getParent().getFlags(OR_FLAGS) | (flags & ~RegexASTNode.FLAG_MAY_MATCH_EMPTY_STRING), OR_FLAGS); + } + } + + @Override + protected void visit(CharacterClass characterClass) { + if (!characterClass.hasMin0Quantifier()) { + characterClass.getParent().setMayMatchEmptyString(false); + } + } + + @Override + protected void visit(PositionAssertion assertion) { + switch (assertion.type) { + case CARET -> assertion.getParent().setHasCaret(); + case DOLLAR -> assertion.getParent().setHasDollar(); + case MATCH_BEGIN, MATCH_END -> ast.getProperties().setMatchBoundaryAssertions(); + } + } + + @Override + protected void visit(LookBehindAssertion assertion) { + assertion.setHasLookBehinds(); + 
assertion.getParent().setHasLookBehinds(); + } + + @Override + protected void leave(LookBehindAssertion assertion) { + setFlagsLookAroundAssertion(assertion); + } + + @Override + protected void visit(LookAheadAssertion assertion) { + assertion.setHasLookAheads(); + assertion.getParent().setHasLookAheads(); + } + + @Override + protected void leave(LookAheadAssertion assertion) { + setFlagsLookAroundAssertion(assertion); + } + + @Override + protected void leave(AtomicGroup atomicGroup) { + atomicGroup.setHasAtomicGroups(); + atomicGroup.getParent().setHasAtomicGroups(); + CalcASTPropsVisitor.setFlagsSubtreeRootNode(atomicGroup, OR_FLAGS); + if (!atomicGroup.mayMatchEmptyString()) { + atomicGroup.getParent().setMayMatchEmptyString(false); + } + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTPropsVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTPropsVisitor.java index 0987f51c4b2..3107f104977 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTPropsVisitor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/CalcASTPropsVisitor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -41,6 +41,7 @@ package com.oracle.truffle.regex.tregex.parser.ast; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.graalvm.collections.EconomicMap; @@ -48,10 +49,13 @@ import org.graalvm.collections.Equivalence; import com.oracle.truffle.api.CompilerDirectives; +import com.oracle.truffle.regex.RegexSyntaxException; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.Constants; +import com.oracle.truffle.regex.errors.PyErrorMessages; import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer; import com.oracle.truffle.regex.tregex.parser.ast.visitors.DepthFirstTraversalRegexASTVisitor; +import com.oracle.truffle.regex.tregex.parser.flavors.PythonFlavor; import com.oracle.truffle.regex.tregex.string.Encodings; /** @@ -124,31 +128,38 @@ public class CalcASTPropsVisitor extends DepthFirstTraversalRegexASTVisitor { * When processing a {@link Group}, these flags will be set in the group iff they are set in * all of its alternatives. */ - private static final int AND_FLAGS = RegexASTNode.FLAG_STARTS_WITH_CARET | RegexASTNode.FLAG_ENDS_WITH_DOLLAR | RegexASTNode.FLAG_DEAD; + static final int AND_FLAGS = RegexASTNode.FLAG_STARTS_WITH_CARET | RegexASTNode.FLAG_ENDS_WITH_DOLLAR | RegexASTNode.FLAG_DEAD; /** * When processing a {@link Group}, these flags will be set in the group iff they are set in * any of its alternatives. 
*/ - private static final int OR_FLAGS = RegexASTNode.FLAG_HAS_CARET | + static final int OR_FLAGS = RegexASTNode.FLAG_HAS_CARET | RegexASTNode.FLAG_HAS_DOLLAR | + RegexASTNode.FLAG_HAS_ATOMIC_GROUPS | RegexASTNode.FLAG_HAS_LOOPS | RegexASTNode.FLAG_HAS_QUANTIFIERS | RegexASTNode.FLAG_HAS_CAPTURE_GROUPS | RegexASTNode.FLAG_HAS_LOOK_AHEADS | RegexASTNode.FLAG_HAS_LOOK_BEHINDS | RegexASTNode.FLAG_HAS_BACK_REFERENCES; - private static final int CHANGED_FLAGS = AND_FLAGS | OR_FLAGS; + static final int CHANGED_FLAGS = AND_FLAGS | OR_FLAGS; private final RegexAST ast; + private final int[] captureGroupsMinWidth; + private final int[] captureGroupsMaxWidth; private final CompilationBuffer compilationBuffer; private final EconomicMap> conditionalBackReferences; private final EconomicMap> conditionGroups; public CalcASTPropsVisitor(RegexAST ast, CompilationBuffer compilationBuffer) { this.ast = ast; + this.captureGroupsMinWidth = new int[ast.getNumberOfCaptureGroups()]; + this.captureGroupsMaxWidth = new int[ast.getNumberOfCaptureGroups()]; this.compilationBuffer = compilationBuffer; this.conditionalBackReferences = EconomicMap.create(ast.getConditionGroups().numberOfSetBits()); this.conditionGroups = EconomicMap.create(ast.getConditionGroups().numberOfSetBits()); + Arrays.fill(captureGroupsMinWidth, -1); + Arrays.fill(captureGroupsMaxWidth, -1); } public static void run(RegexAST ast, CompilationBuffer compilationBuffer) { @@ -173,8 +184,46 @@ protected void visit(BackReference backReference) { } backReference.setHasBackReferences(); backReference.getParent().setHasBackReferences(); - if (backReference.hasQuantifier()) { - // TODO: maybe check if the referenced group can produce a zero-width match + + int minWidth = 0; + if (ast.getFlavor().backreferencesToUnmatchedGroupsFail()) { + /* + * Calculate the back-reference's min and max path by checking the referenced group's + * min and max width. 
This is useful only if + * ast.getFlavor().backreferencesToUnmatchedGroupsFail(), because otherwise + * back-references to groups that have not been matched yet will always match the empty + * string, and we would have to calculate the expression's dominator tree to check + * whether it is possible to reach the back-reference without matching the referenced + * group. + */ + minWidth = Integer.MAX_VALUE; + int maxWidth = 0; + boolean isDead = true; + for (int groupNumber : backReference.getGroupNumbers()) { + if (ast.getGroup(groupNumber).stream().allMatch(RegexASTNode::isDead)) { + continue; + } else { + isDead = false; + } + if (captureGroupsMinWidth[groupNumber] < 0) { + assert isReverse(); + minWidth = 0; + maxWidth = 0; + break; + } + minWidth = Math.min(minWidth, captureGroupsMinWidth[groupNumber]); + maxWidth = Math.max(maxWidth, captureGroupsMaxWidth[groupNumber]); + } + if (isDead) { + backReference.markAsDead(); + backReference.getParent().markAsDead(); + return; + } + backReference.getParent().incMinPath(minWidth); + backReference.getParent().incMaxPath(maxWidth); + } + backReference.setMayMatchEmptyString(minWidth == 0); + if (minWidth == 0 && backReference.hasQuantifier()) { setZeroWidthQuantifierIndex(backReference); } if (backReference.hasNotUnrolledQuantifier()) { @@ -187,6 +236,7 @@ protected void visit(BackReference backReference) { @Override protected void visit(Group group) { + clearORFlags(group); if (group.getParent().isSequence() || group.getParent().isAtomicGroup()) { group.setMinPath(group.getParent().getMinPath()); group.setMaxPath(group.getParent().getMaxPath()); @@ -195,6 +245,9 @@ protected void visit(Group group) { group.setMinPath(0); group.setMaxPath(0); } + if (isForward() && group.hasQuantifier()) { + group.setEnclosedZeroWidthGroupsLo(ast.getGroupsWithGuards().size()); + } } @Override @@ -225,12 +278,6 @@ protected void leave(Group group) { conditionalBackReferences.get(referencedGroupNumber).add(group); } } - if (group.isDead())
{ - if (group.getParent() != null) { - group.getParent().markAsDead(); - } - return; - } int minPath = Integer.MAX_VALUE; int maxPath = 0; int prefixLengthMin = 0; @@ -240,6 +287,12 @@ protected void leave(Group group) { if (s.isDead()) { continue; } + if (s.isQuantifierPassThroughSequence()) { + QuantifiableTerm term = s.quantifierPassThroughGetQuantifiedTerm(); + if (!term.isExpandedQuantifier() && !term.isOptionalQuantifier() && term.getQuantifier().getMin() > 0) { + continue; + } + } flags = (flags & (s.getFlags(AND_FLAGS) | ~AND_FLAGS)) | s.getFlags(OR_FLAGS); minPath = Math.min(minPath, s.getMinPath()); maxPath = Math.max(maxPath, s.getMaxPath()); @@ -248,11 +301,35 @@ protected void leave(Group group) { prefixLengthMax = Math.max(prefixLengthMax, s.getPrefixLengthMax()); } } + if ((flags & RegexASTNode.FLAG_DEAD) != 0) { + group.markAsDead(); + if (group.getParent() != null) { + group.getParent().markAsDead(); + } + return; + } + if (group.isCapturing()) { + captureGroupsMinWidth[group.getGroupNumber()] = minPath - group.getMinPath(); + captureGroupsMaxWidth[group.getGroupNumber()] = maxPath - group.getMaxPath(); + flags |= RegexASTNode.FLAG_HAS_CAPTURE_GROUPS; + if (group.getMinPath() == minPath && group.getMaxPath() == maxPath) { + ast.getProperties().setEmptyCaptureGroups(); + } + } if (group.hasQuantifier()) { + /* + * If a quantifier can produce a zero-width match, we have to check this in + * back-tracking mode. In flavors more complex than JS (where empty loop iterations can + * be admitted), we have to check this at all times. In JS, we can afford to only do + * this check when the expression contains back-references or lookarounds. 
+ */ + if (minPath - group.getMinPath() == 0 || ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups()) { + setZeroWidthQuantifierIndex(group); + } if (!group.isExpandedQuantifier()) { flags |= RegexASTNode.FLAG_HAS_QUANTIFIERS; setQuantifierIndex(group); - if (group.getQuantifier().getMin() == 0) { + if (group.getQuantifier().getMin() == 0 || group.isOptionalQuantifier()) { flags &= ~(RegexASTNode.FLAG_STARTS_WITH_CARET | RegexASTNode.FLAG_ENDS_WITH_DOLLAR); } /* @@ -261,28 +338,16 @@ protected void leave(Group group) { * summed up with min and max path of the group, so sequence.minPath - group.minPath * is the sequence's "own" minPath */ - minPath = group.getMinPath() + ((minPath - group.getMinPath()) * group.getQuantifier().getMin()); + minPath = group.getMinPath() + ((minPath - group.getMinPath()) * (group.isOptionalQuantifier() ? 0 : group.getQuantifier().getMin())); if (group.getQuantifier().isInfiniteLoop()) { flags |= RegexASTNode.FLAG_HAS_LOOPS; + // Just increase maxPath by one loop iteration; It's enough to determine + // whether a given sub-expression is fixed-width. + maxPath = group.getMaxPath() + ((maxPath - group.getMaxPath()) * (group.getQuantifier().getMin() + 1)); } else { maxPath = group.getMaxPath() + ((maxPath - group.getMaxPath()) * group.getQuantifier().getMax()); } } - /* - * If a quantifier can produce a zero-width match, we have to check this in - * back-tracking mode. In flavors more complex than JS (where empty loop iterations can - * be admitted), we have to check this at all times. In JS, we can afford to only do - * this check when the expression contains back-references or lookarounds. 
- */ - if (minPath - group.getMinPath() == 0) { - setZeroWidthQuantifierIndex(group); - } - } - if (group.isCapturing()) { - flags |= RegexASTNode.FLAG_HAS_CAPTURE_GROUPS; - if (group.getMinPath() == minPath && group.getMaxPath() == maxPath) { - ast.getProperties().setEmptyCaptureGroups(); - } } group.setFlags(flags, CHANGED_FLAGS); group.setMinPath(minPath); @@ -304,13 +369,17 @@ protected void leave(Group group) { group.getParent().setPrefixLengthMax(prefixLengthMax); } } - if (isForward() && (group.hasEmptyGuard() || group.isLoop())) { + if (isForward() && group.hasQuantifier()) { + group.setEnclosedZeroWidthGroupsHi(ast.getGroupsWithGuards().size()); + } + if (isForward() && (group.hasEmptyGuard() || group.isLoop() || group.hasQuantifier())) { ast.registerGroupWithGuards(group); } } @Override protected void visit(Sequence sequence) { + clearORFlags(sequence); sequence.setMinPath(sequence.getParent().getMinPath()); sequence.setMaxPath(sequence.getParent().getMaxPath()); } @@ -372,10 +441,6 @@ protected void visit(PositionAssertion assertion) { } } break; - case MATCH_BEGIN: - case MATCH_END: - ast.getProperties().setMatchBoundaryAssertions(); - break; } assertion.setMinPath(assertion.getParent().getMinPath()); assertion.setMaxPath(assertion.getParent().getMaxPath()); @@ -383,6 +448,8 @@ protected void visit(PositionAssertion assertion) { @Override protected void visit(LookBehindAssertion assertion) { + clearORFlags(assertion); + assertion.setHasLookBehinds(); assertion.getParent().setHasLookBehinds(); assertion.setMinPath(assertion.getParent().getMinPath()); assertion.setMaxPath(assertion.getParent().getMaxPath()); @@ -416,10 +483,15 @@ protected void leave(LookBehindAssertion assertion) { } } leaveLookAroundAssertion(assertion); + if (isForward() && !assertion.isDead() && ast.getFlavor() == PythonFlavor.INSTANCE && !assertion.isFixedWidth()) { + throw RegexSyntaxException.createPattern(ast.getSource(), PyErrorMessages.LOOK_BEHIND_REQUIRES_FIXED_WIDTH_PATTERN, 
0, RegexSyntaxException.ErrorCode.InvalidLookbehind); + } } @Override protected void visit(LookAheadAssertion assertion) { + clearORFlags(assertion); + assertion.setHasLookAheads(); assertion.getParent().setHasLookAheads(); assertion.setMinPath(assertion.getParent().getMinPath()); assertion.setMaxPath(assertion.getParent().getMaxPath()); @@ -439,6 +511,7 @@ protected void leave(LookAheadAssertion assertion) { @Override protected void visit(AtomicGroup atomicGroup) { + clearORFlags(atomicGroup); atomicGroup.setMinPath(atomicGroup.getParent().getMinPath()); atomicGroup.setMaxPath(atomicGroup.getParent().getMaxPath()); } @@ -446,9 +519,10 @@ protected void visit(AtomicGroup atomicGroup) { @Override protected void leave(AtomicGroup atomicGroup) { if (isForward() && !atomicGroup.isDead()) { - ast.getProperties().setAtomicGroups(); + atomicGroup.setHasAtomicGroups(); + atomicGroup.getParent().setHasAtomicGroups(); } - leaveSubtreeRootNode(atomicGroup, CHANGED_FLAGS); + setFlagsSubtreeRootNode(atomicGroup, CHANGED_FLAGS); atomicGroup.getParent().setMinPath(atomicGroup.getMinPath()); atomicGroup.getParent().setMaxPath(atomicGroup.getMaxPath()); } @@ -456,15 +530,47 @@ protected void leave(AtomicGroup atomicGroup) { private void leaveLookAroundAssertion(LookAroundAssertion assertion) { if (assertion.hasCaptureGroups()) { ast.getProperties().setCaptureGroupsInLookAroundAssertions(); + if (!ast.getOptions().getFlavor().nestedCaptureGroupsKeptOnLoopReentry() && !assertion.isNegated()) { + RegexASTNode parent = assertion.getParent(); + boolean innerGroupMayBeSkipped = assertion.getGroup().size() > 1; + while (parent != null) { + if (parent.isGroup()) { + Group parentGroup = parent.asGroup(); + innerGroupMayBeSkipped |= parentGroup.size() > 1; + if (innerGroupMayBeSkipped && parentGroup.hasQuantifier() && parentGroup.getQuantifier().isMaxGreaterThan(1)) { + /* + * This is a corner case we currently don't support in DFA mode: In + * ECMAScript, nested capture groups are cleared 
on every loop + * iteration, but merged look-around assertions may "spill" from one + * loop iteration into the next, which would require keeping track of + * loop enter/exit bounds across NFA states. This is probably doable, + * but not worth the effort, since these kinds of expressions are very + * rare. + * + * Example: matching regex a(?:c|b(?=(c)))* against "abc". The inner + * capture group would be set on the first iteration and cleared again + * on the second, but right now we don't keep track of that and set the + * capture group bounds simultaneously with matching 'c'. + */ + ast.getProperties().setLookAroundWithCaptureGroupsNestedInQuantifier(); + } + } + parent = parent.getParent(); + } + } } + setFlagsLookAroundAssertion(assertion); + } + + static void setFlagsLookAroundAssertion(LookAroundAssertion assertion) { // flag propagation to parent sequences: // - LookAhead expressions propagate all flags // - LookBehind expressions omit "startsWithCaret" and "endsWithDollar" // - negated lookarounds additionally don't propagate the "dead" flag - leaveSubtreeRootNode(assertion, assertion.isNegated() ? OR_FLAGS : assertion.isLookBehindAssertion() ? OR_FLAGS | RegexASTNode.FLAG_DEAD : CHANGED_FLAGS); + setFlagsSubtreeRootNode(assertion, assertion.isNegated() ? OR_FLAGS : assertion.isLookBehindAssertion() ? 
OR_FLAGS | RegexASTNode.FLAG_DEAD : CHANGED_FLAGS); } - private static void leaveSubtreeRootNode(RegexASTSubtreeRootNode subtreeRootNode, int flagMask) { + static void setFlagsSubtreeRootNode(RegexASTSubtreeRootNode subtreeRootNode, int flagMask) { subtreeRootNode.getParent().setFlags(subtreeRootNode.getFlags(flagMask) | subtreeRootNode.getParent().getFlags(flagMask), flagMask); } @@ -487,7 +593,9 @@ protected void visit(CharacterClass characterClass) { if (characterClass.hasNotUnrolledQuantifier()) { characterClass.getParent().setHasQuantifiers(); setQuantifierIndex(characterClass); - characterClass.getParent().incMinPath(characterClass.getQuantifier().getMin()); + if (!characterClass.isOptionalQuantifier()) { + characterClass.getParent().incMinPath(characterClass.getQuantifier().getMin()); + } if (characterClass.getQuantifier().isInfiniteLoop()) { characterClass.setHasLoops(); characterClass.getParent().setHasLoops(); @@ -596,4 +704,12 @@ private void registerConditionGroupsInLookAheadAssertions() { } } } + + private void clearORFlags(RegexASTNode node) { + // unset flags set by previous invocations of CalcASTFlagsVisitor, to account for removed or + // dead nodes + if (isReverse()) { + node.clearFlags(OR_FLAGS); + } + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java index 980a8c9052b..83e3ff670fa 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Group.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -77,8 +77,10 @@ public class Group extends QuantifiableTerm implements RegexASTVisitorIterable { private short visitorIterationIndex = 0; private short groupNumber = -1; private short groupsWithGuardsIndex = -1; - private short enclosedCaptureGroupsLow; - private short enclosedCaptureGroupsHigh; + private int enclosedCaptureGroupsLo; + private int enclosedCaptureGroupsHi; + private int enclosedZeroWidthGroupsLo; + private int enclosedZeroWidthGroupsHi; /** * Creates an empty non-capturing group. @@ -98,13 +100,17 @@ public class Group extends QuantifiableTerm implements RegexASTVisitorIterable { protected Group(Group copy) { super(copy); groupNumber = copy.groupNumber; - enclosedCaptureGroupsLow = copy.enclosedCaptureGroupsLow; - enclosedCaptureGroupsHigh = copy.enclosedCaptureGroupsHigh; + enclosedCaptureGroupsLo = copy.enclosedCaptureGroupsLo; + enclosedCaptureGroupsHi = copy.enclosedCaptureGroupsHi; } @Override public Group copy(RegexAST ast) { - return ast.register(new Group(this)); + Group copy = new Group(this); + if (isCapturing()) { + ast.registerCaptureGroupCopy(copy); + } + return ast.register(copy); } @Override @@ -206,8 +212,6 @@ public boolean isCapturing() { /** * Marks this {@link Group} as capturing and sets its group number. - * - * @param groupNumber */ public void setGroupNumber(int groupNumber) { assert groupNumber <= TRegexOptions.TRegexMaxNumberOfCaptureGroups; @@ -237,51 +241,67 @@ public void clearGroupNumber() { /** * Gets the (inclusive) lower bound of the range of capture groups contained within this group. */ - public int getEnclosedCaptureGroupsLow() { - return enclosedCaptureGroupsLow; + public int getEnclosedCaptureGroupsLo() { + return enclosedCaptureGroupsLo; } /** * Gets the (inclusive) lower bound of the range of capture groups in this term. 
In contrast to - * {@link #getEnclosedCaptureGroupsLow()}, this range contains the group itself if it is a + * {@link #getEnclosedCaptureGroupsLo()}, this range contains the group itself if it is a * capturing group. */ - public int getCaptureGroupsLow() { - return isCapturing() ? getGroupNumber() : enclosedCaptureGroupsLow; + public int getCaptureGroupsLo() { + return isCapturing() ? getGroupNumber() : enclosedCaptureGroupsLo; } /** * Sets the (inclusive) lower bound of the range of capture groups contained within this group. */ - public void setEnclosedCaptureGroupsLow(int enclosedCaptureGroupsLow) { - assert enclosedCaptureGroupsLow <= TRegexOptions.TRegexMaxNumberOfCaptureGroups; - this.enclosedCaptureGroupsLow = (short) enclosedCaptureGroupsLow; + public void setEnclosedCaptureGroupsLo(int enclosedCaptureGroupsLo) { + assert enclosedCaptureGroupsLo <= TRegexOptions.TRegexMaxNumberOfCaptureGroups; + this.enclosedCaptureGroupsLo = (short) enclosedCaptureGroupsLo; } /** * Gets the (exclusive) upper bound of the range of capture groups contained within this group. */ - public int getEnclosedCaptureGroupsHigh() { - return enclosedCaptureGroupsHigh; + public int getEnclosedCaptureGroupsHi() { + return enclosedCaptureGroupsHi; } /** * Gets the (exclusive) upper bound of the range of capture groups in this term. */ - public int getCaptureGroupsHigh() { - return enclosedCaptureGroupsHigh; + public int getCaptureGroupsHi() { + return enclosedCaptureGroupsHi; } /** * Sets the (exclusive) upper bound of the range of capture groups contained within this group. 
*/ - public void setEnclosedCaptureGroupsHigh(int enclosedCaptureGroupsHigh) { - assert enclosedCaptureGroupsHigh <= TRegexOptions.TRegexMaxNumberOfCaptureGroups; - this.enclosedCaptureGroupsHigh = (short) enclosedCaptureGroupsHigh; + public void setEnclosedCaptureGroupsHi(int enclosedCaptureGroupsHi) { + assert enclosedCaptureGroupsHi <= TRegexOptions.TRegexMaxNumberOfCaptureGroups; + this.enclosedCaptureGroupsHi = (short) enclosedCaptureGroupsHi; } public boolean hasEnclosedCaptureGroups() { - return enclosedCaptureGroupsHigh > enclosedCaptureGroupsLow; + return enclosedCaptureGroupsHi > enclosedCaptureGroupsLo; + } + + public int getEnclosedZeroWidthGroupsLo() { + return enclosedZeroWidthGroupsLo; + } + + public void setEnclosedZeroWidthGroupsLo(int enclosedZeroWidthGroupsLo) { + this.enclosedZeroWidthGroupsLo = enclosedZeroWidthGroupsLo; + } + + public int getEnclosedZeroWidthGroupsHi() { + return enclosedZeroWidthGroupsHi; + } + + public void setEnclosedZeroWidthGroupsHi(int enclosedZeroWidthGroupsHi) { + this.enclosedZeroWidthGroupsHi = enclosedZeroWidthGroupsHi; } /** @@ -336,8 +356,6 @@ public boolean isEmpty() { /** * Adds a new alternative to this group. The new alternative will be appended to the * end, meaning it will have the lowest priority among all the alternatives. - * - * @param sequence */ public void add(Sequence sequence) { sequence.setParent(this); @@ -349,8 +367,6 @@ public void add(Sequence sequence) { * Inserts a new alternative to this group. The new alternative will be inserted at the * beginning, meaning it will have the highest priority among all the * alternatives. 
- * - * @param sequence */ public void insertFirst(Sequence sequence) { sequence.setParent(this); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/GroupBoundaries.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/GroupBoundaries.java index 33b22a9f20e..68d56c4b7f6 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/GroupBoundaries.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/GroupBoundaries.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -76,6 +76,7 @@ public class GroupBoundaries implements JsonConvertible { private final TBitSet updateIndices; private final TBitSet clearIndices; + private final int firstGroup; private final int lastGroup; private final int cachedHash; @CompilationFinal(dimensions = 1) private byte[] updateArrayByte; @@ -83,19 +84,20 @@ public class GroupBoundaries implements JsonConvertible { @CompilationFinal(dimensions = 1) private short[] updateArray; @CompilationFinal(dimensions = 1) private short[] clearArray; - GroupBoundaries(TBitSet updateIndices, TBitSet clearIndices, int lastGroup) { + GroupBoundaries(TBitSet updateIndices, TBitSet clearIndices, int firstGroup, int lastGroup) { this.updateIndices = updateIndices; this.clearIndices = clearIndices; + this.firstGroup = firstGroup; this.lastGroup = lastGroup; // both bit sets are immutable, and the hash is always needed immediately in // RegexAST#createGroupBoundaries() - this.cachedHash = (Objects.hashCode(updateIndices) * 31 + Objects.hashCode(clearIndices)) * 31 + lastGroup; + this.cachedHash = (Objects.hashCode(updateIndices) * 31 + 
Objects.hashCode(clearIndices)) * 31 + firstGroup * 31 + lastGroup; } public static GroupBoundaries[] createCachedGroupBoundaries() { GroupBoundaries[] instances = new GroupBoundaries[TBitSet.getNumberOfStaticInstances()]; for (int i = 0; i < instances.length; i++) { - instances[i] = new GroupBoundaries(TBitSet.getStaticInstance(i), TBitSet.getEmptyInstance(), -1); + instances[i] = new GroupBoundaries(TBitSet.getStaticInstance(i), TBitSet.getEmptyInstance(), -1, -1); } return instances; } @@ -212,6 +214,10 @@ public void updateBitSets(TBitSet foreignUpdateIndices, TBitSet foreignClearIndi foreignClearIndices.union(clearIndices); } + public int getFirstGroup() { + return firstGroup; + } + public int getLastGroup() { return lastGroup; } @@ -225,7 +231,7 @@ public boolean equals(Object obj) { return false; } GroupBoundaries o = (GroupBoundaries) obj; - return Objects.equals(updateIndices, o.updateIndices) && Objects.equals(clearIndices, o.clearIndices) && lastGroup == o.lastGroup; + return Objects.equals(updateIndices, o.updateIndices) && Objects.equals(clearIndices, o.clearIndices) && firstGroup == o.firstGroup && lastGroup == o.lastGroup; } @Override @@ -352,7 +358,7 @@ public JsonArray indexUpdateSourceSectionsToJson(RegexAST ast) { if (!hasIndexUpdates() || !ast.getOptions().isDumpAutomataWithSourceSections()) { return Json.array(); } - return RegexAST.sourceSectionsToJson(getUpdateIndices().stream().mapToObj(x -> ast.getSourceSections(ast.getGroupByBoundaryIndex(x)).get(x & 1))); + return RegexAST.sourceSectionsToJson(getUpdateIndices().stream().mapToObj(x -> ast.getSourceSections(ast.getGroupByBoundaryIndex(x).get(0)).get(x & 1))); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java index 1daabdfe55a..b6d1cc932da 100644 --- 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/QuantifiableTerm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -69,6 +69,10 @@ public boolean hasQuantifier() { return quantifier != null; } + public boolean hasMin0Quantifier() { + return hasQuantifier() && quantifier.getMin() == 0; + } + /** * Returns {@code true} iff this term has a quantifier that was not unrolled by the parser. */ diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java index caca3b2f4b7..2d29aadfea2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexAST.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -46,6 +46,7 @@ import java.util.StringJoiner; import java.util.stream.Stream; +import com.oracle.truffle.regex.tregex.parser.flavors.RegexFlavor; import org.graalvm.collections.EconomicMap; import org.graalvm.collections.Equivalence; @@ -98,7 +99,7 @@ public final class RegexAST implements StateIndex, JsonConvertible * Possibly wrapped root for NFA generation (see {@link #createPrefix()}). 
*/ private Group wrappedRoot; - private final List captureGroups = new ArrayList<>(); + private final ArrayList> captureGroups = new ArrayList<>(); private final List quantifiers = new ArrayList<>(); private final List zeroWidthQuantifiables = new ArrayList<>(); private final GlobalSubTreeIndex subtrees = new GlobalSubTreeIndex(); @@ -146,6 +147,10 @@ public RegexOptions getOptions() { return source.getOptions(); } + public RegexFlavor getFlavor() { + return source.getOptions().getFlavor(); + } + public Encoding getEncoding() { return source.getEncoding(); } @@ -198,6 +203,10 @@ public Token.Quantifier[] getQuantifierArray() { return quantifiers.toArray(Token.Quantifier[]::new); } + public Token.Quantifier getQuantifier(int quantifierIndex) { + return quantifiers.get(quantifierIndex); + } + public void registerZeroWidthQuantifiable(QuantifiableTerm zeroWidthQuantifiable) { zeroWidthQuantifiable.getQuantifier().setZeroWidthIndex(zeroWidthQuantifiables.size()); zeroWidthQuantifiables.add(zeroWidthQuantifiable); @@ -207,11 +216,15 @@ public List getZeroWidthQuantifiables() { return zeroWidthQuantifiables; } - public Group getGroup(int index) { - return captureGroups.get(index); + /** + * Get capture group with given capture group number. May return multiple nodes due to + * quantifier unrolling. 
+ */ + public ArrayList getGroup(int groupNumber) { + return captureGroups.get(groupNumber); } - public Group getGroupByBoundaryIndex(int index) { + public ArrayList getGroupByBoundaryIndex(int index) { return captureGroups.get(index / 2); } @@ -274,9 +287,8 @@ public GlobalSubTreeIndex getSubtrees() { } public void registerGroupWithGuards(Group group) { - if (group.getGroupsWithGuardsIndex() < 0) { - groupsWithGuards.add(group); - } + assert group.getGroupsWithGuardsIndex() < 0; + groupsWithGuards.add(group); } public GroupsWithGuardsIndex getGroupsWithGuards() { @@ -340,10 +352,21 @@ public Group createGroup() { public Group createCaptureGroup(int groupNumber) { Group group = register(new Group(groupNumber)); assert captureGroups.size() == groupNumber; - captureGroups.add(group); + ArrayList groupList = new ArrayList<>(); + groupList.add(group); + captureGroups.add(groupList); return group; } + public void registerCaptureGroupCopy(Group groupCopy) { + assert !captureGroups.get(groupCopy.getGroupNumber()).contains(groupCopy); + captureGroups.get(groupCopy.getGroupNumber()).add(groupCopy); + } + + public void clearRegisteredCaptureGroups(int groupNumber) { + captureGroups.get(groupNumber).clear(); + } + public Group createConditionalBackReferenceGroup(int referencedGroupNumber) { referencedGroups.set(referencedGroupNumber); conditionGroups.set(referencedGroupNumber); @@ -373,7 +396,7 @@ public AtomicGroup createAtomicGroup() { } public void createNFAHelperNodes(RegexASTSubtreeRootNode rootNode) { - nodeCount.inc(4); + nodeCount.inc(5); PositionAssertion anchored = new PositionAssertion(PositionAssertion.Type.CARET); rootNode.setAnchoredInitialState(anchored); MatchFound unAnchored = new MatchFound(); @@ -382,6 +405,8 @@ public void createNFAHelperNodes(RegexASTSubtreeRootNode rootNode) { rootNode.setMatchFound(end); PositionAssertion anchoredEnd = new PositionAssertion(PositionAssertion.Type.DOLLAR); rootNode.setAnchoredFinalState(anchoredEnd); + MatchFound 
endChecked = new MatchFound(); + rootNode.setMatchFoundChecked(endChecked); } public PositionAssertion createPositionAssertion(PositionAssertion.Type type) { @@ -558,18 +583,18 @@ public void unhidePrefix() { } } - public GroupBoundaries createGroupBoundaries(TBitSet updateIndices, TBitSet clearIndices, int lastGroup) { + public GroupBoundaries createGroupBoundaries(TBitSet updateIndices, TBitSet clearIndices, int firstGroup, int lastGroup) { if (!getOptions().getFlavor().usesLastGroupResultField()) { GroupBoundaries staticInstance = GroupBoundaries.getStaticInstance(language, updateIndices, clearIndices); if (staticInstance != null) { return staticInstance; } } - GroupBoundaries lookup = new GroupBoundaries(updateIndices, clearIndices, lastGroup); + GroupBoundaries lookup = new GroupBoundaries(updateIndices, clearIndices, firstGroup, lastGroup); if (groupBoundariesDeduplicationMap.containsKey(lookup)) { return groupBoundariesDeduplicationMap.get(lookup); } else { - GroupBoundaries gb = new GroupBoundaries(updateIndices.copy(), clearIndices.copy(), lastGroup); + GroupBoundaries gb = new GroupBoundaries(updateIndices.copy(), clearIndices.copy(), firstGroup, lastGroup); groupBoundariesDeduplicationMap.put(gb, gb); return gb; } @@ -679,8 +704,9 @@ public boolean canTransformToDFA() { getProperties().hasNonLiteralLookBehindAssertions() || getProperties().hasNegativeLookBehindAssertions() || getRoot().hasQuantifiers() || - getProperties().hasAtomicGroups() || - getProperties().hasConditionalReferencesIntoLookAheads()) && + getRoot().hasAtomicGroups() || + getProperties().hasConditionalReferencesIntoLookAheads() || + getProperties().hasLookAroundWithCaptureGroupsNestedInQuantifier()) && couldCalculateLastGroup; } @@ -715,12 +741,15 @@ public String canTransformToDFAFailureReason() { if (getRoot().hasQuantifiers()) { sb.add("could not unroll all quantifiers"); } - if (getProperties().hasAtomicGroups()) { + if (getRoot().hasAtomicGroups()) { sb.add("regex has atomic 
groups"); } if (getProperties().hasConditionalReferencesIntoLookAheads()) { sb.add("regex has conditional back-references into look-ahead assertions"); } + if (getProperties().hasLookAroundWithCaptureGroupsNestedInQuantifier()) { + sb.add("regex has look-around assertion with capture groups nested in a quantified group"); + } return sb.toString(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java index 3e3347fb8f6..825407a6728 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -62,20 +62,23 @@ public abstract class RegexASTNode implements JsonConvertible { static final int FLAG_BACK_REFERENCE_IS_NESTED_OR_FORWARD = 1 << 8; static final int FLAG_BACK_REFERENCE_IS_IGNORE_CASE = 1 << 9; static final int FLAG_BACK_REFERENCE_IS_IGNORE_CASE_ALTERNATIVE_MODE = 1 << 10; - static final int FLAG_GROUP_LOOP = 1 << 11; - static final int FLAG_GROUP_EXPANDED_QUANTIFIER = 1 << 12; - static final int FLAG_GROUP_MANDATORY_UNROLLED_QUANTIFIER = 1 << 13; - static final int FLAG_GROUP_QUANTIFIER_PASS_THROUGH_SEQUENCE = 1 << 14; - static final int FLAG_GROUP_LOCAL_FLAGS = 1 << 15; - static final int FLAG_EMPTY_GUARD = 1 << 16; - static final int FLAG_LOOK_AROUND_NEGATED = 1 << 17; - static final int FLAG_HAS_LOOPS = 1 << 18; - static final int FLAG_HAS_CAPTURE_GROUPS = 1 << 19; - static final int FLAG_HAS_QUANTIFIERS = 1 << 20; - static final int FLAG_HAS_LOOK_BEHINDS = 1 << 21; - static final int FLAG_HAS_LOOK_AHEADS = 1 << 22; - static final int FLAG_HAS_BACK_REFERENCES = 1 << 23; - static final int FLAG_CHARACTER_CLASS_WAS_SINGLE_CHAR = 1 << 24; + static final int FLAG_MAY_MATCH_EMPTY_STRING = 1 << 11; + static final int FLAG_GROUP_LOOP = 1 << 12; + static final int FLAG_GROUP_EXPANDED_QUANTIFIER = 1 << 13; + static final int FLAG_GROUP_MANDATORY_QUANTIFIER = 1 << 14; + static final int FLAG_GROUP_OPTIONAL_QUANTIFIER = 1 << 15; + static final int FLAG_GROUP_QUANTIFIER_PASS_THROUGH_SEQUENCE = 1 << 16; + static final int FLAG_GROUP_LOCAL_FLAGS = 1 << 17; + static final int FLAG_EMPTY_GUARD = 1 << 18; + static final int FLAG_LOOK_AROUND_NEGATED = 1 << 19; + static final int FLAG_HAS_ATOMIC_GROUPS = 1 << 20; + static final int FLAG_HAS_LOOPS = 1 << 21; + static final int FLAG_HAS_CAPTURE_GROUPS = 1 << 22; + static final int FLAG_HAS_QUANTIFIERS = 1 << 23; + static final int FLAG_HAS_LOOK_BEHINDS = 1 << 24; + static final int FLAG_HAS_LOOK_AHEADS = 1 << 25; + static final int FLAG_HAS_BACK_REFERENCES 
= 1 << 26; + static final int FLAG_CHARACTER_CLASS_WAS_SINGLE_CHAR = 1 << 27; private int id = -1; private RegexASTNode parent; @@ -150,6 +153,10 @@ protected boolean isFlagSet(int flag) { return (flags & flag) != 0; } + protected boolean areAllFlagsSet(int multipleFlags) { + return (flags & multipleFlags) == multipleFlags; + } + protected void setFlag(int flag) { setFlag(flag, true); } @@ -166,6 +173,13 @@ protected void setFlags(int newFlags, int mask) { flags = flags & ~mask | newFlags; } + /** + * Clear all flags denoted by {@code mask}. + */ + protected void clearFlags(int mask) { + flags = flags & ~mask; + } + protected void setFlag(int flag, boolean value) { if (value) { flags |= flag; @@ -224,6 +238,14 @@ public void setEmptyGuard(boolean emptyGuard) { setFlag(FLAG_EMPTY_GUARD, emptyGuard); } + public boolean mayMatchEmptyString() { + return isFlagSet(FLAG_MAY_MATCH_EMPTY_STRING); + } + + public void setMayMatchEmptyString(boolean value) { + setFlag(FLAG_MAY_MATCH_EMPTY_STRING, value); + } + /** * Subexpression contains {@link #isCaret() "^"}. */ @@ -284,6 +306,21 @@ public void setEndsWithDollar(boolean endsWithDollar) { setFlag(FLAG_ENDS_WITH_DOLLAR, endsWithDollar); } + /** + * Subexpression contains {@link AtomicGroup atomic groups}. + */ + public boolean hasAtomicGroups() { + return isFlagSet(FLAG_HAS_ATOMIC_GROUPS); + } + + public void setHasAtomicGroups() { + setHasAtomicGroups(true); + } + + public void setHasAtomicGroups(boolean hasAtomicGroups) { + setFlag(FLAG_HAS_ATOMIC_GROUPS, hasAtomicGroups); + } + /** * Subexpression contains {@link Group#isLoop() loops}. */ @@ -344,6 +381,10 @@ public void setHasLookBehinds() { setFlag(FLAG_HAS_LOOK_BEHINDS, true); } + public boolean hasLookArounds() { + return isFlagSet(FLAG_HAS_LOOK_AHEADS | FLAG_HAS_LOOK_BEHINDS); + } + /** * Subexpression contains {@link #isBackReference() back-references}. 
*/ @@ -390,23 +431,48 @@ public void setExpandedQuantifier(boolean expandedQuantifier) { /** * Indicates whether this {@link RegexASTNode} represents a mandatory copy of a quantified term - * after unrolling. + * after unrolling or splitting. * * E.g., in the expansion of A{2,4}, which is AA(A(A|)|), the first two occurrences of A are * marked with this flag. */ - public boolean isMandatoryUnrolledQuantifier() { - return isFlagSet(FLAG_GROUP_MANDATORY_UNROLLED_QUANTIFIER); + public boolean isMandatoryQuantifier() { + return isFlagSet(FLAG_GROUP_MANDATORY_QUANTIFIER); + } + + /** + * Marks this {@link RegexASTNode} as being inserted into the AST as the mandatory part of + * unrolling or splitting a quantified term. + * + * @see #isMandatoryQuantifier() + */ + public void setMandatoryQuantifier(boolean mandatoryQuantifier) { + setFlag(FLAG_GROUP_MANDATORY_QUANTIFIER, mandatoryQuantifier); + } + + /** + * Indicates whether this {@link RegexASTNode} represents an optional copy of a quantified term + * after unrolling or splitting. + * + * E.g., in the expansion of A{2,4}, which is AA(A(A|)|), the groups (A(A|)|) are marked with + * this flag. + */ + public boolean isOptionalQuantifier() { + return isFlagSet(FLAG_GROUP_OPTIONAL_QUANTIFIER); } /** - * Marks this {@link RegexASTNode} as being inserted into the AST as part of unrolling the - * mandatory part of a quantified term. + * Marks this {@link RegexASTNode} as being inserted into the AST as the optional part of + * unrolling or splitting a quantified term. 
* - * @see #isMandatoryUnrolledQuantifier() + * @see #isOptionalQuantifier() */ - public void setMandatoryUnrolledQuantifier(boolean mandatoryUnrolledQuantifier) { - setFlag(FLAG_GROUP_MANDATORY_UNROLLED_QUANTIFIER, mandatoryUnrolledQuantifier); + public void setOptionalQuantifier(boolean optionalQuantifier) { + setFlag(FLAG_GROUP_OPTIONAL_QUANTIFIER, optionalQuantifier); + } + + public boolean isMandatoryUnrolledQuantifier() { + return areAllFlagsSet(FLAG_GROUP_MANDATORY_QUANTIFIER | FLAG_GROUP_EXPANDED_QUANTIFIER); } /** @@ -500,6 +566,10 @@ public boolean isInLookAheadAssertion() { return getSubTreeParent() instanceof LookAheadAssertion; } + public boolean isInLookAroundAssertion() { + return getSubTreeParent() instanceof LookAroundAssertion; + } + public String toStringWithID() { return String.format("%d (%s)", id, toString()); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java index cd9b96d754f..961ff298c38 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/RegexASTSubtreeRootNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -61,6 +61,7 @@ public abstract class RegexASTSubtreeRootNode extends Term implements RegexASTVi private MatchFound unAnchoredInitialState; private PositionAssertion anchoredFinalState; private MatchFound matchFound; + private MatchFound matchFoundChecked; private boolean visitorGroupVisited = false; private final SubTreeIndex subtrees = new SubTreeIndex(); @@ -84,6 +85,15 @@ public abstract class RegexASTSubtreeRootNode extends Term implements RegexASTVi setGroup(copy.group.copyRecursive(ast, compilationBuffer)); } + @Override + public void markAsDead() { + super.markAsDead(); + anchoredInitialState.markAsDead(); + unAnchoredInitialState.markAsDead(); + anchoredFinalState.markAsDead(); + matchFound.markAsDead(); + } + public boolean globalSubTreeIdInitialized() { return globalSubTreeId >= 0; } @@ -188,6 +198,10 @@ public void setAnchoredFinalState(PositionAssertion anchoredFinalState) { anchoredFinalState.setNext(group); } + public boolean isFixedWidth() { + return getGroup().getMinPath() == getGroup().getMaxPath(); + } + @Override public boolean visitorHasNext() { return !visitorGroupVisited; @@ -217,4 +231,13 @@ public String toString() { protected JsonObject toJson(String typeName) { return super.toJson(typeName).append(Json.prop("group", astNodeId(group))); } + + public void setMatchFoundChecked(MatchFound matchFoundChecked) { + this.matchFoundChecked = matchFoundChecked; + } + + public MatchFound getMatchFoundChecked() { + assert matchFoundChecked != null; + return matchFoundChecked; + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java index 792a1971a61..ed60a2fd2f2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/Sequence.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -200,13 +200,23 @@ public boolean isSingleCharClass() { return size() == 1 && isLiteral(); } + public QuantifiableTerm quantifierPassThroughGetQuantifiedTerm() { + assert isQuantifierPassThroughSequence(); + assert getParent().isGroup(); + assert getParent().size() == 2; + Sequence otherSeq = isFirstInGroup() ? getParent().getLastAlternative() : getParent().getFirstAlternative(); + Term quantifiedTerm = isInLookBehindAssertion() ? otherSeq.getLastTerm() : otherSeq.getFirstTerm(); + assert otherSeq.size() <= 2 || quantifiedTerm.isExpandedQuantifier(); + return quantifiedTerm.asQuantifiableTerm(); + } + public int getEnclosedCaptureGroupsLow() { int lo = Integer.MAX_VALUE; for (Term t : terms) { if (t instanceof Group) { Group g = (Group) t; - if (g.getEnclosedCaptureGroupsLow() != g.getEnclosedCaptureGroupsHigh()) { - lo = Math.min(lo, g.getEnclosedCaptureGroupsLow()); + if (g.getEnclosedCaptureGroupsLo() != g.getEnclosedCaptureGroupsHi()) { + lo = Math.min(lo, g.getEnclosedCaptureGroupsLo()); } if (g.isCapturing()) { lo = Math.min(lo, g.getGroupNumber()); @@ -221,8 +231,8 @@ public int getEnclosedCaptureGroupsHigh() { for (Term t : terms) { if (t instanceof Group) { Group g = (Group) t; - if (g.getEnclosedCaptureGroupsLow() != g.getEnclosedCaptureGroupsHigh()) { - hi = Math.max(hi, g.getEnclosedCaptureGroupsHigh()); + if (g.getEnclosedCaptureGroupsLo() != g.getEnclosedCaptureGroupsHi()) { + hi = Math.max(hi, g.getEnclosedCaptureGroupsHi()); } if (g.isCapturing()) { hi = Math.max(hi, g.getGroupNumber() + 1); diff --git 
a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/InitIDVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/InitIDVisitor.java index 561207db7ae..ac17287783a 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/InitIDVisitor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/InitIDVisitor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -121,6 +121,7 @@ protected void leave(Group group) { if (group.getParent() instanceof RegexASTSubtreeRootNode) { initID(group.getSubTreeParent().getAnchoredFinalState()); initID(group.getSubTreeParent().getMatchFound()); + initID(group.getSubTreeParent().getMatchFoundChecked()); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsAliveVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsAliveVisitor.java new file mode 100644 index 00000000000..850750be66f --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsAliveVisitor.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.truffle.regex.tregex.parser.ast.visitors; + +import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; + +public class MarkAsAliveVisitor extends DepthFirstTraversalRegexASTVisitor { + + public static void markAsAlive(RegexASTNode runRoot) { + new MarkAsAliveVisitor().run(runRoot); + } + + @Override + protected void doVisit(RegexASTNode cur) { + cur.setDead(false); + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsDeadVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsDeadVisitor.java new file mode 100644 index 00000000000..3129c67f1d3 --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkAsDeadVisitor.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.truffle.regex.tregex.parser.ast.visitors; + +import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; + +public class MarkAsDeadVisitor extends DepthFirstTraversalRegexASTVisitor { + + public static void markAsDead(RegexASTNode runRoot) { + new MarkAsDeadVisitor().run(runRoot); + } + + @Override + protected void doVisit(RegexASTNode cur) { + cur.markAsDead(); + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkLookBehindEntriesVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkLookBehindEntriesVisitor.java index 76aaa8cce9a..1f9b2a4cfe1 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkLookBehindEntriesVisitor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/MarkLookBehindEntriesVisitor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -86,7 +86,7 @@ public MarkLookBehindEntriesVisitor(RegexAST ast) { public void run() { for (RegexASTSubtreeRootNode subtreeRootNode : ast.getSubtrees()) { - if (!subtreeRootNode.isLookBehindAssertion()) { + if (!subtreeRootNode.isLookBehindAssertion() || subtreeRootNode.isDead()) { continue; } LookBehindAssertion lb = subtreeRootNode.asLookBehindAssertion(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java index 48b26bddbe8..fad660277b9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/NFATraversalRegexASTVisitor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -40,13 +40,14 @@ */ package com.oracle.truffle.regex.tregex.parser.ast.visitors; +import static com.oracle.truffle.regex.tregex.util.MathUtil.saturatingInc; + import java.util.Arrays; import java.util.Set; import org.graalvm.collections.EconomicSet; import com.oracle.truffle.api.CompilerDirectives; -import com.oracle.truffle.regex.tregex.automaton.StateSet; import com.oracle.truffle.regex.tregex.buffer.LongArrayBuffer; import com.oracle.truffle.regex.tregex.nfa.ASTStepVisitor; import com.oracle.truffle.regex.tregex.nfa.TransitionGuard; @@ -54,7 +55,6 @@ import com.oracle.truffle.regex.tregex.parser.ast.CharacterClass; import com.oracle.truffle.regex.tregex.parser.ast.Group; import com.oracle.truffle.regex.tregex.parser.ast.GroupBoundaries; -import com.oracle.truffle.regex.tregex.parser.ast.GroupsWithGuardsIndex; import com.oracle.truffle.regex.tregex.parser.ast.LookAheadAssertion; import com.oracle.truffle.regex.tregex.parser.ast.LookAroundAssertion; import com.oracle.truffle.regex.tregex.parser.ast.LookBehindAssertion; @@ -64,6 +64,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; import com.oracle.truffle.regex.tregex.parser.ast.Sequence; import com.oracle.truffle.regex.tregex.parser.ast.Term; +import com.oracle.truffle.regex.tregex.parser.flavors.RegexFlavor; import com.oracle.truffle.regex.util.TBitSet; /** @@ -118,21 +119,6 @@ public abstract class NFATraversalRegexASTVisitor { * alternation we should visit when back-tracking to find the next successor. */ private final LongArrayBuffer curPath = new LongArrayBuffer(8); - /** - * insideLoops is the set of looping groups that we are currently inside of. We need to maintain - * this in order to detect infinite loops in the NFA traversal. If we enter a looping group, - * traverse it without encountering a CharacterClass node or a MatchFound node and arrive back - * at the same group, then we are bound to loop like this forever. 
Using insideLoops, we can - * detect this situation and proceed with the search using another alternative. For example, in - * the RegexAST {@code ((|[a])*|)*}, which corresponds to the regex {@code /(a*?)* /}, we can - * traverse the inner loop, {@code (|[a])*}, without hitting any CharacterClass node by choosing - * the first alternative and we will then arrive back at the outer loop. There, we detect an - * infinite loop, which causes us to backtrack and choose the second alternative in the inner - * loop, leading us to the CharacterClass node {@code [a]}.
- * NB: For every looping group, this set tells us whether there is an {@code enter} node for it - * on the current path. - */ - private final StateSet insideLoops; /** * This set is needed to make sure that a quantified term cannot match the empty string, as is * specified in step 2a of RepeatMatcher from ECMAScript draft 2018, chapter 21.2.2.5.1. @@ -181,36 +167,54 @@ public abstract class NFATraversalRegexASTVisitor { private int caretsOnPath = 0; private int matchBeginAssertionsOnPath = 0; private int matchEndAssertionsOnPath = 0; - private final int[] lookAroundVisitiedCount; + private final int[] lookAroundVisitedCount; private final TBitSet captureGroupUpdates; private final TBitSet captureGroupClears; private final TBitSet referencedGroupBoundaries; + private int firstGroup = -1; private int lastGroup = -1; - private final TBitSet boundedQuantifiersLoop; - private final TBitSet boundedQuantifiersExited; - /** - * Quantifier guards are stored in an immutable linked list, which allows for cheap sharing of - * snapshots for the purposes of deduplication. + * Per-quantifier position of last quantified group exit or escape in the transition guards + * array. + */ + private final int[] bqLastCounterReset; + /** + * Per-quantifier position of last quantified group zero-width-enter guard in the transition + * guards array. + */ + private final int[] bqLastZeroWidthEnter; + /** + * Tracks whether a given quantifier has been exited "normally" on the current path. */ + private final TBitSet bqExited; + /** + * Tracks whether a given quantifier has been bypassed using either a group passthrough or a + * group escape on the current path. 
+ */ + private final TBitSet bqBypassed; + private final TBitSet referencedCaptureGroupsTmp; + private final LongArrayBuffer transitionGuards = new LongArrayBuffer(8); + private final LongArrayBuffer transitionGuardsCanonicalized = new LongArrayBuffer(8); private long[] transitionGuardsResult = null; protected NFATraversalRegexASTVisitor(RegexAST ast) { this.ast = ast; - this.insideLoops = StateSet.create(ast.getGroupsWithGuards()); this.insideEmptyGuardGroup = new TBitSet(ast.getGroupsWithGuards().size()); this.lookAroundsOnPath = new TBitSet(ast.getSubtrees().size()); - this.lookAroundVisitiedCount = new int[ast.getSubtrees().size()]; + this.lookAroundVisitedCount = new int[ast.getSubtrees().size()]; this.captureGroupUpdates = new TBitSet(ast.getNumberOfCaptureGroups() * 2); this.captureGroupClears = new TBitSet(ast.getNumberOfCaptureGroups() * 2); this.referencedGroupBoundaries = new TBitSet(ast.getNumberOfCaptureGroups() * 2); - this.boundedQuantifiersLoop = new TBitSet(ast.getQuantifierCount()); - this.boundedQuantifiersExited = new TBitSet(ast.getQuantifierCount()); + this.bqLastCounterReset = new int[ast.getQuantifierCount()]; + this.bqLastZeroWidthEnter = new int[ast.getGroupsWithGuards().size()]; + this.bqExited = new TBitSet(ast.getGroupsWithGuards().size()); + this.bqBypassed = new TBitSet(ast.getGroupsWithGuards().size()); for (int i : ast.getReferencedGroups()) { referencedGroupBoundaries.set(Group.groupNumberToBoundaryIndexStart(i)); referencedGroupBoundaries.set(Group.groupNumberToBoundaryIndexEnd(i)); } + this.referencedCaptureGroupsTmp = new TBitSet(ast.getNumberOfCaptureGroups()); } public Set getTraversableLookBehindAssertions() { @@ -267,6 +271,14 @@ public void setReverse(boolean reverse) { this.forward = !reverse; } + private void setShouldRetreat() { + shouldRetreat = true; + } + + protected RegexFlavor getFlavor() { + return ast.getOptions().getFlavor(); + } + protected abstract boolean isBuildingDFA(); protected abstract boolean 
canPruneAfterUnconditionalFinalState(); @@ -278,7 +290,6 @@ private boolean canTraverseLookArounds() { protected void run(Term runRoot) { clearCaptureGroupData(); recalcTransitionGuards = false; - assert insideLoops.isEmpty(); assert insideEmptyGuardGroup.isEmpty(); assert curPath.isEmpty(); assert dollarsOnPath == 0; @@ -286,11 +297,12 @@ protected void run(Term runRoot) { assert matchBeginAssertionsOnPath == 0; assert matchEndAssertionsOnPath == 0; assert lookAroundsOnPath.isEmpty(); - assert nodeVisitsEmpty() : Arrays.toString(lookAroundVisitiedCount); + assert isEmpty(lookAroundVisitedCount) : Arrays.toString(lookAroundVisitedCount); assert !shouldRetreat; assert transitionGuards.isEmpty(); assert captureGroupUpdates.isEmpty(); assert captureGroupClears.isEmpty(); + assert firstGroup == -1; assert lastGroup == -1; root = runRoot; pathDeduplicationSet.clear(); @@ -316,14 +328,13 @@ protected void run(Term runRoot) { if (done) { break; } - RegexASTNode target = pathGetNode(curPath.peek()); - visit(target); - if (canPruneAfterUnconditionalFinalState() && target.isMatchFound() && !dollarsOnPath() && !caretsOnPath() && lookAroundsOnPath.isEmpty() && !hasTransitionGuards()) { + assert cur == pathGetNode(curPath.peek()); + visit(cur); + if (canPruneAfterUnconditionalFinalState() && cur.isMatchFound() && !dollarsOnPath() && !caretsOnPath() && lookAroundsOnPath.isEmpty() && !hasTransitionGuards()) { /* * Transitions after an unconditional final state transition will never be taken, so * it is safe to prune them. */ - insideLoops.clear(); insideEmptyGuardGroup.clear(); curPath.clear(); clearCaptureGroupData(); @@ -343,13 +354,11 @@ protected void run(Term runRoot) { // If we have back-tracked into an empty-match transition, then we must continue by // advancing past the empty-match group using advanceTerm instead of entering the group // again using doAdvance. 
- if (cur.isGroup() && cur.hasEmptyGuard()) { + if (cur.isGroup() && cur.hasEmptyGuard() && !done) { foundNextTarget = advanceTerm(cur.asGroup()); } } - if (useTransitionGuards()) { - clearTransitionGuards(); - } + clearTransitionGuards(); done = false; } @@ -390,13 +399,12 @@ protected long[] getTransitionGuardsOnPath() { protected void calcTransitionGuardsResult() { if (transitionGuardsResult == null) { - assert useTransitionGuards() || getTransitionGuards().isEmpty(); transitionGuardsResult = getTransitionGuards().isEmpty() ? TransitionGuard.NO_GUARDS : getTransitionGuards().toArray(); } } protected GroupBoundaries getGroupBoundaries() { - return ast.createGroupBoundaries(getCaptureGroupUpdates(), getCaptureGroupClears(), getLastGroup()); + return ast.createGroupBoundaries(getCaptureGroupUpdates(), getCaptureGroupClears(), getFirstGroup(), getLastGroup()); } /** @@ -406,10 +414,7 @@ protected GroupBoundaries getGroupBoundaries() { * @return {@code true} if a successor was reached in this step */ private boolean doAdvance() { - // We only use the insideLoops optimization when the regex flavor does not allow empty loop - // iterations. Empty loop iterations can occur when a regex flavor monitor capture groups - // in its empty check, or when it doesn't use backtracking when exiting a loop. - if (cur.isDead() || (!ast.getOptions().getFlavor().canHaveEmptyLoopIterations() && cur.isGroupWithGuards() && insideLoops.contains(cur.asGroup()))) { + if (cur.isDead()) { return retreat(); } if (cur.isSequence()) { @@ -419,19 +424,13 @@ private boolean doAdvance() { if (sequence.isQuantifierPassThroughSequence()) { // this empty sequence was inserted during quantifier expansion, so it is // allowed to pass through the parent quantified group. 
- assert pathGetNode(curPath.peek()) == parent && pathIsGroupEnter(curPath.peek()); + assert pathGetNode(curPath.peek()) == parent && PathElement.isGroupEnter(curPath.peek()); switchEnterToPassThrough(parent); - if (shouldRetreat) { - return retreat(); - } - if (parent.isLoop()) { - unregisterInsideLoop(parent); - } } else { pushGroupExit(parent); - if (shouldRetreat) { - return retreat(); - } + } + if (shouldRetreat) { + return retreat(); } return advanceTerm(parent); } else { @@ -447,9 +446,6 @@ private boolean doAdvance() { if (group.hasEmptyGuard()) { insideEmptyGuardGroup.set(group.getGroupsWithGuardsIndex()); } - if (group.isLoop()) { - registerInsideLoop(group); - } // This path will only be hit when visiting a group for the first time. All groups // must have at least one child sequence, so no check is needed here. // createGroupEnterPathElement initializes the group alternation index with 1, so we @@ -457,77 +453,87 @@ private boolean doAdvance() { cur = group.getFirstAlternative(); return deduplicatePath(true); } else { - curPath.add(createPathElement(cur)); + curPath.add(PathElement.create(cur)); if (cur.isPositionAssertion()) { - final PositionAssertion assertion = (PositionAssertion) cur; - switch (assertion.type) { - case CARET: - caretsOnPath++; - if (canTraverseCaret) { - return advanceTerm(assertion); - } else { - return retreat(); - } - case DOLLAR: - dollarsOnPath++; - return advanceTerm(assertion); - case MATCH_BEGIN: - if (!ignoreMatchBoundaryAssertions) { - matchBeginAssertionsOnPath++; - if (forward && isBuildingDFA() && !isRootEnterOnPath()) { - return retreat(); - } - } - return advanceTerm(assertion); - case MATCH_END: - if (!ignoreMatchBoundaryAssertions) { - matchEndAssertionsOnPath++; - if (!forward && isBuildingDFA() && !isRootEnterOnPath()) { - return retreat(); - } - } - return advanceTerm(assertion); - default: - throw CompilerDirectives.shouldNotReachHere(); - } + return advancePositionAssertion(cur.asPositionAssertion()); } else 
if (cur.isLookAroundAssertion()) { - LookAroundAssertion lookAround = cur.asLookAroundAssertion(); - if (canTraverseLookArounds()) { - if (lookAround.isLookAheadAssertion()) { - enterLookAhead(lookAround.asLookAheadAssertion()); - addLookAroundToVisitedSet(); - return advanceTerm(lookAround); - } else { - assert lookAround.isLookBehindAssertion(); - addLookAroundToVisitedSet(); - if (traversableLookBehindAssertions == null || traversableLookBehindAssertions.contains(lookAround.asLookBehindAssertion())) { - return advanceTerm(lookAround); - } else { - return retreat(); - } + return advanceLookAround(cur.asLookAroundAssertion()); + } else { + assert cur.isCharacterClass() || cur.isBackReference() || cur.isMatchFound() || cur.isAtomicGroup(); + if ((forward && dollarsOnPath() || !forward && caretsOnPath()) && !canMatchEmptyString(cur)) { + return retreat(); + } + if (!ignoreMatchBoundaryAssertions) { + if ((forward && matchBeginAssertionsOnPath() || !forward && matchEndAssertionsOnPath()) && !isRootEnterOnPath() && !canMatchEmptyString(cur)) { + return retreat(); + } + if (matchEndAssertionsOnPath() && (cur.isCharacterClass() || cur.isBackReference())) { + return retreat(); } } return true; + } + } + } + + private boolean advanceLookAround(LookAroundAssertion lookAround) { + if (canTraverseLookArounds()) { + if (lookAround.isLookAheadAssertion()) { + enterLookAhead(lookAround.asLookAheadAssertion()); + addLookAroundToVisitedSet(); + return advanceTerm(lookAround); } else { - assert cur.isCharacterClass() || cur.isBackReference() || cur.isMatchFound() || cur.isAtomicGroup(); - if ((forward && dollarsOnPath() || !forward && caretsOnPath()) && cur.isCharacterClass()) { - // don't visit CharacterClass nodes if we traversed PositionAssertions already + assert lookAround.isLookBehindAssertion(); + addLookAroundToVisitedSet(); + if (traversableLookBehindAssertions == null || traversableLookBehindAssertions.contains(lookAround.asLookBehindAssertion())) { + return 
advanceTerm(lookAround); + } else { return retreat(); } - if (!ignoreMatchBoundaryAssertions && matchEndAssertionsOnPath() && (cur.isCharacterClass() || cur.isBackReference())) { + } + } + return true; + } + + private boolean advancePositionAssertion(PositionAssertion assertion) { + switch (assertion.type) { + case CARET: + caretsOnPath++; + if (canTraverseCaret) { + return advanceTerm(assertion); + } else { return retreat(); } - return true; - } + case DOLLAR: + dollarsOnPath++; + return advanceTerm(assertion); + case MATCH_BEGIN: + if (!ignoreMatchBoundaryAssertions) { + matchBeginAssertionsOnPath++; + if (forward && isBuildingDFA() && !isRootEnterOnPath()) { + return retreat(); + } + } + return advanceTerm(assertion); + case MATCH_END: + if (!ignoreMatchBoundaryAssertions) { + matchEndAssertionsOnPath++; + if (!forward && isBuildingDFA() && !isRootEnterOnPath()) { + return retreat(); + } + } + return advanceTerm(assertion); + default: + throw CompilerDirectives.shouldNotReachHere(); } } /** * Advances past the given {@link Term} and updates {@link #cur the current node}. * - * @return {@code true} if a successor was reached in this step (possible if - * {@link #advanceEmptyGuard} returns {@code true} and we have the quantified group as - * the successor) + * @return {@code true} if a successor was reached in this step (possible if we want to generate + * a transition to the special EMPTY_STATE, which by itself doesn't match anything, but + * acts as a helper for simulating the backtracking behavior of the ECMAScript flavor) */ private boolean advanceTerm(Term term) { if (ast.isNFAInitialState(term) || (term.getParent().isSubtreeRoot() && (term.isPositionAssertion() || term.isMatchFound()))) { @@ -541,19 +547,46 @@ private boolean advanceTerm(Term term) { } Term curTerm = term; while (!curTerm.getParent().isSubtreeRoot()) { - // We are leaving curTerm, which is a quantified group that we have already entered - // during this step. 
- // Unless we are building a DFA in a flavor which can have empty loop iterations, we - // call into advanceEmptyGuard. This is crucial to preserve the termination of the AST - // traversal/NFA generation. In the case of building a DFA in a flavor which can have - // empty loop iterations: - // a) we cannot use advanceEmptyGuard because it might introduce empty transitions, - // which are forbidden in the DFA, - // and b) termination is ensured by resolving exitZeroWidth/escapeZeroWidth guards - // statically. + /* + * We are leaving curTerm, which is a quantified group that we have already entered + * during this step. + * + * We avoid infinite loops on these groups by statically resolving TransitionGuards and + * de-duplicating equivalent transitions, but we have to apply special treatment for + * ECMAScript and Python's behavior on empty loop iterations here. + * + * ECMAScript and Python don't stop quantifier loops on empty matches as long as their + * minimum count has not been reached. Unfortunately, we have to simulate this behavior + * in cases where it is observable via capture groups, back-references or position + * assertions. 
+ */ if (curTerm.isGroupWithGuards() && insideEmptyGuardGroup.get(curTerm.asGroup().getGroupsWithGuardsIndex()) && - !(ast.getOptions().getFlavor().canHaveEmptyLoopIterations() && isBuildingDFA())) { - return advanceEmptyGuard(curTerm); + !getFlavor().emptyChecksMonitorCaptureGroups()) { + Group curGroup = curTerm.asGroup(); + Quantifier quantifier = curGroup.getQuantifier(); + // If we are: + // - in ECMAScript or Python flavor + // - in the mandatory split part of a quantifier + // - that has not been unrolled + // - and capture groups are visible to the caller, or the expression contains + // back-references, or we crossed a caret + if (!getFlavor().emptyChecksOnMandatoryLoopIterations() && + curGroup.isMandatoryQuantifier() && + !curGroup.isExpandedQuantifier() && + (!ast.getOptions().isBooleanMatch() || ast.getProperties().hasBackReferences() || caretsOnPath())) { + // the existence of a mandatory copy of the quantifier loop implies a minimum + // greater than zero + assert !curGroup.isMandatoryQuantifier() || quantifier.getMin() > 0; + popGroupExit(); + cur = curTerm; + // Set the current group node as the path's target to indicate we want to + // generate an EMPTY_STATE for it. The empty state allows the backtracking + // engine to loop without consuming characters. + curPath.add(PathElement.create(cur)); + return true; + } + // otherwise, retreat. + return retreat(); } Sequence parentSeq = (Sequence) curTerm.getParent(); if (curTerm == (forward ? parentSeq.getLastTerm() : parentSeq.getFirstTerm())) { @@ -578,33 +611,6 @@ private boolean advanceTerm(Term term) { return false; } - /** - * Advances past a {@link Group} with an empty-guard. This can produce a transition to the - * special empty-match state that is represented by setting the successor to the quantified - * group. 
- * - * @return {@code true} if a successor (the quantified group) was reached in this step - */ - private boolean advanceEmptyGuard(Term curTerm) { - // We found a zero-width match group with a quantifier. - // In flavors where we cannot have empty loop iterations (JavaScript), we generate - // transitions to the special empty-match state only for bounded quantifiers which haven't - // been unrolled. In flavors where we can have empty loop iterations, we generate - // transitions to the empty-match state unconditionally. This ensures that we do not try to - // generate NFA transitions that span multiple repetitions of the same quantified group, - // potentially leading to non-terminating NFA generation. - if (ast.getOptions().getFlavor().canHaveEmptyLoopIterations() || - (curTerm.isQuantifiableTerm() && curTerm.asQuantifiableTerm().hasNotUnrolledQuantifier() && curTerm.asQuantifiableTerm().getQuantifier().getMin() > 0)) { - assert curTerm.isGroup(); - // By returning the quantified group itself, we map the transition target to the special - // empty-match state. - cur = curTerm; - return true; - } else { - return retreat(); - } - } - /** * Backtrack through the traversal and find an unexplored alternative. 
* @@ -613,42 +619,31 @@ private boolean advanceEmptyGuard(Term curTerm) { private boolean retreat() { shouldRetreat = false; while (!curPath.isEmpty()) { - long lastVisited = curPath.peek(); - RegexASTNode node = pathGetNode(lastVisited); - if (pathIsGroup(lastVisited)) { + long lastElement = curPath.peek(); + RegexASTNode node = pathGetNode(lastElement); + if (PathElement.isGroup(lastElement)) { Group group = (Group) node; - if (pathIsGroupEnter(lastVisited) || pathIsGroupPassThrough(lastVisited)) { - if (pathGroupHasNext(lastVisited)) { - if (pathIsGroupPassThrough(lastVisited) && group.isLoop()) { - // a passthrough node was changed to an enter node, - // so we register the loop in insideLoops - registerInsideLoop(group); - } + if (PathElement.isGroupEnter(lastElement) || PathElement.isGroupPassThrough(lastElement)) { + if (pathGroupHasNext(lastElement)) { switchNextGroupAlternative(group); if (shouldRetreat) { return retreat(); } - cur = pathGroupGetNext(lastVisited); + cur = pathGroupGetNext(lastElement); return deduplicatePath(true); } else { - if (pathIsGroupEnter(lastVisited)) { + if (PathElement.isGroupEnter(lastElement)) { popGroupEnter(); } else { - assert pathIsGroupPassThrough(lastVisited); + assert PathElement.isGroupPassThrough(lastElement); popGroupPassThrough(); } - if (pathIsGroupEnter(lastVisited) && group.isLoop()) { - // we only deregister the node from insideLoops if this was an enter - // node, if it was a passthrough node, it was already deregistered when - // it was transformed from an enter node in doAdvance - unregisterInsideLoop(group); - } if (group.hasEmptyGuard()) { insideEmptyGuardGroup.clear(group.getGroupsWithGuardsIndex()); } } - } else if (ast.getOptions().getFlavor().failingEmptyChecksDontBacktrack() && pathIsGroupExit(lastVisited) && group.hasQuantifier() && group.getQuantifier().hasZeroWidthIndex()) { - // In Ruby, Python and OracleDB, when we finish an iteration of a loop, there is + } else if 
(PathElement.isGroupExit(lastElement) && needsZeroWidthEscape(group)) { + // In Ruby and OracleDB, when we finish an iteration of a loop, there is // an empty check. If we pass the empty check, we return to the beginning of the // loop where we get to make a non-deterministic choice whether we want to start // another iteration of the loop (so far the same as ECMAScript). However, if we @@ -659,6 +654,8 @@ private boolean retreat() { // (exitZeroWidth and escapeZeroWidth, respectively), so that at runtime, only // one of the two transitions will be admissible. The clause below lets us // generate the second transition by replacing the loop exit with a loop escape. + // In ECMAScript, we use the same mechanism to fast-forward mandatory quantifier + // parts when we find a zero-width match for the quantified expression. switchExitToEscape(group); if (shouldRetreat) { return retreat(); @@ -670,39 +667,19 @@ private boolean retreat() { pushGroupExit(parentGroup); return advanceTerm(parentGroup); } else { - if (pathIsGroupExit(lastVisited)) { + if (PathElement.isGroupExit(lastElement)) { popGroupExit(); } else { - assert pathIsGroupEscape(lastVisited); + assert PathElement.isGroupEscape(lastElement); popGroupEscape(group); } } } else { curPath.pop(); if (canTraverseLookArounds() && node.isLookAroundAssertion()) { - if (node.isLookAheadAssertion()) { - leaveLookAhead(node.asLookAheadAssertion()); - } - removeLookAroundFromVisitedSet(lastVisited); + popLookAround(node, lastElement); } else if (node.isPositionAssertion()) { - switch (node.asPositionAssertion().type) { - case CARET -> { - caretsOnPath--; - } - case DOLLAR -> { - dollarsOnPath--; - } - case MATCH_BEGIN -> { - if (!ignoreMatchBoundaryAssertions) { - matchBeginAssertionsOnPath--; - } - } - case MATCH_END -> { - if (!ignoreMatchBoundaryAssertions) { - matchEndAssertionsOnPath--; - } - } - } + popPositionAssertion(node); } } } @@ -710,6 +687,34 @@ private boolean retreat() { return false; } + private void 
popLookAround(RegexASTNode node, long pathElement) { + if (node.isLookAheadAssertion()) { + leaveLookAhead(node.asLookAheadAssertion()); + } + removeLookAroundFromVisitedSet(pathElement); + } + + private void popPositionAssertion(RegexASTNode node) { + switch (node.asPositionAssertion().type) { + case CARET -> { + caretsOnPath--; + } + case DOLLAR -> { + dollarsOnPath--; + } + case MATCH_BEGIN -> { + if (!ignoreMatchBoundaryAssertions) { + matchBeginAssertionsOnPath--; + } + } + case MATCH_END -> { + if (!ignoreMatchBoundaryAssertions) { + matchEndAssertionsOnPath--; + } + } + } + } + /** * This should be called whenever {@link #cur} is set to some {@link Sequence}. * @@ -720,7 +725,13 @@ private boolean deduplicatePath(boolean internal) { if (shouldRetreat) { return retreat(); } - // interal == true means that this is being called during traversal, before reaching a + if (internal && getFlavor().emptyChecksMonitorCaptureGroups()) { + // in Ruby, we don't deduplicate on intermediate Sequence nodes, because due to the + // "empty checks monitor capture groups" property, we may have to generate transitions + // that represent multiple loop iterations in a quantified expression. + return false; + } + // internal == true means that this is being called during traversal, before reaching a // successor node (these calls are made in regular intervals, whenever a new Sequence is // entered). // This method is also called for every successor we have found (internal == false). In @@ -732,7 +743,7 @@ private boolean deduplicatePath(boolean internal) { // encountered first will dominate the one found later and any empty capture groups that // would have been matched along the way cannot affect future matching. 
boolean captureGroupsMatter = !cur.isMatchFound() && - ((ast.getOptions().getFlavor().backreferencesToUnmatchedGroupsFail() && ast.getProperties().hasBackReferences()) || + ((getFlavor().backreferencesToUnmatchedGroupsFail() && ast.getProperties().hasBackReferences()) || (isBuildingDFA() && ast.getProperties().hasConditionalBackReferences())); long id = cur.getId(); @@ -777,6 +788,9 @@ private boolean deduplicatePath(boolean internal) { } private void dedupKeyAddGroupBoundaries(TBitSet boundaries) { + // We only care about groups referenced by back-references when de-duplicating transitions. + // Without back-references, the first possible transition to the same target always + // dominates the others. long[] bitset = boundaries.getInternalArray(); long[] referenced = referencedGroupBoundaries.getInternalArray(); assert bitset.length == referenced.length; @@ -785,93 +799,25 @@ private void dedupKeyAddGroupBoundaries(TBitSet boundaries) { } } - /** - * First field: (short) group alternation index. This value is used to iterate the alternations - * of groups referenced in a group-enter path element.
- * Since the same group can appear multiple times on the path, we cannot reuse {@link Group}'s - * implementation of {@link RegexASTVisitorIterable}. Therefore, every occurrence of a group on - * the path has its own index for iterating and back-tracking over its alternatives. - */ - private static final int PATH_GROUP_ALT_INDEX_OFFSET = 0; - /** - * Second field: (int) id of the path element's {@link RegexASTNode}. - */ - private static final int PATH_NODE_OFFSET = Short.SIZE; - /** - * Third field: group action. Every path element referencing a group must have one of four - * possible group actions: - *
    - *
  • group enter
  • - *
  • group exit
  • - *
  • group pass through
  • - *
  • group escape
  • - *
- */ - private static final int PATH_GROUP_ACTION_OFFSET = Short.SIZE + Integer.SIZE; - private static final long PATH_GROUP_ACTION_ENTER = 1L << PATH_GROUP_ACTION_OFFSET; - private static final long PATH_GROUP_ACTION_EXIT = 1L << PATH_GROUP_ACTION_OFFSET + 1; - private static final long PATH_GROUP_ACTION_PASS_THROUGH = 1L << PATH_GROUP_ACTION_OFFSET + 2; - private static final long PATH_GROUP_ACTION_ESCAPE = 1L << PATH_GROUP_ACTION_OFFSET + 3; - private static final long PATH_GROUP_ACTION_ANY = PATH_GROUP_ACTION_ENTER | PATH_GROUP_ACTION_EXIT | PATH_GROUP_ACTION_PASS_THROUGH | PATH_GROUP_ACTION_ESCAPE; - - /** - * Create a new path element containing the given node. - */ - private static long createPathElement(RegexASTNode node) { - return (long) node.getId() << PATH_NODE_OFFSET; - } - - private static int pathGetNodeId(long pathElement) { - return (int) (pathElement >>> PATH_NODE_OFFSET); + private static boolean canMatchEmptyString(RegexASTNode node) { + if (node.isBackReference()) { + return node.asBackReference().mayMatchEmptyString(); + } + return !node.isCharacterClass(); } /** * Get the {@link RegexASTNode} contained in the given path element. */ private RegexASTNode pathGetNode(long pathElement) { - return ast.getState(pathGetNodeId(pathElement)); - } - - /** - * Get the group alternation index of the given path element. - */ - private static int pathGetGroupAltIndex(long pathElement) { - return (short) (pathElement >>> PATH_GROUP_ALT_INDEX_OFFSET); - } - - /** - * Returns {@code true} if the given path element has any group action set. Every path element - * containing a group must have one group action. 
- */ - private static boolean pathIsGroup(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_ANY) != 0; - } - - private static boolean pathIsGroupEnter(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_ENTER) != 0; - } - - private static boolean pathIsGroupExit(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_EXIT) != 0; - } - - private static boolean pathIsGroupPassThrough(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_PASS_THROUGH) != 0; - } - - private static boolean pathIsGroupEscape(long pathElement) { - return (pathElement & PATH_GROUP_ACTION_ESCAPE) != 0; - } - - private static boolean pathIsGroupExitOrEscape(long pathElement) { - return (pathElement & (PATH_GROUP_ACTION_EXIT | PATH_GROUP_ACTION_ESCAPE)) != 0; + return ast.getState(PathElement.getNodeId(pathElement)); } /** * Returns {@code true} if the path element's group alternation index is still in bounds. */ private boolean pathGroupHasNext(long pathElement) { - return pathGetGroupAltIndex(pathElement) < ((Group) pathGetNode(pathElement)).size(); + return PathElement.getGroupAltIndex(pathElement) < ((Group) pathGetNode(pathElement)).size(); } /** @@ -879,16 +825,17 @@ private boolean pathGroupHasNext(long pathElement) { * the group alternation index! 
*/ private Sequence pathGroupGetNext(long pathElement) { - return ((Group) pathGetNode(pathElement)).getAlternatives().get(pathGetGroupAltIndex(pathElement)); + return ((Group) pathGetNode(pathElement)).getAlternatives().get(PathElement.getGroupAltIndex(pathElement)); } protected boolean isRootEnterOnPath() { - return isGroupEnterOnPath(ast.getRoot().getId()); + return isGroupEnterOnPath(ast.getRoot()); } - private boolean isGroupEnterOnPath(int groupNodeId) { + private boolean isGroupEnterOnPath(Group group) { + int groupNodeId = group.getId(); for (long element : curPath) { - if (pathGetNodeId(element) == groupNodeId && pathIsGroupEnter(element)) { + if (PathElement.getNodeId(element) == groupNodeId && PathElement.isGroupEnter(element)) { return true; } } @@ -897,48 +844,48 @@ private boolean isGroupEnterOnPath(int groupNodeId) { /// Pushing and popping group elements to and from the path private void pushGroupEnter(Group group, int groupAltIndex) { - curPath.add(createPathElement(group) | (groupAltIndex << PATH_GROUP_ALT_INDEX_OFFSET) | PATH_GROUP_ACTION_ENTER); + curPath.add(PathElement.createGroupEnter(group, groupAltIndex)); recalcTransitionGuards = true; } private int popGroupEnter() { long pathEntry = curPath.pop(); - assert pathIsGroupEnter(pathEntry); + assert PathElement.isGroupEnter(pathEntry); recalcTransitionGuards = true; - return pathGetGroupAltIndex(pathEntry); + return PathElement.getGroupAltIndex(pathEntry); } private void switchNextGroupAlternative(Group group) { int groupAltIndex; - if (pathIsGroupEnter(curPath.peek())) { + if (PathElement.isGroupEnter(curPath.peek())) { groupAltIndex = popGroupEnter(); } else { - assert pathIsGroupPassThrough(curPath.peek()); + assert PathElement.isGroupPassThrough(curPath.peek()); groupAltIndex = popGroupPassThrough(); } pushGroupEnter(group, groupAltIndex + 1); } private void pushGroupExit(Group group) { - curPath.add(createPathElement(group) | PATH_GROUP_ACTION_EXIT); + 
curPath.add(PathElement.createGroupExit(group)); recalcTransitionGuards = true; } private void popGroupExit() { long pathEntry = curPath.pop(); - assert pathIsGroupExit(pathEntry); + assert PathElement.isGroupExit(pathEntry); recalcTransitionGuards = true; } private void pushGroupPassThrough(Group group, int groupAltIndex) { - curPath.add(createPathElement(group) | PATH_GROUP_ACTION_PASS_THROUGH | (groupAltIndex << PATH_GROUP_ALT_INDEX_OFFSET)); + curPath.add(PathElement.createGroupPassThrough(group, groupAltIndex)); recalcTransitionGuards = true; } private int popGroupPassThrough() { long pathEntry = curPath.pop(); - int groupAltIndex = pathGetGroupAltIndex(pathEntry); - assert pathIsGroupPassThrough(pathEntry); + int groupAltIndex = PathElement.getGroupAltIndex(pathEntry); + assert PathElement.isGroupPassThrough(pathEntry); recalcTransitionGuards = true; return groupAltIndex; } @@ -954,13 +901,14 @@ private void switchExitToEscape(Group group) { } private void pushGroupEscape(Group group) { - curPath.add(createPathElement(group) | PATH_GROUP_ACTION_ESCAPE); + long groupEscape = PathElement.createGroupEscape(group); + curPath.add(groupEscape); recalcTransitionGuards = true; } private void popGroupEscape(Group group) { long pathEntry = curPath.pop(); - assert pathIsGroupEscape(pathEntry); + assert PathElement.isGroupEscape(pathEntry); assert group == pathGetNode(pathEntry); recalcTransitionGuards = true; } @@ -969,6 +917,7 @@ private void popGroupEscape(Group group) { private void clearCaptureGroupData() { captureGroupUpdates.clear(); captureGroupClears.clear(); + firstGroup = -1; lastGroup = -1; } @@ -982,6 +931,11 @@ private TBitSet getCaptureGroupClears() { return captureGroupClears; } + private int getFirstGroup() { + calcTransitionGuards(); + return firstGroup; + } + private int getLastGroup() { calcTransitionGuards(); return lastGroup; @@ -989,34 +943,16 @@ private int getLastGroup() { private LongArrayBuffer getTransitionGuards() { calcTransitionGuards(); - 
return transitionGuards; + return transitionGuardsCanonicalized; } private void calcTransitionGuards() { if (recalcTransitionGuards) { - if (useTransitionGuards()) { - calculateTransitionGuards(); - } else { - calculateGroupBoundaries(); - } + calculateTransitionGuards(); recalcTransitionGuards = false; } } - private void calculateGroupBoundaries() { - clearCaptureGroupData(); - for (long element : curPath) { - if (pathIsGroup(element)) { - Group group = (Group) pathGetNode(element); - if (pathIsGroupEnter(element)) { - calcGroupBoundariesEnter(group); - } else if (pathIsGroupExitOrEscape(element)) { - calcGroupBoundariesExit(group); - } - } - } - } - private int getBoundaryIndexStart(Group group) { return forward ? group.getBoundaryIndexStart() : group.getBoundaryIndexEnd(); } @@ -1028,10 +964,13 @@ private int getBoundaryIndexEnd(Group group) { private void calcGroupBoundariesEnter(Group group) { if (group.isCapturing()) { captureGroupUpdate(getBoundaryIndexStart(group)); + if (updatesLastGroupField(group) && firstGroup == -1) { + firstGroup = group.getGroupNumber(); + } } - if (!ast.getOptions().getFlavor().nestedCaptureGroupsKeptOnLoopReentry() && group.hasQuantifier() && group.hasEnclosedCaptureGroups()) { - int lo = Group.groupNumberToBoundaryIndexStart(group.getEnclosedCaptureGroupsLow()); - int hi = Group.groupNumberToBoundaryIndexEnd(group.getEnclosedCaptureGroupsHigh() - 1); + if (clearsEnclosedGroups(group)) { + int lo = Group.groupNumberToBoundaryIndexStart(group.getEnclosedCaptureGroupsLo()); + int hi = Group.groupNumberToBoundaryIndexEnd(group.getEnclosedCaptureGroupsHi() - 1); captureGroupClears.setRange(lo, hi); captureGroupUpdates.clearRange(lo, hi); } @@ -1040,7 +979,7 @@ private void calcGroupBoundariesEnter(Group group) { private void calcGroupBoundariesExit(Group group) { if (group.isCapturing()) { captureGroupUpdate(getBoundaryIndexEnd(group)); - if (ast.getOptions().getFlavor().usesLastGroupResultField() && group.getGroupNumber() != 0) { + if 
(updatesLastGroupField(group)) { lastGroup = group.getGroupNumber(); } } @@ -1053,51 +992,74 @@ private void captureGroupUpdate(int boundary) { private void calculateTransitionGuards() { clearCaptureGroupData(); - boundedQuantifiersLoop.clear(); - boundedQuantifiersExited.clear(); + bqExited.clear(); + bqBypassed.clear(); + Arrays.fill(bqLastZeroWidthEnter, -1); + Arrays.fill(bqLastCounterReset, -1); transitionGuards.clear(); - for (long element : curPath) { - if (pathIsGroup(element)) { + transitionGuardsCanonicalized.clear(); + for (int i = 0; i < curPath.length(); i++) { + long element = curPath.get(i); + if (PathElement.isGroup(element)) { Group group = (Group) pathGetNode(element); - int groupAltIndex = pathGetGroupAltIndex(element); - if (pathIsGroupEnter(element)) { + int groupAltIndex = PathElement.getGroupAltIndex(element); + if (PathElement.isGroupEnter(element)) { if (group.hasQuantifier()) { Quantifier quantifier = group.getQuantifier(); if (quantifier.hasIndex()) { - if (!quantifier.isInfiniteLoop() && boundedQuantifiersLoop.get(quantifier.getIndex()) && !boundedQuantifiersExited.get(quantifier.getIndex())) { - pushTransitionGuard(TransitionGuard.createLoop(quantifier)); + if (bqExited.get(group.getGroupsWithGuardsIndex()) && !bqBypassed.get(group.getGroupsWithGuardsIndex())) { + if (group.isMandatoryQuantifier()) { + pushTransitionGuard(TransitionGuard.createCountLtMin(quantifier)); + } else if (!quantifier.isInfiniteLoop()) { + pushTransitionGuard(TransitionGuard.createCountLtMax(quantifier)); + } + pushTransitionGuard(TransitionGuard.createCountInc(quantifier)); } else { - pushTransitionGuard(TransitionGuard.createLoopInc(quantifier)); + if (group.isOptionalQuantifier()) { + pushTransitionGuard(TransitionGuard.createCountSetMin(quantifier)); + } else { + pushTransitionGuard(TransitionGuard.createCountSet1(quantifier)); + } } } + if (group.getEnclosedZeroWidthGroupsHi() - group.getEnclosedZeroWidthGroupsLo() > 0) { + 
bqBypassed.clearRange(group.getEnclosedZeroWidthGroupsLo(), group.getEnclosedZeroWidthGroupsHi() - 1); + bqExited.clearRange(group.getEnclosedZeroWidthGroupsLo(), group.getEnclosedZeroWidthGroupsHi() - 1); + } if (needsEmptyCheck(group)) { pushTransitionGuard(TransitionGuard.createEnterZeroWidth(quantifier)); } } - if (needsUpdateCGStepByStep(group) && !captureGroupUpdates.get(getBoundaryIndexStart(group))) { + if (needsUpdateCGStepByStep(group) && (getFlavor().usesLastGroupResultField() || !captureGroupUpdates.get(getBoundaryIndexStart(group)))) { pushTransitionGuard(TransitionGuard.createUpdateCG(getBoundaryIndexStart(group))); } calcGroupBoundariesEnter(group); if (group.isConditionalBackReferenceGroup()) { pushTransitionGuard(getConditionalBackReferenceGroupTransitionGuard(group, groupAltIndex)); } - } else if (pathIsGroupExitOrEscape(element)) { - if (pathIsGroupExit(element)) { + } else if (PathElement.isGroupExitOrEscape(element)) { + if (PathElement.isGroupExit(element)) { if (group.hasQuantifier()) { Quantifier quantifier = group.getQuantifier(); if (quantifier.hasIndex()) { - boundedQuantifiersLoop.set(quantifier.getIndex()); + if (!root.isGroup()) { + bqLastCounterReset[quantifier.getIndex()] = transitionGuards.length(); + } + bqExited.set(group.getGroupsWithGuardsIndex()); } if (needsEmptyCheck(group)) { pushTransitionGuard(TransitionGuard.createExitZeroWidth(quantifier)); } } - } else if (pathIsGroupEscape(element)) { + } else if (PathElement.isGroupEscape(element)) { if (group.hasQuantifier()) { Quantifier quantifier = group.getQuantifier(); if (quantifier.hasIndex()) { - boundedQuantifiersExited.set(quantifier.getIndex()); - pushTransitionGuard(TransitionGuard.createExitReset(quantifier)); + bqLastCounterReset[quantifier.getIndex()] = transitionGuards.length(); + if (bqBypassed.get(group.getGroupsWithGuardsIndex())) { + setShouldRetreat(); + } + bqBypassed.set(group.getGroupsWithGuardsIndex()); } if (quantifier.hasZeroWidthIndex()) { 
pushTransitionGuard(TransitionGuard.createEscapeZeroWidth(quantifier)); @@ -1105,50 +1067,122 @@ private void calculateTransitionGuards() { } } pushRecursiveBackrefUpdates(group); - if (needsUpdateCGStepByStep(group) && !captureGroupUpdates.get(getBoundaryIndexEnd(group))) { + if (needsUpdateCGStepByStep(group) && (getFlavor().usesLastGroupResultField() || !captureGroupUpdates.get(getBoundaryIndexEnd(group)))) { pushTransitionGuard(TransitionGuard.createUpdateCG(getBoundaryIndexEnd(group))); } calcGroupBoundariesExit(group); - } else if (pathIsGroupPassThrough(element)) { + } else if (PathElement.isGroupPassThrough(element)) { Group quantifierGroup = getQuantifiedGroupFromPassthrough(group, groupAltIndex); Quantifier quantifier = quantifierGroup.getQuantifier(); if (!quantifierGroup.isExpandedQuantifier()) { - if (quantifier.hasIndex()) { + if (quantifierGroup.isDead()) { if (quantifier.getMin() > 0) { - boundedQuantifiersExited.set(quantifier.getIndex()); - pushTransitionGuard(TransitionGuard.createExit(quantifier)); - } else { - pushTransitionGuard(TransitionGuard.createExitReset(quantifier)); + setShouldRetreat(); } - } else { - assert quantifierGroup.isDead(); - if (quantifier.getMin() > 0) { - shouldRetreat = true; + } else if (quantifier.hasIndex()) { + if (bqBypassed.get(quantifierGroup.getGroupsWithGuardsIndex())) { + setShouldRetreat(); + } + bqBypassed.set(quantifierGroup.getGroupsWithGuardsIndex()); + if (quantifierGroup.isMandatoryQuantifier() || quantifier.getMin() > 0 && !quantifierGroup.isOptionalQuantifier()) { + if (!bqExited.get(quantifierGroup.getGroupsWithGuardsIndex())) { + setShouldRetreat(); + } + if (quantifier.getMin() > 0) { + pushTransitionGuard(TransitionGuard.createCountGeMin(quantifier)); + } } } } } } } + for (int i = 0; i < transitionGuards.length(); i++) { + long guard = transitionGuards.get(i); + if (shouldKeepGuard(guard, i)) { + transitionGuardsCanonicalized.add(guard); + } + } + } + + private boolean shouldKeepGuard(long guard, 
int guardPosition) { + switch (TransitionGuard.getKind(guard)) { + case countSet1, countInc, countSetMin -> { + return getFlavor().emptyChecksMonitorCaptureGroups() || guardPosition >= bqLastCounterReset[TransitionGuard.getQuantifierIndex(guard)]; + } + case enterZeroWidth -> { + int zeroWidthQuantifierIndex = TransitionGuard.getZeroWidthQuantifierIndex(guard); + Group quantifiedTerm = (Group) ast.getZeroWidthQuantifiables().get(zeroWidthQuantifierIndex); + // we need to keep enterZeroWidth guards if the quantified expression can contain + // NFA states that don't consume any characters, or the expression contains capture + // groups referred to by back-references. In the case of referenced groups, the + // guard is needed just to differentiate transitions in nested quantifiers, + // because these may require additional backtracking, e.g. matching + // /a(b*)*c\\1d/ against "abbbbcbbd" + return bqLastZeroWidthEnter[zeroWidthQuantifierIndex] == guardPosition && (quantifiedTerm.hasCaret() || + quantifiedTerm.hasLookArounds() || + quantifiedTerm.hasBackReferences() || + quantifiedTerm.hasAtomicGroups() || + hasReferencedCaptureGroups(quantifiedTerm) || + (cur.isGroup() && cur.asGroup().getQuantifier().getZeroWidthIndex() != zeroWidthQuantifierIndex)); + } + case updateRecursiveBackrefPointer -> { + for (int i = transitionGuards.length() - 1; i > guardPosition; i--) { + if (transitionGuards.get(i) == guard) { + return false; + } + } + return true; + } + default -> { + return true; + } + } + } + + private boolean hasReferencedCaptureGroups(Group quantifiedTerm) { + if (!ast.getProperties().hasBackReferences() || !quantifiedTerm.hasCaptureGroups()) { + return false; + } + referencedCaptureGroupsTmp.clear(); + referencedCaptureGroupsTmp.setRange(quantifiedTerm.getCaptureGroupsLo(), quantifiedTerm.getCaptureGroupsHi() - 1); + return !ast.getReferencedGroups().isDisjoint(referencedCaptureGroupsTmp); } private static Group getQuantifiedGroupFromPassthrough(Group group, int 
groupAltIndex) { assert group.size() == 2 && groupAltIndex - 1 >= 0 && groupAltIndex - 1 <= 1; int otherAltIndex = (groupAltIndex - 1) ^ 1; Sequence otherAlternative = group.getAlternatives().get(otherAltIndex); - assert !otherAlternative.isEmpty() && otherAlternative.get(0).isGroup(); - Group quantifierGroup = otherAlternative.get(0).asGroup(); + Term quantifiedTerm = group.isInLookBehindAssertion() ? otherAlternative.getLastTerm() : otherAlternative.getFirstTerm(); + assert !otherAlternative.isEmpty() && quantifiedTerm.isGroup(); + Group quantifierGroup = quantifiedTerm.asGroup(); assert quantifierGroup.hasQuantifier(); return quantifierGroup; } private boolean needsUpdateCGStepByStep(Group group) { - return ast.getOptions().getFlavor().matchesTransitionsStepByStep() && group.isCapturing(); + return getFlavor().matchesTransitionsStepByStep() && group.isCapturing(); } private boolean needsEmptyCheck(Group group) { assert group.hasQuantifier(); - return group.getQuantifier().hasZeroWidthIndex() && (ast.getOptions().getFlavor().emptyChecksOnMandatoryLoopIterations() || !group.isMandatoryUnrolledQuantifier()); + return group.getQuantifier().hasZeroWidthIndex() && (getFlavor().emptyChecksOnMandatoryLoopIterations() || !group.isMandatoryUnrolledQuantifier()); + } + + private boolean needsZeroWidthEscape(Group group) { + if (getFlavor().failingEmptyChecksDontBacktrack()) { + return group.hasQuantifier() && group.getQuantifier().hasZeroWidthIndex(); + } else { + return group.hasNotUnrolledQuantifier() && group.getQuantifier().hasZeroWidthIndex() && group.getQuantifier().getMin() > 0 && group.isMandatoryQuantifier(); + } + } + + private boolean clearsEnclosedGroups(Group group) { + return !getFlavor().nestedCaptureGroupsKeptOnLoopReentry() && group.hasQuantifier() && group.hasEnclosedCaptureGroups(); + } + + private boolean updatesLastGroupField(Group group) { + return getFlavor().usesLastGroupResultField() && group.isCapturing() && group.getGroupNumber() != 0; } private 
static long getConditionalBackReferenceGroupTransitionGuard(Group group, int groupAltIndex) { @@ -1163,33 +1197,31 @@ private static long getConditionalBackReferenceGroupTransitionGuard(Group group, } private void pushRecursiveBackrefUpdates(Group group) { - if (ast.getOptions().getFlavor().supportsRecursiveBackreferences() && ast.getProperties().hasRecursiveBackReferences()) { + if (getFlavor().supportsRecursiveBackreferences() && ast.getProperties().hasRecursiveBackReferences()) { if (group.isCapturing() && ast.isGroupRecursivelyReferenced(group.getGroupNumber())) { pushTransitionGuard(TransitionGuard.createUpdateRecursiveBackref(group.getGroupNumber())); } } } - /// Quantifier guard data handling - private boolean useTransitionGuards() { - // In some flavors, we need to calculate quantifier guards even when building DFAs, since - // these guards represent critical semantic details. While these guards would be ignored by - // the DFA at runtime, they are all resolved statically during this traversal. This is - // checked by ASTStepVisitor#noPredicatesInGuards. - return !isBuildingDFA() || ast.getOptions().getFlavor().canHaveEmptyLoopIterations(); - } - private void clearTransitionGuards() { transitionGuards.clear(); + transitionGuardsCanonicalized.clear(); } private void pushTransitionGuard(long guard) { - assert useTransitionGuards(); // First, we check whether the guard can be resolved statically. If it is trivially true, // we ignore it (normalization). If it is impossible to satisfy, we backtrack. 
switch (TransitionGuard.getKind(guard)) { - case exitZeroWidth: - case escapeZeroWidth: { + case countSet1, countSetMin -> { + bqLastCounterReset[TransitionGuard.getQuantifierIndex(guard)] = transitionGuards.length(); + } + case countLtMin, countGeMin, countLtMax -> { + if (canOmitCounterCheck(guard)) { + return; + } + } + case exitZeroWidth, escapeZeroWidth -> { boolean keptAliveByConsumedInput = false; boolean keptAliveByCaptureGroups = false; if (!transitionGuards.isEmpty() && transitionGuards.peek() == guard) { @@ -1204,76 +1236,151 @@ private void pushTransitionGuard(long guard) { enterFound = true; break; } - if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && TransitionGuard.is(tg, TransitionGuard.Kind.updateCG)) { + if (getFlavor().emptyChecksMonitorCaptureGroups() && TransitionGuard.is(tg, TransitionGuard.Kind.updateCG)) { keptAliveByCaptureGroups = true; } } if (!enterFound) { // We did not find any corresponding enterZeroWidth, so exitZeroWidth will // pass because of input being consumed. - keptAliveByConsumedInput = isBuildingDFA() || root.isCharacterClass(); + keptAliveByConsumedInput = isBuildingDFA() || !canMatchEmptyString(root); } boolean keptAlive = keptAliveByConsumedInput || keptAliveByCaptureGroups; - if (isBuildingDFA()) { - // TODO: We should be able to eliminate some of these - // exitZeroWidth/escapeZeroWidth guards even - // when not building a DFA. 
- if ((TransitionGuard.is(guard, TransitionGuard.Kind.exitZeroWidth) && !keptAlive) || (TransitionGuard.is(guard, TransitionGuard.Kind.escapeZeroWidth) && keptAlive)) { - shouldRetreat = true; + boolean isExit = TransitionGuard.is(guard, TransitionGuard.Kind.exitZeroWidth); + boolean isEscape = TransitionGuard.is(guard, TransitionGuard.Kind.escapeZeroWidth); + int zeroWidthQuantifierIndex = TransitionGuard.getZeroWidthQuantifierIndex(guard); + if (isEscape) { + bqLastZeroWidthEnter[zeroWidthQuantifierIndex] = -1; + } + if ((isExit && !keptAlive) || (isEscape && keptAlive)) { + if (isBuildingDFA() || (isExit && enterFound) || !canMatchEmptyString(root) || root.isMatchFound()) { + setShouldRetreat(); } + } + if (isBuildingDFA() || !canMatchEmptyString(root) || root.isMatchFound() || + (root.isGroup() && root.asGroup().getQuantifier().getZeroWidthIndex() == zeroWidthQuantifierIndex) || + (isEscape && enterFound && !keptAliveByCaptureGroups)) { return; } - break; } - case enterZeroWidth: { - // If there is another enterZeroWidth for the same group in the quantifier guards - // and there are no CG updates in between, then this new enterZeroWidth is - // redundant. - for (int i = transitionGuards.length() - 1; i >= 0; i--) { - long tg = transitionGuards.get(i); - if (ast.getOptions().getFlavor().emptyChecksMonitorCaptureGroups() && TransitionGuard.is(tg, TransitionGuard.Kind.updateCG)) { - break; - } - if (tg == guard) { - return; + case enterZeroWidth -> { + int zeroWidthQuantifierIndex = TransitionGuard.getZeroWidthQuantifierIndex(guard); + if (bqLastZeroWidthEnter[zeroWidthQuantifierIndex] < 0) { + bqLastZeroWidthEnter[zeroWidthQuantifierIndex] = transitionGuards.length(); + } else if (getFlavor().emptyChecksMonitorCaptureGroups()) { + // If there is another enterZeroWidth for the same group in the quantifier + // guards and there are no CG updates in between, then this new enterZeroWidth + // is redundant. 
+ for (int i = transitionGuards.length() - 1; i >= bqLastZeroWidthEnter[zeroWidthQuantifierIndex]; i--) { + if (TransitionGuard.is(transitionGuards.get(i), TransitionGuard.Kind.updateCG)) { + bqLastZeroWidthEnter[zeroWidthQuantifierIndex] = transitionGuards.length(); + break; + } } } - break; } - case checkGroupMatched: - case checkGroupNotMatched: { + case checkGroupMatched, checkGroupNotMatched -> { assert (isBuildingDFA() && getMatchedConditionGroups() != null) == this instanceof ASTStepVisitor; if (isBuildingDFA() && getMatchedConditionGroups() != null) { int referencedGroupNumber = TransitionGuard.getGroupNumber(guard); int groupEndIndex = Group.groupNumberToBoundaryIndexEnd(referencedGroupNumber); boolean groupMatched = (getMatchedConditionGroups().get(referencedGroupNumber) && !captureGroupClears.get(groupEndIndex)) || captureGroupUpdates.get(groupEndIndex); if ((TransitionGuard.is(guard, TransitionGuard.Kind.checkGroupMatched)) != groupMatched) { - shouldRetreat = true; + setShouldRetreat(); } return; } - break; } } transitionGuards.add(guard); } + private boolean canOmitCounterCheck(long guard) { + assert TransitionGuard.is(guard, TransitionGuard.Kind.countLtMin) || TransitionGuard.is(guard, TransitionGuard.Kind.countGeMin) || TransitionGuard.is(guard, TransitionGuard.Kind.countLtMax); + int quantifierIndex = TransitionGuard.getQuantifierIndex(guard); + int min = ast.getQuantifier(quantifierIndex).getMin(); + int max = ast.getQuantifier(quantifierIndex).getMax(); + int minPlus1 = saturatingInc(min); + + long countLtMin = TransitionGuard.createCountLtMin(quantifierIndex); + long countGeMin = TransitionGuard.createCountGeMin(quantifierIndex); + long countLtMax = TransitionGuard.createCountLtMax(quantifierIndex); + long countInc = TransitionGuard.createCountInc(quantifierIndex); + long countSetMin = TransitionGuard.createCountSetMin(quantifierIndex); + long countSet1 = TransitionGuard.createCountSet1(quantifierIndex); + + int counterLow = 0; + int counterHigh 
= Integer.MAX_VALUE; + for (long existingGuard : transitionGuards) { + if (existingGuard == countLtMin) { + counterHigh = Math.min(counterHigh, min - 1); + } else if (existingGuard == countGeMin) { + counterLow = Math.max(counterLow, min); + } else if (existingGuard == countLtMax) { + counterHigh = Math.min(counterHigh, max - 1); + } else if (existingGuard == countSetMin) { + counterLow = minPlus1; + counterHigh = minPlus1; + } else if (existingGuard == countSet1) { + counterLow = 1; + counterHigh = 1; + } else if (existingGuard == countInc) { + counterLow = saturatingInc(counterLow); + counterHigh = saturatingInc(counterHigh); + } + } + + switch (TransitionGuard.getKind(guard)) { + case countLtMin -> { + if (counterHigh < min) { + return true; + } else if (counterLow >= min) { + setShouldRetreat(); + return true; + } else { + return false; + } + } + case countLtMax -> { + if (counterHigh < max) { + return true; + } else if (counterLow >= max) { + setShouldRetreat(); + return true; + } else { + return false; + } + } + case countGeMin -> { + if (counterLow >= min) { + return true; + } else if (counterHigh < min) { + setShouldRetreat(); + return true; + } else { + return false; + } + } + default -> throw CompilerDirectives.shouldNotReachHere(); + } + } + /// Visited set management private void addLookAroundToVisitedSet() { LookAroundAssertion la = (LookAroundAssertion) cur; - lookAroundVisitiedCount[la.getGlobalSubTreeId()]++; + lookAroundVisitedCount[la.getGlobalSubTreeId()]++; lookAroundsOnPath.set(la.getGlobalSubTreeId()); } private void removeLookAroundFromVisitedSet(long pathElement) { LookAroundAssertion la = (LookAroundAssertion) pathGetNode(pathElement); - if (--lookAroundVisitiedCount[la.getGlobalSubTreeId()] == 0) { + if (--lookAroundVisitedCount[la.getGlobalSubTreeId()] == 0) { lookAroundsOnPath.clear(la.getGlobalSubTreeId()); } } - private boolean nodeVisitsEmpty() { - for (int i : lookAroundVisitiedCount) { + private static boolean isEmpty(int[] array) { 
+ for (int i : array) { if (i != 0) { return false; } @@ -1281,37 +1388,24 @@ private boolean nodeVisitsEmpty() { return true; } - /// insideLoops management - private void registerInsideLoop(Group group) { - if (!ast.getOptions().getFlavor().canHaveEmptyLoopIterations()) { - insideLoops.add(group); - } - } - - private void unregisterInsideLoop(Group group) { - if (!ast.getOptions().getFlavor().canHaveEmptyLoopIterations()) { - insideLoops.remove(group); - } - } - @SuppressWarnings("unused") private void dumpPath() { System.out.println("NEW PATH"); for (int i = 0; i < curPath.length(); i++) { long element = curPath.get(i); - if (pathIsGroup(element)) { + if (PathElement.isGroup(element)) { Group group = (Group) pathGetNode(element); - if (pathIsGroupEnter(element)) { - System.out.printf("ENTER (%d) %s%n", pathGetGroupAltIndex(element), group); - } else if (pathIsGroupExit(element)) { - System.out.printf("EXIT %s%n", group); - } else if (pathIsGroupPassThrough(element)) { - System.out.printf("PASSTHROUGH %s%n", group); + if (PathElement.isGroupEnter(element)) { + System.out.printf("ENTER (%2d) %2d %s%n", PathElement.getGroupAltIndex(element), group.getId(), group); + } else if (PathElement.isGroupExit(element)) { + System.out.printf("EXIT %2d %s%n", group.getId(), group); + } else if (PathElement.isGroupPassThrough(element)) { + System.out.printf("PASSTHROUGH %2d %s%n", group.getId(), group); } else { - System.out.printf("ESCAPE %s%n", group); + System.out.printf("ESCAPE %2d %s%n", group.getId(), group); } } else { - System.out.printf("NODE %s%n", pathGetNode(element)); + System.out.printf("NODE %2d %s%n", PathElement.getNodeId(element), pathGetNode(element)); } } } @@ -1345,4 +1439,99 @@ public int hashCode() { return hashCode; } } + + private static final class PathElement { + + /** + * First field: (short) group alternation index. This value is used to iterate the + * alternations of groups referenced in a group-enter path element.
+ * Since the same group can appear multiple times on the path, we cannot reuse + * {@link Group}'s implementation of {@link RegexASTVisitorIterable}. Therefore, every + * occurrence of a group on the path has its own index for iterating and back-tracking over + * its alternatives. + */ + private static final int PATH_GROUP_ALT_INDEX_OFFSET = 0; + /** + * Second field: (int) id of the path element's {@link RegexASTNode}. + */ + private static final int PATH_NODE_OFFSET = Short.SIZE; + /** + * Third field: group action. Every path element referencing a group must have one of four + * possible group actions: + *
<ul> + * <li>group enter</li> + * <li>group exit</li> + * <li>group pass through</li> + * <li>group escape</li> + * </ul>
+ */ + private static final int GROUP_ACTION_OFFSET = Short.SIZE + Integer.SIZE; + private static final long GROUP_ACTION_ENTER = 1L << GROUP_ACTION_OFFSET; + private static final long GROUP_ACTION_EXIT = 1L << GROUP_ACTION_OFFSET + 1; + private static final long GROUP_ACTION_PASS_THROUGH = 1L << GROUP_ACTION_OFFSET + 2; + private static final long GROUP_ACTION_ESCAPE = 1L << GROUP_ACTION_OFFSET + 3; + private static final long GROUP_ACTION_ANY = GROUP_ACTION_ENTER | GROUP_ACTION_EXIT | GROUP_ACTION_PASS_THROUGH | GROUP_ACTION_ESCAPE; + + /** + * Create a new path element containing the given node. + */ + private static long create(RegexASTNode node) { + return (long) node.getId() << PATH_NODE_OFFSET; + } + + private static long createGroupEnter(Group group, int groupAltIndex) { + return create(group) | (groupAltIndex << PathElement.PATH_GROUP_ALT_INDEX_OFFSET) | PathElement.GROUP_ACTION_ENTER; + } + + public static long createGroupPassThrough(Group group, int groupAltIndex) { + return create(group) | (groupAltIndex << PathElement.PATH_GROUP_ALT_INDEX_OFFSET) | PathElement.GROUP_ACTION_PASS_THROUGH; + } + + public static long createGroupExit(Group group) { + return create(group) | PathElement.GROUP_ACTION_EXIT; + } + + public static long createGroupEscape(Group group) { + return create(group) | PathElement.GROUP_ACTION_ESCAPE; + } + + private static int getNodeId(long pathElement) { + return (int) (pathElement >>> PATH_NODE_OFFSET); + } + + /** + * Get the group alternation index of the given path element. + */ + private static int getGroupAltIndex(long pathElement) { + return (short) (pathElement >>> PATH_GROUP_ALT_INDEX_OFFSET); + } + + /** + * Returns {@code true} if the given path element has any group action set. Every path + * element containing a group must have one group action. 
+ */ + private static boolean isGroup(long pathElement) { + return (pathElement & GROUP_ACTION_ANY) != 0; + } + + private static boolean isGroupEnter(long pathElement) { + return (pathElement & GROUP_ACTION_ENTER) != 0; + } + + private static boolean isGroupExit(long pathElement) { + return (pathElement & GROUP_ACTION_EXIT) != 0; + } + + private static boolean isGroupPassThrough(long pathElement) { + return (pathElement & GROUP_ACTION_PASS_THROUGH) != 0; + } + + private static boolean isGroupEscape(long pathElement) { + return (pathElement & GROUP_ACTION_ESCAPE) != 0; + } + + private static boolean isGroupExitOrEscape(long pathElement) { + return (pathElement & (GROUP_ACTION_EXIT | GROUP_ACTION_ESCAPE)) != 0; + } + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/PropagateDeadFlagVisitor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/PropagateDeadFlagVisitor.java new file mode 100644 index 00000000000..475eeee1db5 --- /dev/null +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/ast/visitors/PropagateDeadFlagVisitor.java @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2024, 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package com.oracle.truffle.regex.tregex.parser.ast.visitors; + +import com.oracle.truffle.regex.tregex.parser.ast.AtomicGroup; +import com.oracle.truffle.regex.tregex.parser.ast.BackReference; +import com.oracle.truffle.regex.tregex.parser.ast.CharacterClass; +import com.oracle.truffle.regex.tregex.parser.ast.Group; +import com.oracle.truffle.regex.tregex.parser.ast.LookAheadAssertion; +import com.oracle.truffle.regex.tregex.parser.ast.LookBehindAssertion; +import com.oracle.truffle.regex.tregex.parser.ast.PositionAssertion; +import com.oracle.truffle.regex.tregex.parser.ast.RegexASTNode; +import com.oracle.truffle.regex.tregex.parser.ast.Sequence; +import com.oracle.truffle.regex.tregex.parser.ast.SubexpressionCall; + +public class PropagateDeadFlagVisitor extends DepthFirstTraversalRegexASTVisitor { + + private int deadFlagsSeen = 0; + + public static void propagateDeadFlag(RegexASTNode runRoot) { + new PropagateDeadFlagVisitor().run(runRoot); + } + + private void mark(RegexASTNode node) { + if (deadFlagsSeen > 0) { + node.markAsDead(); + } + } + + private void incFlagsSeen(RegexASTNode node) { + if (node.isDead()) { + deadFlagsSeen++; + } + } + + private void decFlagsSeen(RegexASTNode node) { + if (node.isDead()) { + deadFlagsSeen--; + } + mark(node); + } + + @Override + protected void visit(BackReference backReference) { + mark(backReference); + } + + @Override + protected void visit(Group group) { + incFlagsSeen(group); + } + + @Override + protected void visit(Sequence sequence) { + incFlagsSeen(sequence); + } + + @Override + protected void visit(SubexpressionCall subexpressionCall) { + mark(subexpressionCall); + } + + @Override + protected void visit(PositionAssertion assertion) { + mark(assertion); + } + + @Override + protected void visit(LookBehindAssertion assertion) { + incFlagsSeen(assertion); + } + + @Override + protected void visit(LookAheadAssertion assertion) { + incFlagsSeen(assertion); + } + + @Override + protected void visit(AtomicGroup 
atomicGroup) { + incFlagsSeen(atomicGroup); + } + + @Override + protected void visit(CharacterClass characterClass) { + mark(characterClass); + } + + @Override + protected void leave(Group group) { + decFlagsSeen(group); + } + + @Override + protected void leave(Sequence sequence) { + decFlagsSeen(sequence); + } + + @Override + protected void leave(LookBehindAssertion assertion) { + decFlagsSeen(assertion); + } + + @Override + protected void leave(LookAheadAssertion assertion) { + decFlagsSeen(assertion); + } + + @Override + protected void leave(AtomicGroup atomicGroup) { + decFlagsSeen(atomicGroup); + } +} diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexLexer.java index 1643cd4b2a9..8145263caf9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexLexer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -46,6 +46,7 @@ import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.charset.ClassSetContents; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; @@ -137,19 +138,19 @@ protected CodePointSet getPOSIXCharClass(String name) { if (cps != null) { return cps; } - throw syntaxError(OracleDBErrorMessages.INVALID_CHARACTER_CLASS); + throw syntaxError(OracleDBErrorMessages.INVALID_CHARACTER_CLASS, ErrorCode.InvalidCharacterClass); } @Override protected void validatePOSIXCollationElement(String sequence) { assert !JavaStringUtil.isSingleCodePoint(sequence); - throw syntaxError(OracleDBErrorMessages.INVALID_COLLATION_ELEMENT); + throw syntaxError(OracleDBErrorMessages.INVALID_COLLATION_ELEMENT, ErrorCode.InvalidCharacterClass); } @Override protected void validatePOSIXEquivalenceClass(String sequence) { assert !JavaStringUtil.isSingleCodePoint(sequence); - throw syntaxError(OracleDBErrorMessages.INVALID_EQUIVALENCE_CLASS); + throw syntaxError(OracleDBErrorMessages.INVALID_EQUIVALENCE_CLASS, ErrorCode.InvalidCharacterClass); } @Override @@ -277,7 +278,7 @@ protected long boundedQuantifierMaxValue() { @Override protected RegexSyntaxException handleBoundedQuantifierOutOfOrder() { - return syntaxError(OracleDBErrorMessages.INVALID_INTERVAL); + return syntaxError(OracleDBErrorMessages.INVALID_INTERVAL, ErrorCode.InvalidQuantifier); } @Override @@ -297,23 +298,23 @@ protected Token handleBoundedQuantifierOverflow(long min, long max) { if (Long.compareUnsigned(min, max) > 0) { throw handleBoundedQuantifierOutOfOrder(); } - throw syntaxError(OracleDBErrorMessages.INVALID_INTERVAL); + throw syntaxError(OracleDBErrorMessages.INVALID_INTERVAL, ErrorCode.InvalidQuantifier); } 
@Override protected Token handleBoundedQuantifierOverflowMin(long min, long max) { - throw syntaxError(OracleDBErrorMessages.INVALID_INTERVAL); + throw syntaxError(OracleDBErrorMessages.INVALID_INTERVAL, ErrorCode.InvalidQuantifier); } @Override protected RegexSyntaxException handleCCRangeOutOfOrder(int startPos) { - return syntaxError(OracleDBErrorMessages.INVALID_RANGE); + return syntaxError(OracleDBErrorMessages.INVALID_RANGE, ErrorCode.InvalidCharacterClass); } @Override protected void handleCCRangeWithPredefCharClass(int startPos, ClassSetContents firstAtom, ClassSetContents secondAtom) { if ((firstAtom.isAllowedInRange() || !firstAtom.isCodePointSetOnly()) && secondAtom.isCodePointSetOnly()) { - throw syntaxError(OracleDBErrorMessages.INVALID_RANGE); + throw syntaxError(OracleDBErrorMessages.INVALID_RANGE, ErrorCode.InvalidCharacterClass); } } @@ -334,7 +335,7 @@ protected void handleIncompleteEscapeX() { @Override protected Token handleInvalidBackReference(int reference) { - throw syntaxError(OracleDBErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE); + throw syntaxError(OracleDBErrorMessages.MISSING_GROUP_FOR_BACKREFERENCE, ErrorCode.InvalidBackReference); } @Override @@ -393,7 +394,7 @@ protected void handleUnmatchedRightBrace() { @Override protected RegexSyntaxException handleUnmatchedLeftBracket() { - return syntaxError(OracleDBErrorMessages.UNMATCHED_LEFT_BRACKET); + return syntaxError(OracleDBErrorMessages.UNMATCHED_LEFT_BRACKET, ErrorCode.UnmatchedBracket); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexParser.java index 6cbd2955e55..d2917c73092 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexParser.java +++ 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/OracleDBRegexParser.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -47,6 +47,7 @@ import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.charset.ClassSetContents; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; @@ -168,7 +169,7 @@ public RegexAST parse() throws RegexSyntaxException { break; case quantifier: if (prevKind == Token.Kind.quantifier) { - throw syntaxError(OracleDBErrorMessages.NESTED_QUANTIFIER); + throw syntaxError(OracleDBErrorMessages.NESTED_QUANTIFIER, ErrorCode.InvalidQuantifier); } if (astBuilder.getCurTerm() == null || prevKind == Token.Kind.captureGroupBegin) { // quantifiers without target are ignored @@ -189,7 +190,7 @@ public RegexAST parse() throws RegexSyntaxException { break; case groupEnd: if (astBuilder.getCurGroup().getParent() instanceof RegexASTRootNode) { - throw syntaxError(OracleDBErrorMessages.UNMATCHED_RIGHT_PARENTHESIS); + throw syntaxError(OracleDBErrorMessages.UNMATCHED_RIGHT_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } astBuilder.popGroup(token); break; @@ -218,7 +219,7 @@ public RegexAST parse() throws RegexSyntaxException { } } if (!astBuilder.curGroupIsRoot()) { - throw syntaxError(OracleDBErrorMessages.UNTERMINATED_GROUP); + throw syntaxError(OracleDBErrorMessages.UNTERMINATED_GROUP, ErrorCode.UnmatchedParenthesis); } if (!literalStringBuffer.isEmpty()) { addLiteralString(literalStringBuffer); @@ -355,7 +356,7 @@ private 
void addLiteralString(IntArrayBuffer literalStringBuffer) { literalStringBuffer.clear(); } - private RegexSyntaxException syntaxError(String msg) { - return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition()); + private RegexSyntaxException syntaxError(String msg, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, lexer.getLastTokenPosition(), errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlags.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlags.java index 02031d2e877..b970ab8c3a8 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlags.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlags.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -63,7 +63,6 @@ public final class PythonFlags extends AbstractConstantKeysObject { private static final String PROP_IGNORECASE = "IGNORECASE"; private static final String PROP_LOCALE = "LOCALE"; private static final String PROP_MULTILINE = "MULTILINE"; - private static final String PROP_TEMPLATE = "TEMPLATE"; private static final String PROP_UNICODE = "UNICODE"; private static final String PROP_VERBOSE = "VERBOSE"; private static final TruffleReadOnlyKeysArray KEYS = new TruffleReadOnlyKeysArray( @@ -72,16 +71,15 @@ public final class PythonFlags extends AbstractConstantKeysObject { PROP_IGNORECASE, PROP_LOCALE, PROP_MULTILINE, - PROP_TEMPLATE, PROP_UNICODE, PROP_VERBOSE); private final int value; - private static final TBitSet ALL_FLAG_CHARS = TBitSet.valueOf('L', 'a', 'i', 'm', 's', 't', 'u', 'x'); + private static final TBitSet ALL_FLAG_CHARS = TBitSet.valueOf('L', 'a', 'i', 'm', 's', 'u', 'x'); private static final TBitSet TYPE_FLAG_CHARS = TBitSet.valueOf('L', 'a', 'u'); - private static final String FLAGS = "iLmsxatu"; + private static final String FLAGS = "iLmsxau"; private static final int FLAG_IGNORE_CASE = 1; private static final int FLAG_LOCALE = 1 << 1; @@ -89,16 +87,14 @@ public final class PythonFlags extends AbstractConstantKeysObject { private static final int FLAG_DOT_ALL = 1 << 3; private static final int FLAG_VERBOSE = 1 << 4; private static final int FLAG_ASCII = 1 << 5; - private static final int FLAG_TEMPLATE = 1 << 6; - private static final int FLAG_UNICODE = 1 << 7; + private static final int FLAG_UNICODE = 1 << 6; private static final int[] FLAG_LOOKUP = { FLAG_ASCII, 0, 0, 0, 0, 0, 0, 0, FLAG_IGNORE_CASE, 0, 0, FLAG_LOCALE, FLAG_MULTILINE, 0, 0, 0, - 0, 0, FLAG_DOT_ALL, FLAG_TEMPLATE, FLAG_UNICODE, 0, 0, FLAG_VERBOSE + 0, 0, FLAG_DOT_ALL, 0, FLAG_UNICODE, 0, 0, FLAG_VERBOSE }; private static final int TYPE_FLAGS = FLAG_LOCALE | FLAG_ASCII | FLAG_UNICODE; - private static final int 
GLOBAL_FLAGS = FLAG_TEMPLATE; public static final PythonFlags EMPTY_INSTANCE = new PythonFlags(""); public static final PythonFlags TYPE_FLAGS_INSTANCE = new PythonFlags(TYPE_FLAGS); @@ -151,10 +147,6 @@ public boolean isAscii() { return hasFlag(FLAG_ASCII); } - public boolean isTemplate() { - return hasFlag(FLAG_TEMPLATE); - } - public boolean isUnicodeExplicitlySet() { return hasFlag(FLAG_UNICODE); } @@ -229,10 +221,6 @@ public int numberOfTypeFlags() { return Integer.bitCount(value & TYPE_FLAGS); } - public boolean includesGlobalFlags() { - return (value & GLOBAL_FLAGS) != 0; - } - public boolean overlaps(PythonFlags otherFlags) { return (this.value & otherFlags.value) != 0; } @@ -281,7 +269,6 @@ public boolean isMemberReadableImpl(String symbol) { case PROP_IGNORECASE: case PROP_LOCALE: case PROP_MULTILINE: - case PROP_TEMPLATE: case PROP_UNICODE: case PROP_VERBOSE: return true; @@ -303,8 +290,6 @@ public Object readMemberImpl(String symbol) throws UnknownIdentifierException { return isLocale(); case PROP_MULTILINE: return isMultiLine(); - case PROP_TEMPLATE: - return isTemplate(); case PROP_UNICODE: return isUnicodeExplicitlySet(); case PROP_VERBOSE: diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlavor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlavor.java index d05026502bb..2b7b53940e7 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlavor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlavor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -67,7 +67,7 @@ public final class PythonFlavor extends RegexFlavor { private PythonFlavor() { super(BACKREFERENCES_TO_UNMATCHED_GROUPS_FAIL | NESTED_CAPTURE_GROUPS_KEPT_ON_LOOP_REENTRY | FAILING_EMPTY_CHECKS_DONT_BACKTRACK | USES_LAST_GROUP_RESULT_FIELD | - LOOKBEHINDS_RUN_LEFT_TO_RIGHT | NEEDS_GROUP_START_POSITIONS | HAS_CONDITIONAL_BACKREFERENCES | EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS); + LOOKBEHINDS_RUN_LEFT_TO_RIGHT | NEEDS_GROUP_START_POSITIONS | HAS_CONDITIONAL_BACKREFERENCES); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexLexer.java index 9c03959cdc2..086dee97dfe 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexLexer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -40,6 +40,8 @@ */ package com.oracle.truffle.regex.tregex.parser.flavors; +import static com.oracle.truffle.regex.tregex.parser.flavors.PythonFlavor.UNICODE; + import java.util.ArrayDeque; import java.util.Deque; import java.util.HashMap; @@ -51,6 +53,7 @@ import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.chardata.UnicodeCharacterAliases; import com.oracle.truffle.regex.charset.ClassSetContents; @@ -65,8 +68,6 @@ import com.oracle.truffle.regex.tregex.string.Encodings; import com.oracle.truffle.regex.util.TBitSet; -import static com.oracle.truffle.regex.tregex.parser.flavors.PythonFlavor.UNICODE; - public final class PythonRegexLexer extends RegexLexer { private static final CodePointSet ASCII_WHITESPACE = CodePointSet.createNoDedup(0x09, 0x0d, 0x20, 0x20); @@ -137,7 +138,15 @@ public final class PythonRegexLexer extends RegexLexer { // Similarly as for \S, we will not be able to produce a replacement string for \W. // We will need to construct the set ourselves. 
CodePointSet alpha = UNICODE.getProperty("General_Category=Letter"); - CodePointSet numericExtras = CodePointSet.createNoDedup(0xf96b, 0xf973, 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x2f890); + CodePointSet numericExtras = CodePointSet.createNoDedup( + 0xf96b, 0xf96b, + 0xf973, 0xf973, + 0xf978, 0xf978, + 0xf9b2, 0xf9b2, + 0xf9d1, 0xf9d1, + 0xf9d3, 0xf9d3, + 0xf9fd, 0xf9fd, + 0x2f890, 0x2f890); CodePointSet numeric = UNICODE.getProperty("General_Category=Number").union(numericExtras); CodePointSet wordChars = alpha.union(numeric).union(CodePointSet.create('_')); CodePointSet nonWordChars = wordChars.createInverse(Encodings.UTF_32); @@ -400,12 +409,12 @@ protected long boundedQuantifierMaxValue() { } private RegexSyntaxException handleBadCharacterInGroupName(ParseGroupNameResult result) { - return syntaxErrorAtRel(PyErrorMessages.badCharacterInGroupName(result.groupName), result.groupName.length() + 1); + return syntaxErrorAtRel(PyErrorMessages.badCharacterInGroupName(result.groupName), result.groupName.length() + 1, ErrorCode.InvalidNamedGroup); } @Override protected RegexSyntaxException handleBoundedQuantifierOutOfOrder() { - return syntaxErrorAtAbs(PyErrorMessages.MIN_REPEAT_GREATER_THAN_MAX_REPEAT, getLastTokenPosition() + 1); + return syntaxErrorAtAbs(PyErrorMessages.MIN_REPEAT_GREATER_THAN_MAX_REPEAT, getLastTokenPosition() + 1, ErrorCode.InvalidQuantifier); } @Override @@ -431,12 +440,12 @@ protected Token handleBoundedQuantifierOverflowMin(long min, long max) { @Override protected RegexSyntaxException handleCCRangeOutOfOrder(int rangeStart) { - return syntaxErrorAtAbs(PyErrorMessages.badCharacterRange(pattern.substring(rangeStart, position)), rangeStart); + return syntaxErrorAtAbs(PyErrorMessages.badCharacterRange(pattern.substring(rangeStart, position)), rangeStart, ErrorCode.InvalidCharacterClass); } @Override protected void handleCCRangeWithPredefCharClass(int rangeStart, ClassSetContents firstAtom, ClassSetContents secondAtom) { - throw 
syntaxErrorAtAbs(PyErrorMessages.badCharacterRange(pattern.substring(rangeStart, position)), rangeStart); + throw syntaxErrorAtAbs(PyErrorMessages.badCharacterRange(pattern.substring(rangeStart, position)), rangeStart, ErrorCode.InvalidCharacterClass); } @Override @@ -461,18 +470,18 @@ protected RegexSyntaxException handleComplementOfStringSet() { @Override protected void handleGroupRedefinition(String name, int newId, int oldId) { - throw syntaxErrorAtRel(PyErrorMessages.redefinitionOfGroupName(name, newId, oldId), name.length() + 1); + throw syntaxErrorAtRel(PyErrorMessages.redefinitionOfGroupName(name, newId, oldId), name.length() + 1, ErrorCode.InvalidNamedGroup); } @Override protected void handleIncompleteEscapeX() { - throw syntaxError(PyErrorMessages.incompleteEscape(substring(2 + count(RegexLexer::isHexDigit)))); + throw syntaxError(PyErrorMessages.incompleteEscape(substring(2 + count(RegexLexer::isHexDigit))), ErrorCode.InvalidEscape); } @Override protected Token handleInvalidBackReference(int reference) { String ref = Integer.toString(reference); - throw syntaxErrorAtRel(PyErrorMessages.invalidGroupReference(ref), ref.length()); + throw syntaxErrorAtRel(PyErrorMessages.invalidGroupReference(ref), ref.length(), ErrorCode.InvalidBackReference); } @Override @@ -483,7 +492,7 @@ protected RegexSyntaxException handleInvalidCharInCharClass() { @Override protected RegexSyntaxException handleInvalidGroupBeginQ() { retreat(); - return syntaxErrorAtAbs(PyErrorMessages.unknownExtensionQ(curChar()), getLastTokenPosition() + 1); + return syntaxErrorAtAbs(PyErrorMessages.unknownExtensionQ(curChar()), getLastTokenPosition() + 1, ErrorCode.InvalidGroup); } @Override @@ -498,7 +507,7 @@ protected RegexSyntaxException handleMissingClassSetOperand(ClassSetOperator ope @Override protected void handleOctalOutOfRange() { - throw syntaxError(PyErrorMessages.invalidOctalEscape(substring(4))); + throw syntaxError(PyErrorMessages.invalidOctalEscape(substring(4)), 
ErrorCode.InvalidEscape); } @Override @@ -508,17 +517,17 @@ protected RegexSyntaxException handleRangeAsClassSetOperand(ClassSetOperator ope @Override protected void handleUnfinishedEscape() { - throw syntaxError(PyErrorMessages.BAD_ESCAPE_END_OF_PATTERN); + throw syntaxError(PyErrorMessages.BAD_ESCAPE_END_OF_PATTERN, ErrorCode.InvalidEscape); } @Override protected void handleUnfinishedGroupComment() { - throw syntaxError(PyErrorMessages.UNTERMINATED_COMMENT); + throw syntaxError(PyErrorMessages.UNTERMINATED_COMMENT, ErrorCode.UnmatchedParenthesis); } @Override protected RegexSyntaxException handleUnfinishedGroupQ() { - return syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN); + return syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN, ErrorCode.UnmatchedParenthesis); } @Override @@ -533,7 +542,7 @@ protected void handleUnmatchedRightBrace() { @Override protected RegexSyntaxException handleUnmatchedLeftBracket() { - return syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_CHARACTER_SET, getLastCharacterClassBeginPosition()); + return syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_CHARACTER_SET, getLastCharacterClassBeginPosition(), ErrorCode.UnmatchedBracket); } @Override @@ -585,42 +594,42 @@ protected int parseCustomEscapeChar(char c, boolean inCharClass) { } int length = countUpTo(RegexLexer::isHexDigit, escapeLength); if (length != escapeLength) { - throw syntaxError(PyErrorMessages.incompleteEscape(substring(2 + length))); + throw syntaxError(PyErrorMessages.incompleteEscape(substring(2 + length)), ErrorCode.InvalidEscape); } advance(length); try { int codePoint = Integer.parseInt(pattern, position - length, position, 16); if (codePoint > 0x10FFFF) { - throw syntaxError(PyErrorMessages.invalidUnicodeEscape(substring(2 + length))); + throw syntaxError(PyErrorMessages.invalidUnicodeEscape(substring(2 + length)), ErrorCode.InvalidEscape); } return codePoint; } catch (NumberFormatException e) { - throw 
syntaxError(PyErrorMessages.incompleteEscape(substring(2 + length))); + throw syntaxError(PyErrorMessages.incompleteEscape(substring(2 + length)), ErrorCode.InvalidEscape); } } else { // \\u or \\U in 'bytes' patterns - throw syntaxError(PyErrorMessages.badEscape(c)); + throw syntaxError(PyErrorMessages.badEscape(c), ErrorCode.InvalidEscape); } case 'N': { if (mode != PythonREMode.Str) { - throw syntaxError(PyErrorMessages.badEscape(c)); + throw syntaxError(PyErrorMessages.badEscape(c), ErrorCode.InvalidEscape); } if (!consumingLookahead("{")) { - throw syntaxErrorHere(PyErrorMessages.missing("{")); + throw syntaxErrorHere(PyErrorMessages.missing("{"), ErrorCode.InvalidEscape); } int nameStart = position; int nameEnd = pattern.indexOf('}', position); if (atEnd() || nameEnd == position) { - throw syntaxErrorHere(PyErrorMessages.missing("character name")); + throw syntaxErrorHere(PyErrorMessages.missing("character name"), ErrorCode.InvalidEscape); } if (nameEnd < 0) { - throw syntaxErrorHere(PyErrorMessages.missingUnterminatedName('}')); + throw syntaxErrorHere(PyErrorMessages.missingUnterminatedName('}'), ErrorCode.InvalidEscape); } String characterName = pattern.substring(nameStart, nameEnd); position = nameEnd + 1; int codePoint = lookupCharacterByName(characterName); if (codePoint == -1) { - throw syntaxError(PyErrorMessages.undefinedCharacterName(characterName)); + throw syntaxError(PyErrorMessages.undefinedCharacterName(characterName), ErrorCode.InvalidEscape); } return codePoint; } @@ -632,7 +641,7 @@ protected int parseCustomEscapeChar(char c, boolean inCharClass) { @Override protected int parseCustomEscapeCharFallback(int c, boolean inCharClass) { if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { - throw syntaxError(PyErrorMessages.badEscape(c)); + throw syntaxError(PyErrorMessages.badEscape(c), ErrorCode.InvalidEscape); } return c; } @@ -649,9 +658,9 @@ protected Token parseCustomGroupBeginQ(char charAfterQuestionMark) { 
ParseGroupNameResult result = parseGroupName('>'); switch (result.state) { case empty: - throw syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME); + throw syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME, ErrorCode.InvalidNamedGroup); case unterminated: - throw syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_NAME_ANGLE_BRACKET, pos); + throw syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_NAME_ANGLE_BRACKET, pos, ErrorCode.InvalidNamedGroup); case invalidStart: case invalidRest: throw handleBadCharacterInGroupName(result); @@ -667,7 +676,7 @@ protected Token parseCustomGroupBeginQ(char charAfterQuestionMark) { return parseNamedBackReference(); } default: - throw syntaxErrorAtRel(PyErrorMessages.unknownExtensionP(ch2), 3); + throw syntaxErrorAtRel(PyErrorMessages.unknownExtensionP(ch2), 3, ErrorCode.InvalidGroup); } } case '>': @@ -681,7 +690,6 @@ protected Token parseCustomGroupBeginQ(char charAfterQuestionMark) { case 's': case 'x': case 'a': - case 't': case 'u': return parseInlineFlags(charAfterQuestionMark); default: @@ -692,9 +700,9 @@ protected Token parseCustomGroupBeginQ(char charAfterQuestionMark) { @Override protected Token parseGroupLt() { if (atEnd()) { - throw syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN); + throw syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN, ErrorCode.InvalidGroup); } - throw syntaxErrorAtAbs(PyErrorMessages.unknownExtensionLt(curChar()), getLastTokenPosition() + 1); + throw syntaxErrorAtAbs(PyErrorMessages.unknownExtensionLt(curChar()), getLastTokenPosition() + 1, ErrorCode.InvalidGroup); } /** @@ -706,9 +714,9 @@ private Token parseConditionalBackReference() { ParseGroupNameResult result = parseGroupName(')'); switch (result.state) { case empty: - throw syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME); + throw syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME, ErrorCode.InvalidNamedGroup); case unterminated: - throw syntaxErrorAtRel(PyErrorMessages.UNTERMINATED_NAME, result.groupName.length()); + throw 
syntaxErrorAtRel(PyErrorMessages.UNTERMINATED_NAME, result.groupName.length(), ErrorCode.InvalidNamedGroup); case invalidStart: case invalidRest: position -= result.groupName.length() + 1; @@ -723,9 +731,9 @@ private Token parseConditionalBackReference() { assert curChar() == ')'; advance(); if (groupNumber == 0) { - throw syntaxErrorAtRel(PyErrorMessages.BAD_GROUP_NUMBER, result.groupName.length() + 1); + throw syntaxErrorAtRel(PyErrorMessages.BAD_GROUP_NUMBER, result.groupName.length() + 1, ErrorCode.InvalidBackReference); } else if (groupNumber == -1) { - throw syntaxErrorAtRel(PyErrorMessages.invalidGroupReference(result.groupName), result.groupName.length() + 1); + throw syntaxErrorAtRel(PyErrorMessages.invalidGroupReference(result.groupName), result.groupName.length() + 1, ErrorCode.InvalidBackReference); } break; case valid: @@ -734,7 +742,7 @@ private Token parseConditionalBackReference() { groupNumber = getSingleNamedGroupNumber(result.groupName); namedReference = true; } else { - throw syntaxErrorAtRel(PyErrorMessages.unknownGroupName(result.groupName, mode), result.groupName.length() + 1); + throw syntaxErrorAtRel(PyErrorMessages.unknownGroupName(result.groupName, mode), result.groupName.length() + 1, ErrorCode.InvalidBackReference); } break; default: @@ -758,52 +766,43 @@ private Token parseInlineFlags(int ch0) { case ')': return Token.createInlineFlags(positiveFlags, true); case ':': - if (positiveFlags.includesGlobalFlags()) { - throw syntaxErrorAtRel(PyErrorMessages.INLINE_FLAGS_CANNOT_TURN_ON_GLOBAL_FLAG, 1); - } return parseLocalFlags(positiveFlags, PythonFlags.EMPTY_INSTANCE); case '-': - if (positiveFlags.includesGlobalFlags()) { - throw syntaxErrorAtRel(PyErrorMessages.INLINE_FLAGS_CANNOT_TURN_ON_GLOBAL_FLAG, 1); - } if (atEnd()) { - throw syntaxErrorHere(PyErrorMessages.MISSING_FLAG); + throw syntaxErrorHere(PyErrorMessages.MISSING_FLAG, ErrorCode.InvalidInlineFlag); } ch = consumeChar(); if (!PythonFlags.isValidFlagChar(ch)) { if 
(Character.isAlphabetic(ch)) { - throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1); + throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1, ErrorCode.InvalidInlineFlag); } else { - throw syntaxErrorAtRel(PyErrorMessages.MISSING_FLAG, 1); + throw syntaxErrorAtRel(PyErrorMessages.MISSING_FLAG, 1, ErrorCode.InvalidInlineFlag); } } PythonFlags negativeFlags = PythonFlags.EMPTY_INSTANCE; while (PythonFlags.isValidFlagChar(ch)) { negativeFlags = negativeFlags.addFlag(ch); if (PythonFlags.isTypeFlagChar(ch)) { - throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_TURN_OFF_FLAGS_A_U_AND_L); + throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_TURN_OFF_FLAGS_A_U_AND_L, ErrorCode.InvalidInlineFlag); } if (atEnd()) { - throw syntaxErrorHere(PyErrorMessages.MISSING_COLON); + throw syntaxErrorHere(PyErrorMessages.MISSING_COLON, ErrorCode.InvalidInlineFlag); } ch = consumeChar(); } if (ch != ':') { if (Character.isAlphabetic(ch)) { - throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1); + throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1, ErrorCode.InvalidInlineFlag); } else { - throw syntaxErrorAtRel(PyErrorMessages.MISSING_COLON, 1); + throw syntaxErrorAtRel(PyErrorMessages.MISSING_COLON, 1, ErrorCode.InvalidInlineFlag); } } - if (negativeFlags.includesGlobalFlags()) { - throw syntaxErrorAtRel(PyErrorMessages.INLINE_FLAGS_CANNOT_TURN_OFF_GLOBAL_FLAG, 1); - } return parseLocalFlags(positiveFlags, negativeFlags); default: if (Character.isAlphabetic(ch)) { - throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1); + throw syntaxErrorAtRel(PyErrorMessages.UNKNOWN_FLAG, 1, ErrorCode.InvalidInlineFlag); } else { - throw syntaxErrorAtRel(PyErrorMessages.MISSING_DASH_COLON_PAREN, 1); + throw syntaxErrorAtRel(PyErrorMessages.MISSING_DASH_COLON_PAREN, 1, ErrorCode.InvalidInlineFlag); } } } @@ -811,16 +810,16 @@ private Token parseInlineFlags(int ch0) { private PythonFlags addFlag(PythonFlags flagsArg, int ch) { PythonFlags flags = flagsArg.addFlag(ch); if 
(mode == PythonREMode.Str && ch == 'L') { - throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_USE_L_FLAG_WITH_A_STR_PATTERN); + throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_USE_L_FLAG_WITH_A_STR_PATTERN, ErrorCode.InvalidInlineFlag); } if (mode == PythonREMode.Bytes && ch == 'u') { - throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_USE_U_FLAG_WITH_A_BYTES_PATTERN); + throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_CANNOT_USE_U_FLAG_WITH_A_BYTES_PATTERN, ErrorCode.InvalidInlineFlag); } if (flags.numberOfTypeFlags() > 1) { - throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_FLAGS_A_U_AND_L_ARE_INCOMPATIBLE); + throw syntaxErrorHere(PyErrorMessages.INLINE_FLAGS_FLAGS_A_U_AND_L_ARE_INCOMPATIBLE, ErrorCode.InvalidInlineFlag); } if (atEnd()) { - throw syntaxErrorHere(PyErrorMessages.MISSING_DASH_COLON_PAREN); + throw syntaxErrorHere(PyErrorMessages.MISSING_DASH_COLON_PAREN, ErrorCode.InvalidInlineFlag); } return flags; } @@ -834,7 +833,7 @@ private PythonFlags addFlag(PythonFlags flagsArg, int ch) { */ private Token parseLocalFlags(PythonFlags positiveFlags, PythonFlags negativeFlags) { if (positiveFlags.overlaps(negativeFlags)) { - throw syntaxErrorAtRel(PyErrorMessages.INLINE_FLAGS_FLAG_TURNED_ON_AND_OFF, 1); + throw syntaxErrorAtRel(PyErrorMessages.INLINE_FLAGS_FLAG_TURNED_ON_AND_OFF, 1, ErrorCode.InvalidInlineFlag); } PythonFlags newFlags = getLocalFlags().addFlags(positiveFlags).delFlags(negativeFlags); if (positiveFlags.numberOfTypeFlags() > 0) { @@ -846,7 +845,7 @@ private Token parseLocalFlags(PythonFlags positiveFlags, PythonFlags negativeFla private void mustHaveMore() { if (atEnd()) { - throw syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN); + throw syntaxErrorHere(PyErrorMessages.UNEXPECTED_END_OF_PATTERN, ErrorCode.InvalidGroup); } } @@ -857,9 +856,9 @@ private Token parseNamedBackReference() { ParseGroupNameResult result = parseGroupName(')'); switch (result.state) { case empty: - throw 
syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME); + throw syntaxErrorHere(PyErrorMessages.MISSING_GROUP_NAME, ErrorCode.InvalidBackReference); case unterminated: - throw syntaxErrorAtRel(PyErrorMessages.UNTERMINATED_NAME, result.groupName.length()); + throw syntaxErrorAtRel(PyErrorMessages.UNTERMINATED_NAME, result.groupName.length(), ErrorCode.InvalidBackReference); case invalidStart: case invalidRest: throw handleBadCharacterInGroupName(result); @@ -868,7 +867,7 @@ private Token parseNamedBackReference() { assert namedCaptureGroups.get(result.groupName).size() == 1; return Token.createBackReference(namedCaptureGroups.get(result.groupName).get(0), true); } else { - throw syntaxErrorAtRel(PyErrorMessages.unknownGroupName(result.groupName, mode), result.groupName.length() + 1); + throw syntaxErrorAtRel(PyErrorMessages.unknownGroupName(result.groupName, mode), result.groupName.length() + 1, ErrorCode.InvalidBackReference); } default: throw CompilerDirectives.shouldNotReachHere(); @@ -879,16 +878,16 @@ private String substring(int length) { return pattern.substring(getLastAtomPosition(), getLastAtomPosition() + length); } - public RegexSyntaxException syntaxErrorAtAbs(String msg, int i) { - return RegexSyntaxException.createPattern(source, msg, i); + public RegexSyntaxException syntaxErrorAtAbs(String msg, int i, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, i, errorCode); } - private RegexSyntaxException syntaxErrorAtRel(String msg, int i) { - return RegexSyntaxException.createPattern(source, msg, position - i); + private RegexSyntaxException syntaxErrorAtRel(String msg, int i, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, msg, position - i, errorCode); } - public RegexSyntaxException syntaxErrorHere(String msg) { - return RegexSyntaxException.createPattern(source, msg, position); + public RegexSyntaxException syntaxErrorHere(String msg, ErrorCode errorCode) { + return 
RegexSyntaxException.createPattern(source, msg, position, errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexParser.java index 9bb36b62723..a750575a6df 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexParser.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonRegexParser.java @@ -50,6 +50,8 @@ import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; +import com.oracle.truffle.regex.charset.ClassSetContents; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.CodePointSetAccumulator; import com.oracle.truffle.regex.charset.Constants; @@ -75,6 +77,7 @@ public final class PythonRegexParser implements RegexParser { private final PythonRegexLexer lexer; private final RegexASTBuilder astBuilder; private final CodePointSetAccumulator curCharClass = new CodePointSetAccumulator(); + private final CodePointSetAccumulator curCharClassCaseClosure = new CodePointSetAccumulator(); public PythonRegexParser(RegexLanguage language, RegexSource source, CompilationBuffer compilationBuffer) throws RegexSyntaxException { this.mode = PythonREMode.fromEncoding(source.getEncoding()); @@ -194,16 +197,16 @@ public RegexAST parse() throws RegexSyntaxException { break; case quantifier: if (prevKind == Token.Kind.quantifier) { - throw syntaxError(PyErrorMessages.MULTIPLE_REPEAT); + throw syntaxError(PyErrorMessages.MULTIPLE_REPEAT, ErrorCode.InvalidQuantifier); } if (astBuilder.getCurTerm() == null || !QUANTIFIER_PREV.contains(prevKind)) { - throw syntaxError(PyErrorMessages.NOTHING_TO_REPEAT); + throw 
syntaxError(PyErrorMessages.NOTHING_TO_REPEAT, ErrorCode.InvalidQuantifier); } astBuilder.addQuantifier((Token.Quantifier) token); break; case alternation: if (astBuilder.getCurGroup().isConditionalBackReferenceGroup() && astBuilder.getCurGroup().getAlternatives().size() == 2) { - throw syntaxError(PyErrorMessages.CONDITIONAL_BACKREF_WITH_MORE_THAN_TWO_BRANCHES); + throw syntaxError(PyErrorMessages.CONDITIONAL_BACKREF_WITH_MORE_THAN_TWO_BRANCHES, ErrorCode.InvalidBackReference); } astBuilder.nextSequence(); break; @@ -224,7 +227,7 @@ public RegexAST parse() throws RegexSyntaxException { break; case groupEnd: if (astBuilder.getCurGroup().getParent() instanceof RegexASTRootNode) { - throw syntaxError(PyErrorMessages.UNBALANCED_PARENTHESIS); + throw syntaxError(PyErrorMessages.UNBALANCED_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } if (astBuilder.getCurGroup().isLocalFlags()) { lexer.popLocalFlags(); @@ -243,14 +246,22 @@ public RegexAST parse() throws RegexSyntaxException { break; case charClassBegin: curCharClass.clear(); + curCharClassCaseClosure.clear(); break; case charClassAtom: - curCharClass.addSet(((Token.CharacterClassAtom) token).getContents().getCodePointSet()); + ClassSetContents contents = ((Token.CharacterClassAtom) token).getContents(); + if (lexer.featureEnabledIgnoreCase() && !contents.isCharacterClass()) { + curCharClassCaseClosure.addSet(contents.getCodePointSet()); + } else { + curCharClass.addSet(contents.getCodePointSet()); + } break; case charClassEnd: - boolean wasSingleChar = !lexer.isCurCharClassInverted() && curCharClass.matchesSingleChar(); + boolean wasSingleChar = !lexer.isCurCharClassInverted() && + (curCharClass.matchesSingleChar() && curCharClassCaseClosure.isEmpty() || curCharClass.isEmpty() && curCharClassCaseClosure.matchesSingleChar()); if (lexer.featureEnabledIgnoreCase()) { - lexer.caseFoldUnfold(curCharClass); + lexer.caseFoldUnfold(curCharClassCaseClosure); + curCharClass.addSet(curCharClassCaseClosure.get()); } 
CodePointSet cps = curCharClass.toCodePointSet(); astBuilder.addCharClass(lexer.isCurCharClassInverted() ? cps.createInverse(lexer.source.getEncoding()) : cps, wasSingleChar); @@ -266,7 +277,7 @@ public RegexAST parse() throws RegexSyntaxException { if (inlineFlags.isGlobal()) { boolean first = prev == null || (prevKind == Token.Kind.inlineFlags && ((Token.InlineFlags) prev).isGlobal()); if (!first) { - throw syntaxErrorAtAbs(PyErrorMessages.GLOBAL_FLAGS_NOT_AT_START, inlineFlags.getPosition()); + throw syntaxErrorAtAbs(PyErrorMessages.GLOBAL_FLAGS_NOT_AT_START, inlineFlags.getPosition(), ErrorCode.InvalidInlineFlag); } lexer.addGlobalFlags((PythonFlags) inlineFlags.getFlags()); } else { @@ -282,13 +293,14 @@ public RegexAST parse() throws RegexSyntaxException { astBuilder.addDollar(); } if (!astBuilder.curGroupIsRoot()) { - throw syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_SUBPATTERN, astBuilder.getCurGroupStartPosition()); + throw syntaxErrorAtAbs(PyErrorMessages.UNTERMINATED_SUBPATTERN, astBuilder.getCurGroupStartPosition(), ErrorCode.UnmatchedParenthesis); } RegexAST ast = astBuilder.popRootGroup(); for (Token.BackReference conditionalBackReference : conditionalBackReferences) { assert conditionalBackReference.getGroupNumbers().length == 1; if (conditionalBackReference.getGroupNumbers()[0] >= ast.getNumberOfCaptureGroups()) { - throw syntaxErrorAtAbs(PyErrorMessages.invalidGroupReference(Integer.toString(conditionalBackReference.getGroupNumbers()[0])), conditionalBackReference.getPosition() + 3); + throw syntaxErrorAtAbs(PyErrorMessages.invalidGroupReference(Integer.toString(conditionalBackReference.getGroupNumbers()[0])), conditionalBackReference.getPosition() + 3, + ErrorCode.InvalidBackReference); } } lexer.fixFlags(); @@ -323,7 +335,7 @@ private void verifyGroupReference(Token.BackReference backRefToken) throws Regex // references but also when a forward reference is made. 
if (conditional && insideLookBehind) { if (groupNumber >= lexer.numberOfCaptureGroupsSoFar()) { - throw syntaxErrorHere(PyErrorMessages.CANNOT_REFER_TO_AN_OPEN_GROUP); + throw syntaxErrorHere(PyErrorMessages.CANNOT_REFER_TO_AN_OPEN_GROUP, ErrorCode.InvalidBackReference); } } if (!conditional || insideLookBehind) { @@ -331,7 +343,7 @@ private void verifyGroupReference(Token.BackReference backRefToken) throws Regex while (parent != null) { if (parent instanceof Group && ((Group) parent).getGroupNumber() == groupNumber) { int errorPosition = backRefToken.isNamedReference() ? backRefToken.getPosition() + 4 : backRefToken.getPosition(); - throw syntaxErrorAtAbs(PyErrorMessages.CANNOT_REFER_TO_AN_OPEN_GROUP, errorPosition); + throw syntaxErrorAtAbs(PyErrorMessages.CANNOT_REFER_TO_AN_OPEN_GROUP, errorPosition, ErrorCode.InvalidBackReference); } parent = parent.getParent(); } @@ -346,8 +358,8 @@ private void verifyGroupReference(Token.BackReference backRefToken) throws Regex // other error that appears later in the expression. In such cases, we would not be // compatible with CPython error messages. 
while (parent != null) { - if (parent instanceof LookBehindAssertion && ((LookBehindAssertion) parent).getGroup().getEnclosedCaptureGroupsLow() <= groupNumber) { - throw syntaxErrorHere(PyErrorMessages.CANNOT_REFER_TO_GROUP_DEFINED_IN_THE_SAME_LOOKBEHIND_SUBPATTERN); + if (parent instanceof LookBehindAssertion && ((LookBehindAssertion) parent).getGroup().getEnclosedCaptureGroupsLo() <= groupNumber) { + throw syntaxErrorHere(PyErrorMessages.CANNOT_REFER_TO_GROUP_DEFINED_IN_THE_SAME_LOOKBEHIND_SUBPATTERN, ErrorCode.InvalidBackReference); } parent = parent.getSubTreeParent(); } @@ -366,15 +378,15 @@ private boolean insideLookBehind() { return insideLookBehind; } - private RegexSyntaxException syntaxError(String msg) { - return lexer.syntaxError(msg); + private RegexSyntaxException syntaxError(String msg, ErrorCode errorCode) { + return lexer.syntaxError(msg, errorCode); } - private RegexSyntaxException syntaxErrorHere(String msg) { - return lexer.syntaxErrorHere(msg); + private RegexSyntaxException syntaxErrorHere(String msg, ErrorCode errorCode) { + return lexer.syntaxErrorHere(msg, errorCode); } - private RegexSyntaxException syntaxErrorAtAbs(String msg, int i) { - return lexer.syntaxErrorAtAbs(msg, i); + private RegexSyntaxException syntaxErrorAtAbs(String msg, int i, ErrorCode errorCode) { + return lexer.syntaxErrorAtAbs(msg, i, errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java index 080bfc3a55e..9ac057e286c 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -228,7 +228,7 @@ public final class RubyFlavor extends RegexFlavor { private RubyFlavor() { super(BACKREFERENCES_TO_UNMATCHED_GROUPS_FAIL | EMPTY_CHECKS_MONITOR_CAPTURE_GROUPS | NESTED_CAPTURE_GROUPS_KEPT_ON_LOOP_REENTRY | FAILING_EMPTY_CHECKS_DONT_BACKTRACK | - HAS_CONDITIONAL_BACKREFERENCES | EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS); + HAS_CONDITIONAL_BACKREFERENCES | EMPTY_CHECKS_ON_MANDATORY_LOOP_ITERATIONS | LOOKBEHINDS_RUN_LEFT_TO_RIGHT); } @Override diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyRegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyRegexParser.java index af3a7df4066..2258a58f8c4 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyRegexParser.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyRegexParser.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -55,6 +55,7 @@ import java.util.Optional; import java.util.function.IntPredicate; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import org.graalvm.collections.Pair; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; @@ -98,9 +99,6 @@ public final class RubyRegexParser implements RegexValidator, RegexParser { // This is the same as above but restricted to ASCII. private static final Map ASCII_POSIX_CHAR_CLASSES; - // The [\n\r] CodePointSet. 
- private static final CodePointSet NEWLINE_RETURN = CodePointSet.create('\n', '\n', '\r', '\r'); - // CodePointSet constants used for expanding the \R escape sequence. private static final CodePointSet UNICODE_LINE_BREAKS = CodePointSet.create(0x0a, 0x0c, 0x85, 0x85, 0x2028, 0x2029); private static final CodePointSet ASCII_LINE_BREAKS = CodePointSet.create(0x0a, 0x0c); @@ -477,7 +475,7 @@ private String getUpTo(int count, IntPredicate pred) { private void advance() { if (atEnd()) { - throw syntaxErrorAtEnd(RbErrorMessages.UNEXPECTED_END_OF_PATTERN); + throw syntaxErrorAtEnd(RbErrorMessages.UNEXPECTED_END_OF_PATTERN, ErrorCode.UnfinishedSequence); } advance(1); } @@ -499,10 +497,9 @@ private boolean match(String next) { } } - private void mustMatch(String next) { - assert "}".equals(next) || ")".equals(next); + private void mustMatch(String next, String errorMsg, ErrorCode errorCode) { if (!match(next)) { - throw syntaxErrorHere("}".equals(next) ? RbErrorMessages.EXPECTED_BRACE : RbErrorMessages.EXPECTED_PAREN); + throw syntaxErrorHere(errorMsg, errorCode); } } @@ -618,16 +615,16 @@ private void addDeadNode() { // Error reporting - private RegexSyntaxException syntaxErrorAtEnd(String message) { - return RegexSyntaxException.createPattern(inSource, message, inPattern.length() - 1); + private RegexSyntaxException syntaxErrorAtEnd(String message, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(inSource, message, inPattern.length() - 1, errorCode); } - private RegexSyntaxException syntaxErrorHere(String message) { - return RegexSyntaxException.createPattern(inSource, message, position); + private RegexSyntaxException syntaxErrorHere(String message, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(inSource, message, position, errorCode); } - private RegexSyntaxException syntaxErrorAt(String message, int pos) { - return RegexSyntaxException.createPattern(inSource, message, pos); + private RegexSyntaxException syntaxErrorAt(String 
message, int pos, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(inSource, message, pos, errorCode); } // First pass - identifying capture groups @@ -742,7 +739,7 @@ private void run() { if (!atEnd()) { assert curChar() == ')'; - throw syntaxErrorHere(RbErrorMessages.UNBALANCED_PARENTHESIS); + throw syntaxErrorHere(RbErrorMessages.UNBALANCED_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } } @@ -778,6 +775,7 @@ private void disjunction(boolean toplevel) { } private void disjunction() { + canHaveQuantifier = false; disjunction(false); } @@ -1151,17 +1149,19 @@ private boolean assertionEscape() { addCaret(); return true; case 'Z': - // (?:$|(?=[\r\n]$)) + notAllowedInLookbehind(restorePosition); + // (?:$|(?=[\n]$)) pushGroup(); // (?: addDollar(); // $ nextSequence(); // | pushLookAheadAssertion(false); // (?= - addCharClass(NEWLINE_RETURN); // [\r\n] + addCharClass(CodePointSet.create('\n')); // [\n] addDollar(); // $ popGroup(); // ) popGroup(); // ) return true; case 'z': + notAllowedInLookbehind(restorePosition); addDollar(); return true; case 'G': @@ -1319,14 +1319,12 @@ private boolean backreference() { return false; } if (containsNamedCaptureGroups()) { - throw syntaxErrorAt(RbErrorMessages.NUMBERED_BACKREF_CALL_IS_NOT_ALLOWED, restorePosition); + throw syntaxErrorAt(RbErrorMessages.NUMBERED_BACKREF_CALL_IS_NOT_ALLOWED, restorePosition, ErrorCode.InvalidBackReference); } if (groupNumber > numberOfCaptureGroups()) { - throw syntaxErrorAt(RbErrorMessages.invalidGroupReference(number), restorePosition); - } - if (lookbehindDepth > 0) { - throw syntaxErrorAt(RbErrorMessages.INVALID_PATTERN_IN_LOOK_BEHIND, restorePosition); + throw syntaxErrorAt(RbErrorMessages.invalidGroupReference(number), restorePosition, ErrorCode.InvalidBackReference); } + notAllowedInLookbehind(restorePosition); if (groupNumber > groupIndex && groupNumber >= 10) { // forward references >= 10 are interpreted as octal escapes instead position = restorePosition; @@ -1339,6 
+1337,12 @@ private boolean backreference() { } } + private void notAllowedInLookbehind(int errorPosition) { + if (lookbehindDepth > 0) { + throw syntaxErrorAt(RbErrorMessages.INVALID_PATTERN_IN_LOOK_BEHIND, errorPosition, ErrorCode.InvalidLookbehind); + } + } + /** * Tries to parse a named backreference (e.g. {@code \k}). * @@ -1363,7 +1367,7 @@ private List parseGroupReference(char terminator, boolean allowNumeric, int beginPos = position; if (curChar() == '-' || RegexLexer.isDecimalDigit(curChar())) { if (!allowNumeric) { - throw syntaxErrorHere(RbErrorMessages.INVALID_GROUP_NAME); + throw syntaxErrorHere(RbErrorMessages.INVALID_GROUP_NAME, ErrorCode.InvalidBackReference); } int sign = match("-") ? -1 : 1; groupName = getMany(RegexLexer::isDecimalDigit); @@ -1371,24 +1375,24 @@ private List parseGroupReference(char terminator, boolean allowNumeric, try { groupNumber = sign * Integer.parseInt(groupName); } catch (NumberFormatException e) { - throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos, ErrorCode.InvalidBackReference); } if (groupNumber < 0) { groupNumber = numberOfCaptureGroups() + 1 + groupNumber; } if (containsNamedCaptureGroups()) { - throw syntaxErrorAt(RbErrorMessages.NUMBERED_BACKREF_CALL_IS_NOT_ALLOWED, beginPos); + throw syntaxErrorAt(RbErrorMessages.NUMBERED_BACKREF_CALL_IS_NOT_ALLOWED, beginPos, ErrorCode.InvalidBackReference); } if (resolveReference) { if (groupNumber < 0 || groupNumber > numberOfCaptureGroups()) { - throw syntaxErrorAt(RbErrorMessages.invalidGroupReference(groupName), beginPos); + throw syntaxErrorAt(RbErrorMessages.invalidGroupReference(groupName), beginPos, ErrorCode.InvalidBackReference); } groupNumbers = new ArrayList<>(1); groupNumbers.add(groupNumber); } } else { if (!allowNamed) { - throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos, 
ErrorCode.InvalidBackReference); } groupName = getMany(c -> { if (allowLevels) { @@ -1398,11 +1402,11 @@ private List parseGroupReference(char terminator, boolean allowNumeric, } }); if (groupName.isEmpty()) { - throw syntaxErrorAt(RbErrorMessages.MISSING_GROUP_NAME, beginPos); + throw syntaxErrorAt(RbErrorMessages.MISSING_GROUP_NAME, beginPos, ErrorCode.InvalidBackReference); } if (resolveReference) { if (namedCaptureGroups == null || !namedCaptureGroups.containsKey(groupName)) { - throw syntaxErrorAt(RbErrorMessages.unknownGroupName(groupName), beginPos); + throw syntaxErrorAt(RbErrorMessages.unknownGroupName(groupName), beginPos, ErrorCode.InvalidBackReference); } groupNumbers = namedCaptureGroups.get(groupName); } @@ -1411,22 +1415,20 @@ private List parseGroupReference(char terminator, boolean allowNumeric, advance(); // consume sign String level = getMany(RegexLexer::isDecimalDigit); if (level.isEmpty()) { - throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos, ErrorCode.InvalidBackReference); } bailOut("backreferences to other levels are not supported"); } if (!match(Character.toString(terminator))) { - throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos); - } - if (lookbehindDepth > 0) { - throw syntaxErrorAt(RbErrorMessages.INVALID_PATTERN_IN_LOOK_BEHIND, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_GROUP_NAME, beginPos, ErrorCode.InvalidBackReference); } + notAllowedInLookbehind(beginPos); return groupNumbers; } private void buildNamedBackreference(Integer[] groupNumbers, String name) { if (groupNumbers.length == 0) { - throw syntaxErrorHere(RbErrorMessages.undefinedReference(name)); + throw syntaxErrorHere(RbErrorMessages.undefinedReference(name), ErrorCode.InvalidBackReference); } else if (groupNumbers.length == 1) { buildBackreference(groupNumbers[0], true); } else { @@ -1534,7 +1536,7 @@ private boolean subexpressionCall() { List targetGroups = 
parseGroupReference('>', true, true, false, true); int nameEnd = position - 1; if (targetGroups.size() > 1) { - throw syntaxErrorHere(RbErrorMessages.multiplexCall(inPattern.substring(nameStart, nameEnd))); + throw syntaxErrorHere(RbErrorMessages.multiplexCall(inPattern.substring(nameStart, nameEnd)), ErrorCode.InvalidSubexpressionCall); } addSubexpressionCall(targetGroups.get(0)); hasSubexpressionCalls = true; @@ -1560,11 +1562,11 @@ private boolean stringEscape() { try { int codePoint = Integer.parseInt(code, 16); if (codePoint > 0x10FFFF) { - throw syntaxErrorAt(RbErrorMessages.invalidUnicodeEscape(code), beginPos); + throw syntaxErrorAt(RbErrorMessages.invalidUnicodeEscape(code), beginPos, ErrorCode.InvalidEscape); } buildChar(codePoint); } catch (NumberFormatException e) { - throw syntaxErrorAt(RbErrorMessages.badEscape(code), beginPos); + throw syntaxErrorAt(RbErrorMessages.badEscape(code), beginPos, ErrorCode.InvalidEscape); } getMany(WHITESPACE::get); } @@ -1614,10 +1616,10 @@ private int fetchEscapedChar() { case 'c': case 'C': { if (atEnd()) { - throw syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_CONTROL, beginPos); + throw syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_CONTROL, beginPos, ErrorCode.InvalidEscape); } if (ch == 'C' && !match("-")) { - throw syntaxErrorAt(RbErrorMessages.INVALID_CONTROL_CODE_SYNTAX, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_CONTROL_CODE_SYNTAX, beginPos, ErrorCode.InvalidEscape); } int c = consumeChar(); if (c == '?') { @@ -1630,13 +1632,13 @@ private int fetchEscapedChar() { } case 'M': { if (atEnd()) { - throw syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_META, beginPos); + throw syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_META, beginPos, ErrorCode.InvalidEscape); } if (!match("-")) { - throw syntaxErrorAt(RbErrorMessages.INVALID_META_CODE_SYNTAX, beginPos); + throw syntaxErrorAt(RbErrorMessages.INVALID_META_CODE_SYNTAX, beginPos, ErrorCode.InvalidEscape); } if (atEnd()) { - throw 
syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_META, beginPos); + throw syntaxErrorAt(RbErrorMessages.END_PATTERN_AT_META, beginPos, ErrorCode.InvalidEscape); } int c = consumeChar(); if (c == '\\') { @@ -1682,21 +1684,21 @@ private Optional characterEscape() { String code; if (match("{")) { code = getMany(RegexLexer::isHexDigit); - mustMatch("}"); + mustMatch("}", RbErrorMessages.EXPECTED_BRACE, ErrorCode.InvalidEscape); } else { code = getUpTo(4, RegexLexer::isHexDigit); if (code.length() < 4) { - throw syntaxErrorAt(RbErrorMessages.incompleteEscape(code), beginPos); + throw syntaxErrorAt(RbErrorMessages.incompleteEscape(code), beginPos, ErrorCode.InvalidEscape); } } try { int codePoint = Integer.parseInt(code, 16); if (codePoint > 0x10FFFF) { - throw syntaxErrorAt(RbErrorMessages.invalidUnicodeEscape(code), beginPos); + throw syntaxErrorAt(RbErrorMessages.invalidUnicodeEscape(code), beginPos, ErrorCode.InvalidEscape); } return Optional.of(codePoint); } catch (NumberFormatException e) { - throw syntaxErrorAt(RbErrorMessages.badEscape(code), beginPos); + throw syntaxErrorAt(RbErrorMessages.badEscape(code), beginPos, ErrorCode.InvalidEscape); } } case '0': @@ -1707,10 +1709,10 @@ private Optional characterEscape() { case '5': case '6': case '7': { - String code = getUpTo(3, c -> RegexLexer.isOctalDigit(c)); + String code = getUpTo(3, RegexLexer::isOctalDigit); int codePoint = Integer.parseInt(code, 8); if (codePoint > 0xFF) { - throw syntaxErrorAt(RbErrorMessages.TOO_BIG_NUMBER, beginPos); + throw syntaxErrorAt(RbErrorMessages.TOO_BIG_NUMBER, beginPos, ErrorCode.InvalidEscape); } return Optional.of(codePoint); } @@ -1772,7 +1774,7 @@ private void collectCharClass() { int firstPosInside = position; classBody: while (true) { if (atEnd()) { - throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_CHARACTER_SET, beginPos); + throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_CHARACTER_SET, beginPos, ErrorCode.InvalidCharacterClass); } int rangeStart = position; Optional 
lowerBound; @@ -1811,7 +1813,7 @@ private void collectCharClass() { // a hyphen following a nested char class is never interpreted as a range operator if (!wasNestedCharClass && match("-")) { if (atEnd()) { - throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_CHARACTER_SET, beginPos); + throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_CHARACTER_SET, beginPos, ErrorCode.InvalidCharacterClass); } Optional upperBound; ch = consumeChar(); @@ -1852,7 +1854,7 @@ private void collectCharClass() { // both the left operand and the range operator if (!wasNestedCharClass) { if (!lowerBound.isPresent() || !upperBound.isPresent() || upperBound.get() < lowerBound.get()) { - throw syntaxErrorAt(RbErrorMessages.badCharacterRange(inPattern.substring(rangeStart, position)), rangeStart); + throw syntaxErrorAt(RbErrorMessages.badCharacterRange(inPattern.substring(rangeStart, position)), rangeStart, ErrorCode.InvalidCharacterClass); } curCharClassAddRange(lowerBound.get(), upperBound.get()); } @@ -1954,7 +1956,7 @@ private PosixClassParseResult collectPosixCharClass() { } if (match(":]")) { if (!UNICODE_POSIX_CHAR_CLASSES.containsKey(className)) { - throw syntaxErrorAt(RbErrorMessages.INVALID_POSIX_BRACKET_TYPE, restorePosition); + throw syntaxErrorAt(RbErrorMessages.INVALID_POSIX_BRACKET_TYPE, restorePosition, ErrorCode.InvalidCharacterClass); } CodePointSet charSet; if (getLocalFlags().isAscii()) { @@ -2015,7 +2017,7 @@ private void quantifier(int ch) { if (canHaveQuantifier) { addQuantifier(Token.createQuantifier(quantifier.lower, quantifier.upper, quantifier.greedy, quantifier.possessive, ch != '{')); } else { - throw syntaxErrorAt(RbErrorMessages.NOTHING_TO_REPEAT, start); + throw syntaxErrorAt(RbErrorMessages.NOTHING_TO_REPEAT, start, ErrorCode.InvalidQuantifier); } } else { string(consumeChar()); @@ -2053,7 +2055,7 @@ private Quantifier parseQuantifier(int ch) { return null; } if (lowerBound.isPresent() && upperBound.isPresent() && lowerBound.get().compareTo(upperBound.get()) > 0) 
{ - throw syntaxErrorAt(RbErrorMessages.MIN_REPEAT_GREATER_THAN_MAX_REPEAT, start); + throw syntaxErrorAt(RbErrorMessages.MIN_REPEAT_GREATER_THAN_MAX_REPEAT, start, ErrorCode.InvalidQuantifier); } boolean greedy = true; if (canBeNonGreedy && match("?")) { @@ -2117,7 +2119,7 @@ private static int quantifierBoundsToIntValue(BigInteger i) { */ private void parens() { if (atEnd()) { - throw syntaxErrorAtEnd(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorAtEnd(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } if (match("?")) { final int ch1 = consumeChar(); @@ -2182,7 +2184,7 @@ private void parens() { break; default: - throw syntaxErrorAt(RbErrorMessages.unknownExtension(ch1), position - 1); + throw syntaxErrorAt(RbErrorMessages.unknownExtension(ch1), position - 1, ErrorCode.InvalidGroup); } } else { group(!containsNamedCaptureGroups()); @@ -2197,10 +2199,10 @@ private void parens() { private String parseGroupName(char terminator) { String groupName = getMany(c -> c != terminator); if (!match(Character.toString(terminator))) { - throw syntaxErrorHere(RbErrorMessages.unterminatedName(terminator)); + throw syntaxErrorHere(RbErrorMessages.unterminatedName(terminator), ErrorCode.InvalidNamedGroup); } if (groupName.isEmpty()) { - throw syntaxErrorHere(RbErrorMessages.MISSING_GROUP_NAME); + throw syntaxErrorHere(RbErrorMessages.MISSING_GROUP_NAME, ErrorCode.InvalidNamedGroup); } return groupName; } @@ -2212,7 +2214,7 @@ private void parenComment() { int beginPos = position - 2; while (true) { if (atEnd()) { - throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_COMMENT, beginPos); + throw syntaxErrorAt(RbErrorMessages.UNTERMINATED_COMMENT, beginPos, ErrorCode.UnmatchedParenthesis); } int ch = consumeChar(); if (ch == '\\' && !atEnd()) { @@ -2246,7 +2248,7 @@ private void group(boolean capturing) { } canHaveQuantifier = true; } else { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw 
syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } } @@ -2257,13 +2259,14 @@ private void group(boolean capturing) { * @param negate {@code true} if the assertion to be pushed is a negative lookahead assertion */ private void lookahead(boolean negate) { + notAllowedInLookbehind(position); pushLookAheadAssertion(negate); disjunction(); if (match(")")) { popGroup(); canHaveQuantifier = true; } else { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } } @@ -2279,7 +2282,7 @@ private void lookbehind(boolean negate) { popGroup(); canHaveQuantifier = true; } else { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } } @@ -2294,7 +2297,7 @@ private void atomicGroup() { popGroup(); canHaveQuantifier = true; } else { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } } @@ -2307,16 +2310,16 @@ private void conditionalBackReference() { if (match("<")) { namedReference = curChar() != '-' && !RegexLexer.isDecimalDigit(curChar()); groupNumbers = parseGroupReference('>', true, true, true, true); - mustMatch(")"); + mustMatch(")", RbErrorMessages.EXPECTED_PAREN, ErrorCode.InvalidBackReference); } else if (match("'")) { namedReference = curChar() != '-' && !RegexLexer.isDecimalDigit(curChar()); groupNumbers = parseGroupReference('\'', true, true, true, true); - mustMatch(")"); + mustMatch(")", RbErrorMessages.EXPECTED_PAREN, ErrorCode.InvalidBackReference); } else if (RegexLexer.isDecimalDigit(curChar())) { namedReference = false; groupNumbers = parseGroupReference(')', true, false, true, true); } else { - throw syntaxErrorHere(RbErrorMessages.INVALID_GROUP_NAME); + throw 
syntaxErrorHere(RbErrorMessages.INVALID_GROUP_NAME, ErrorCode.InvalidBackReference); } pushConditionalBackReferenceGroup(groupNumbers.get(0), namedReference); alternative(); @@ -2325,14 +2328,14 @@ private void conditionalBackReference() { canHaveQuantifier = false; alternative(); if (curChar() == '|') { - throw syntaxErrorHere(RbErrorMessages.CONDITIONAL_BACKREF_WITH_MORE_THAN_TWO_BRANCHES); + throw syntaxErrorHere(RbErrorMessages.CONDITIONAL_BACKREF_WITH_MORE_THAN_TWO_BRANCHES, ErrorCode.InvalidGroup); } } else { // Generate the implicit empty else-branch, if it was not specified. nextSequence(); } if (!match(")")) { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } popGroup(); canHaveQuantifier = true; @@ -2347,7 +2350,7 @@ private void absentExpression() { if (match(")")) { bailOut("absent expressions not supported"); } else { - throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN); + throw syntaxErrorHere(RbErrorMessages.UNTERMINATED_SUBPATTERN, ErrorCode.UnmatchedParenthesis); } canHaveQuantifier = true; } @@ -2366,20 +2369,20 @@ private void flags(int ch0) { } else if (RubyFlags.isValidFlagChar(ch)) { if (negative) { if (RubyFlags.isTypeFlag(ch)) { - throw syntaxErrorHere(RbErrorMessages.UNDEFINED_GROUP_OPTION); + throw syntaxErrorHere(RbErrorMessages.UNDEFINED_GROUP_OPTION, ErrorCode.InvalidInlineFlag); } newFlags = newFlags.delFlag(ch); } else { newFlags = newFlags.addFlag(ch); } } else if (Character.isAlphabetic(ch)) { - throw syntaxErrorHere(RbErrorMessages.UNDEFINED_GROUP_OPTION); + throw syntaxErrorHere(RbErrorMessages.UNDEFINED_GROUP_OPTION, ErrorCode.InvalidInlineFlag); } else { - throw syntaxErrorHere(RbErrorMessages.MISSING_DASH_COLON_PAREN); + throw syntaxErrorHere(RbErrorMessages.MISSING_DASH_COLON_PAREN, ErrorCode.InvalidInlineFlag); } if (atEnd()) { - throw syntaxErrorAtEnd(RbErrorMessages.MISSING_FLAG_DASH_COLON_PAREN); 
+ throw syntaxErrorAtEnd(RbErrorMessages.MISSING_FLAG_DASH_COLON_PAREN, ErrorCode.InvalidInlineFlag); } ch = consumeChar(); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubySubexpressionCalls.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubySubexpressionCalls.java index abf3efb58d2..698dc35ef6f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubySubexpressionCalls.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubySubexpressionCalls.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -47,6 +47,7 @@ import com.oracle.truffle.regex.tregex.parser.ast.SubexpressionCall; import com.oracle.truffle.regex.tregex.parser.ast.visitors.CopyVisitor; import com.oracle.truffle.regex.tregex.parser.ast.visitors.DepthFirstTraversalRegexASTVisitor; +import com.oracle.truffle.regex.tregex.parser.ast.visitors.MarkAsAliveVisitor; import java.util.ArrayList; import java.util.HashMap; @@ -74,7 +75,7 @@ public static void expandNonRecursiveSubexpressionCalls(RegexAST ast) { CallGraphNode node = expansionStack.remove(expansionStack.size() - 1); if (node instanceof SubexpressionCallNode) { SubexpressionCall subexpressionCall = ((SubexpressionCallNode) node).subexpressionCall; - replace(subexpressionCall, ast.getGroup(subexpressionCall.getGroupNr()), copyVisitor); + replace(subexpressionCall, ast.getGroup(subexpressionCall.getGroupNr()).get(0), copyVisitor); } if (callGraph.containsKey(node)) { for (CallGraphNode dependent : callGraph.get(node)) { @@ -98,6 +99,7 @@ public static void expandNonRecursiveSubexpressionCalls(RegexAST ast) { 
private static void replace(SubexpressionCall caller, Group callee, CopyVisitor copyVisitor) { Group copy = (Group) copyVisitor.copy(callee); + MarkAsAliveVisitor.markAsAlive(copy); copy.setQuantifier(caller.getQuantifier()); Sequence callerSeq = caller.getParent(); int callerSeqIndex = caller.getSeqIndex(); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaFlags.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaFlags.java index 99cb31a8571..24b44362d52 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaFlags.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaFlags.java @@ -179,6 +179,10 @@ public String toString() { return sb.toString(); } + public int getValue() { + return value; + } + public boolean isCanonEq() { return isSet(Pattern.CANON_EQ); } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexLexer.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexLexer.java index 7129aa86d62..47fd08058a9 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexLexer.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexLexer.java @@ -48,6 +48,7 @@ import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.charset.ClassSetContents; import com.oracle.truffle.regex.charset.CodePointSet; @@ -378,27 +379,27 @@ protected long boundedQuantifierMaxValue() { @Override protected 
RegexSyntaxException handleBoundedQuantifierOutOfOrder() { - return syntaxError(JavaErrorMessages.ILLEGAL_REPETITION); + return syntaxError(JavaErrorMessages.ILLEGAL_REPETITION, ErrorCode.InvalidQuantifier); } @Override protected Token handleBoundedQuantifierEmptyOrMissingMin() { - throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION); + throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION, ErrorCode.InvalidQuantifier); } @Override protected Token handleBoundedQuantifierInvalidCharacter() { - throw syntaxError(JavaErrorMessages.UNCLOSED_COUNTED_CLOSURE); + throw syntaxError(JavaErrorMessages.UNCLOSED_COUNTED_CLOSURE, ErrorCode.InvalidQuantifier); } @Override protected Token handleBoundedQuantifierOverflow(long min, long max) { - throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION); + throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION, ErrorCode.InvalidQuantifier); } @Override protected Token handleBoundedQuantifierOverflowMin(long min, long max) { - throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION); + throw syntaxError(JavaErrorMessages.ILLEGAL_REPETITION, ErrorCode.InvalidQuantifier); } @Override @@ -418,12 +419,12 @@ protected RegexSyntaxException handleComplementOfStringSet() { @Override protected void handleGroupRedefinition(String name, int newId, int oldId) { - throw syntaxError(JavaErrorMessages.groupRedefinition(name)); + throw syntaxError(JavaErrorMessages.groupRedefinition(name), ErrorCode.InvalidNamedGroup); } @Override protected void handleIncompleteEscapeX() { - throw syntaxError(JavaErrorMessages.ILLEGAL_HEX_ESCAPE); + throw syntaxError(JavaErrorMessages.ILLEGAL_HEX_ESCAPE, ErrorCode.InvalidEscape); } @Override @@ -446,7 +447,7 @@ protected RegexSyntaxException handleInvalidCharInCharClass() { @Override protected RegexSyntaxException handleInvalidGroupBeginQ() { - return syntaxError(JavaErrorMessages.UNKNOWN_INLINE_MODIFIER); + return syntaxError(JavaErrorMessages.UNKNOWN_INLINE_MODIFIER, ErrorCode.InvalidGroup); } @Override @@ -470,7 
+471,7 @@ protected RegexSyntaxException handleRangeAsClassSetOperand(ClassSetOperator ope @Override protected void handleUnfinishedEscape() { - throw syntaxError(JavaErrorMessages.UNESCAPED_TRAILING_BACKSLASH); + throw syntaxError(JavaErrorMessages.UNESCAPED_TRAILING_BACKSLASH, ErrorCode.InvalidEscape); } @Override @@ -480,7 +481,7 @@ protected void handleUnfinishedGroupComment() { @Override protected RegexSyntaxException handleUnfinishedGroupQ() { - return syntaxError(JavaErrorMessages.UNKNOWN_INLINE_MODIFIER); + return syntaxError(JavaErrorMessages.UNKNOWN_INLINE_MODIFIER, ErrorCode.InvalidGroup); } @Override @@ -494,7 +495,7 @@ protected void handleUnmatchedRightBrace() { @Override protected RegexSyntaxException handleUnmatchedLeftBracket() { - return syntaxError(JavaErrorMessages.UNCLOSED_CHARACTER_CLASS); + return syntaxError(JavaErrorMessages.UNCLOSED_CHARACTER_CLASS, ErrorCode.UnmatchedBracket); } @Override @@ -530,11 +531,11 @@ protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws advance(); } if (!consumingLookahead("}")) { - throw syntaxError(JavaErrorMessages.UNCLOSED_CHAR_FAMILY); + throw syntaxError(JavaErrorMessages.UNCLOSED_CHAR_FAMILY, ErrorCode.InvalidCharacterClass); } name = pattern.substring(namePos, position - 1); if (name.isEmpty()) { - throw syntaxError(JavaErrorMessages.EMPTY_CHAR_FAMILY); + throw syntaxError(JavaErrorMessages.EMPTY_CHAR_FAMILY, ErrorCode.InvalidCharacterClass); } } CodePointSet p = null; @@ -563,7 +564,7 @@ protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws break; } if (p == null) { - throw syntaxError(JavaErrorMessages.unknownUnicodeProperty(name, value)); + throw syntaxError(JavaErrorMessages.unknownUnicodeProperty(name, value), ErrorCode.InvalidCharacterClass); } } else { if (name.startsWith("In")) { @@ -588,7 +589,7 @@ protected ClassSetContents parseUnicodeCharacterProperty(boolean invert) throws } } if (p == null) { - throw 
syntaxError(JavaErrorMessages.unknownUnicodeCharacterProperty(name)); + throw syntaxError(JavaErrorMessages.unknownUnicodeCharacterProperty(name), ErrorCode.InvalidCharacterClass); } } if (invert) { @@ -655,13 +656,13 @@ private CodePointSet parseCharClassInternal(boolean consume) throws RegexSyntaxE } if (prev == null) { if (right == null) { - throw syntaxError(JavaErrorMessages.BAD_CLASS_SYNTAX); + throw syntaxError(JavaErrorMessages.BAD_CLASS_SYNTAX, ErrorCode.InvalidCharacterClass); } else { prev = right; } } else { if (curr == null) { - throw syntaxError(JavaErrorMessages.BAD_INTERSECTION_SYNTAX); + throw syntaxError(JavaErrorMessages.BAD_INTERSECTION_SYNTAX, ErrorCode.InvalidCharacterClass); } prev = prev.createIntersection(curr, compilationBuffer); } @@ -722,14 +723,14 @@ private CodePointSet parseCharClassInternal(boolean consume) throws RegexSyntaxE } } } - throw syntaxError(JavaErrorMessages.UNCLOSED_CHARACTER_CLASS); + throw syntaxError(JavaErrorMessages.UNCLOSED_CHARACTER_CLASS, ErrorCode.UnmatchedBracket); } private CodePointSet parseRange(char c) { int ch = parseCharClassAtomCodePoint(c); if (consumingLookahead('-')) { if (atEnd()) { - throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_RANGE); + throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_RANGE, ErrorCode.InvalidCharacterClass); } if (curChar() == ']' || curChar() == '[') { // unmatched '-' is treated as literal @@ -738,7 +739,7 @@ private CodePointSet parseRange(char c) { } int upper = parseCharClassAtomCodePoint(consumeChar()); if (upper < ch) { - throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_RANGE); + throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_RANGE, ErrorCode.InvalidCharacterClass); } return CodePointSet.create(ch, upper); } else { @@ -763,14 +764,14 @@ protected Token parseCustomEscape(char c) { handleUnfinishedEscape(); } if (consumeChar() != '<') { - throw syntaxError(JavaErrorMessages.NAMED_CAPTURE_GROUP_REFERENCE_MISSING_BEGIN); + throw 
syntaxError(JavaErrorMessages.NAMED_CAPTURE_GROUP_REFERENCE_MISSING_BEGIN, ErrorCode.InvalidBackReference); } String groupName = javaParseGroupName(); // backward reference if (namedCaptureGroups.containsKey(groupName)) { return Token.createBackReference(getSingleNamedGroupNumber(groupName), false); } - throw syntaxError(JavaErrorMessages.unknownGroupReference(groupName)); + throw syntaxError(JavaErrorMessages.unknownGroupReference(groupName), ErrorCode.InvalidBackReference); } case 'R' -> { return Token.createLineBreak(); @@ -797,12 +798,12 @@ protected int parseCustomEscapeChar(char c, boolean inCharClass) { switch (c) { case 'b': // \b is only valid as the boundary matcher and not as a character escape - throw syntaxError(JavaErrorMessages.ILLEGAL_ESCAPE_SEQUENCE); + throw syntaxError(JavaErrorMessages.ILLEGAL_ESCAPE_SEQUENCE, ErrorCode.InvalidEscape); case '0': if (lookahead(RegexLexer::isOctalDigit, 1)) { return parseOctal(0, 3); } - throw syntaxError(JavaErrorMessages.ILLEGAL_OCT_ESCAPE); + throw syntaxError(JavaErrorMessages.ILLEGAL_OCT_ESCAPE, ErrorCode.InvalidEscape); case 'u': int n = parseUnicodeHexEscape(); if (Character.isHighSurrogate((char) n)) { @@ -819,36 +820,36 @@ protected int parseCustomEscapeChar(char c, boolean inCharClass) { case 'x': if (consumingLookahead('{')) { int hex = parseHex(1, 8, Character.MAX_CODE_POINT, () -> { - throw syntaxError(JavaErrorMessages.ILLEGAL_HEX_ESCAPE); + throw syntaxError(JavaErrorMessages.ILLEGAL_HEX_ESCAPE, ErrorCode.InvalidEscape); }, () -> { - throw syntaxError(JavaErrorMessages.HEX_TOO_BIG); + throw syntaxError(JavaErrorMessages.HEX_TOO_BIG, ErrorCode.InvalidEscape); }); if (!consumingLookahead('}')) { - throw syntaxError(JavaErrorMessages.UNCLOSED_HEX); + throw syntaxError(JavaErrorMessages.UNCLOSED_HEX, ErrorCode.InvalidEscape); } return hex; } return -1; case 'c': if (atEnd()) { - throw syntaxError(JavaErrorMessages.ILLEGAL_CTRL_SEQ); + throw syntaxError(JavaErrorMessages.ILLEGAL_CTRL_SEQ, 
ErrorCode.InvalidEscape); } return consumeChar() ^ 64; case 'N': if (consumingLookahead('{')) { int i = position; if (!findChars('}')) { - throw syntaxError(JavaErrorMessages.UNCLOSED_CHAR_NAME); + throw syntaxError(JavaErrorMessages.UNCLOSED_CHAR_NAME, ErrorCode.InvalidEscape); } advance(); // skip '}' String name = pattern.substring(i, position - 1); try { return Character.codePointOf(name); } catch (IllegalArgumentException x) { - throw syntaxError(JavaErrorMessages.unknownCharacterName(name)); + throw syntaxError(JavaErrorMessages.unknownCharacterName(name), ErrorCode.InvalidEscape); } } - throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_NAME); + throw syntaxError(JavaErrorMessages.ILLEGAL_CHARACTER_NAME, ErrorCode.InvalidEscape); case 'a': return 0x7; case 'e': @@ -864,7 +865,7 @@ private int parseUnicodeHexEscape() { if (consumingLookahead(RegexLexer::isHexDigit, 4)) { return Integer.parseInt(pattern, position - 4, position, 16); } - throw syntaxError(JavaErrorMessages.ILLEGAL_UNICODE_ESC_SEQ); + throw syntaxError(JavaErrorMessages.ILLEGAL_UNICODE_ESC_SEQ, ErrorCode.InvalidEscape); } @Override @@ -873,7 +874,7 @@ protected int parseCustomEscapeCharFallback(int c, boolean inCharClass) { // digits are not accepted here since they should have been parsed as octal sequence or // backreference earlier if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { - throw syntaxError(JavaErrorMessages.ILLEGAL_ESCAPE_SEQUENCE); + throw syntaxError(JavaErrorMessages.ILLEGAL_ESCAPE_SEQUENCE, ErrorCode.InvalidEscape); } return c; } @@ -929,9 +930,9 @@ private String javaParseGroupName() { ParseGroupNameResult result = parseGroupName('>'); switch (result.state) { case empty, invalidStart: - throw syntaxError(JavaErrorMessages.INVALID_GROUP_NAME_START); + throw syntaxError(JavaErrorMessages.INVALID_GROUP_NAME_START, ErrorCode.InvalidNamedGroup); case unterminated, invalidRest: - throw syntaxError(JavaErrorMessages.INVALID_GROUP_NAME_REST); + throw 
syntaxError(JavaErrorMessages.INVALID_GROUP_NAME_REST, ErrorCode.InvalidNamedGroup); case valid: return result.groupName; default: diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexParser.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexParser.java index 2c39bf6271a..2a2e9d7630d 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexParser.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexParser.java @@ -46,6 +46,7 @@ import com.oracle.truffle.regex.RegexLanguage; import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.charset.CodePointSet; import com.oracle.truffle.regex.charset.Constants; @@ -107,11 +108,23 @@ public RegexAST parse() { addCaret(); break; case Z: - pushGroup(); // (?: - lineTerminators(); + pushGroup(); + pushLookAheadAssertion(); + if (getFlags().isUnixLines()) { + addCharClass(CodePointSet.create('\n')); + addDollar(); + } else { + addCharClass(CodePointSet.create('\r')); + addCharClass(CodePointSet.create('\n')); + addDollar(); + nextSequence(); + addCharClass(CodePointSet.createNoDedup('\n', '\n', '\r', '\r', 0x0085, 0x0085, 0x2028, 0x2029)); + addDollar(); + } + popGroup(); nextSequence(); - popGroup(); // ) addDollar(); + popGroup(); break; case z: addDollar(); @@ -143,7 +156,7 @@ public RegexAST parse() { Token.Quantifier quantifier = (Token.Quantifier) token; // quantifiers of type *, + or ? 
cannot directly follow another quantifier if (last instanceof Token.Quantifier && quantifier.isSingleChar()) { - throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier)); + throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier), ErrorCode.InvalidQuantifier); } if (astBuilder.getCurTerm() != null) { if (quantifier.isPossessive()) { @@ -152,7 +165,7 @@ public RegexAST parse() { addQuantifier((Token.Quantifier) token); } else { if (quantifier.isSingleChar()) { - throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier)); + throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier), ErrorCode.InvalidQuantifier); } } break; @@ -186,7 +199,7 @@ public RegexAST parse() { break; case groupEnd: if (astBuilder.getCurGroup().getParent() instanceof RegexASTRootNode) { - throw syntaxErrorHere(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS); + throw syntaxErrorHere(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } lexer.popLocalFlags(); astBuilder.popGroup(token); @@ -216,7 +229,7 @@ public RegexAST parse() { astBuilder.addDollar(); } if (!astBuilder.curGroupIsRoot()) { - throw syntaxErrorHere(JavaErrorMessages.UNCLOSED_GROUP); + throw syntaxErrorHere(JavaErrorMessages.UNCLOSED_GROUP, ErrorCode.UnmatchedParenthesis); } return astBuilder.popRootGroup(); } @@ -233,8 +246,8 @@ public AbstractRegexObject getNamedCaptureGroups() { // Error reporting - private RegexSyntaxException syntaxErrorHere(String message) { - return RegexSyntaxException.createPattern(source, message, lexer.getLastTokenPosition()); + private RegexSyntaxException syntaxErrorHere(String message, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, message, lexer.getLastTokenPosition(), errorCode); } private void literalString(Token.LiteralString token) { diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexValidator.java 
b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexValidator.java index 76a22eb8530..c88316405d2 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexValidator.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaRegexValidator.java @@ -42,6 +42,7 @@ import com.oracle.truffle.regex.RegexSource; import com.oracle.truffle.regex.RegexSyntaxException; +import com.oracle.truffle.regex.RegexSyntaxException.ErrorCode; import com.oracle.truffle.regex.UnsupportedRegexException; import com.oracle.truffle.regex.errors.JavaErrorMessages; import com.oracle.truffle.regex.errors.JsErrorMessages; @@ -117,10 +118,10 @@ public void validate() throws RegexSyntaxException { Token.Quantifier quantifier = (Token.Quantifier) token; // quantifiers of type *, + or ? cannot directly follow another quantifier if (last instanceof Token.Quantifier && quantifier.isSingleChar()) { - throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier)); + throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier), ErrorCode.InvalidQuantifier); } if (curTermState == CurTermState.Null && quantifier.isSingleChar()) { - throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier)); + throw syntaxErrorHere(JavaErrorMessages.danglingMetaCharacter(quantifier), ErrorCode.InvalidQuantifier); } if (quantifier.isPossessive()) { throw new UnsupportedRegexException("possessive quantifiers are not supported"); @@ -145,7 +146,7 @@ public void validate() throws RegexSyntaxException { break; case groupEnd: if (syntaxStack.isEmpty()) { - throw syntaxErrorHere(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS); + throw syntaxErrorHere(JsErrorMessages.UNMATCHED_RIGHT_PARENTHESIS, ErrorCode.UnmatchedParenthesis); } RegexStackElem poppedElem = syntaxStack.remove(syntaxStack.size() - 1); switch (poppedElem) { @@ -164,12 +165,12 @@ 
public void validate() throws RegexSyntaxException { } if (!syntaxStack.isEmpty()) { - throw syntaxErrorHere(JavaErrorMessages.UNCLOSED_GROUP); + throw syntaxErrorHere(JavaErrorMessages.UNCLOSED_GROUP, ErrorCode.UnmatchedParenthesis); } } // Error reporting - private RegexSyntaxException syntaxErrorHere(String message) { - return RegexSyntaxException.createPattern(source, message, lexer.getLastTokenPosition()); + private RegexSyntaxException syntaxErrorHere(String message, ErrorCode errorCode) { + return RegexSyntaxException.createPattern(source, message, lexer.getLastTokenPosition(), errorCode); } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java index d6608f6c1d3..56496640831 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/DebugUtil.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * The Universal Permissive License (UPL), Version 1.0 @@ -78,23 +78,24 @@ public static String escapeString(String s) { public static String regexSourceEscape(String pattern, String flags) { StringBuilder sb = new StringBuilder(pattern.length() + 2); sb.append('/'); - int i = 0; - while (i < pattern.length()) { - int c = pattern.codePointAt(i); + javaStringEscape(sb, pattern); + return sb.append('/').append(flags).toString(); + } + + public static String javaStringEscape(String string) { + return javaStringEscape(new StringBuilder(string.length()), string).toString(); + } + + private static StringBuilder javaStringEscape(StringBuilder sb, String string) { + for (int i = 0; i < string.length(); i++) { + int c = string.charAt(i); if (0x20 <= c && c <= 0x7e) { sb.appendCodePoint(c); } else { - sb.append("\\u"); - if (c > 0xffff) { - i++; - sb.append(String.format("{%06x}", c)); - } else { - sb.append(String.format("%04x", c)); - } + sb.append("\\u").append(String.format("%04x", c)); } - i++; } - return sb.append('/').append(flags).toString(); + return sb; } @TruffleBoundary @@ -111,7 +112,7 @@ public static String nodeID(int id) { @TruffleBoundary public static String jsStringEscape(String str) { - StringBuffer escapedString = new StringBuffer(); + StringBuilder escapedString = new StringBuilder(); Matcher m = specialChars.matcher(str); while (m.find()) { String replacement; diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/MathUtil.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/MathUtil.java index ab87aa515ea..8ed09130599 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/MathUtil.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/util/MathUtil.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. 
All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -50,4 +50,11 @@ public static int log2floor(int x) { public static int log2ceil(int x) { return 32 - Integer.numberOfLeadingZeros(x - 1); } + + public static int saturatingInc(int x) { + if (x == Integer.MAX_VALUE) { + return x; + } + return x + 1; + } } diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java index 896f8de42f5..3e629d4ea9f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/BitSets.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -107,6 +107,9 @@ public static boolean add(long[] bs, int index) { return bs[wordIndex(index)] != old; } + /** + * Set all bits from lo (inclusive) to hi (inclusive). + */ public static void setRange(long[] bs, int lo, int hi) { int wordIndexLo = wordIndex(lo); int wordIndexHi = wordIndex(hi); @@ -123,6 +126,9 @@ public static void setRange(long[] bs, int lo, int hi) { bs[wordIndexHi] |= rangeHi; } + /** + * Clear all bits from lo (inclusive) to hi (inclusive). 
+ */ public static void clearRange(long[] bs, int lo, int hi) { int wordIndexLo = wordIndex(lo); int wordIndexHi = wordIndex(hi); diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/TBitSet.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/TBitSet.java index ff385c1b137..2994bc1fe2f 100644 --- a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/TBitSet.java +++ b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/util/TBitSet.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -155,11 +155,17 @@ public void set(int b) { BitSets.set(words, b); } + /** + * Set all bits from lo (inclusive) to hi (inclusive). + */ public void setRange(int lo, int hi) { ensureCapacity(BitSets.wordIndex(hi) + 1); BitSets.setRange(words, lo, hi); } + /** + * Clear all bits from lo (inclusive) to hi (inclusive). + */ public void clearRange(int lo, int hi) { ensureCapacity(BitSets.wordIndex(hi) + 1); BitSets.clearRange(words, lo, hi);