Skip to content

Commit 5b7b717

Browse files
committed
[GR-57588] Implement the JS regexp-modifiers proposal in TRegex.
PullRequest: graal/19506
2 parents eef4988 + 2fd3c55 commit 5b7b717

File tree

10 files changed

+253
-126
lines changed

10 files changed

+253
-126
lines changed

regex/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
This changelog summarizes major changes between TRegex versions relevant to language implementors integrating TRegex into their language. This document will focus on API changes relevant to integrators of TRegex.
44

5+
## Version 24.2.0
6+
7+
* Implemented the [Regular Expression Pattern Modifiers](https://github.com/tc39/proposal-regexp-modifiers) proposal for ECMAScript regular expressions.
8+
59
## Version 24.0.0
610

711
* Added support for atomic groups and possessive quantifiers in Python regular expressions.

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/RegexFlags.java

Lines changed: 88 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -50,6 +50,7 @@
5050
import com.oracle.truffle.regex.tregex.util.json.Json;
5151
import com.oracle.truffle.regex.tregex.util.json.JsonConvertible;
5252
import com.oracle.truffle.regex.tregex.util.json.JsonValue;
53+
import com.oracle.truffle.regex.util.TBitSet;
5354
import com.oracle.truffle.regex.util.TruffleReadOnlyKeysArray;
5455

5556
@ExportLibrary(InteropLibrary.class)
@@ -76,6 +77,9 @@ public final class RegexFlags extends AbstractConstantKeysObject implements Json
7677
PROP_HAS_INDICES,
7778
PROP_UNICODE_SETS);
7879

80+
// ALL_FLAG_CHARS: every character accepted as a regex flag (backs isValidFlagChar).
// LOCAL_FLAG_CHARS: the subset backing isValidLocalFlagChar -- presumably the flags that
// may appear in an inline modifier group; confirm against the parser that calls it.
private static final TBitSet ALL_FLAG_CHARS = TBitSet.valueOf('d', 'g', 'i', 'm', 's', 'u', 'v', 'y');
81+
private static final TBitSet LOCAL_FLAG_CHARS = TBitSet.valueOf('i', 'm', 's');
82+
7983
private static final int NONE = 0;
8084
private static final int IGNORE_CASE = 1;
8185
private static final int MULTILINE = 1 << 1;
@@ -86,6 +90,10 @@ public final class RegexFlags extends AbstractConstantKeysObject implements Json
8690
private static final int HAS_INDICES = 1 << 6;
8791
private static final int UNICODE_SETS = 1 << 7;
8892

93+
// Bit mask for each letter from 'd' to 'y', indexed by (letter - 'd'); see maskForFlag.
// Slots for letters that are not flags hold 0. The table has 22 entries, so the positions
// must stay aligned with the alphabet: d=0, g=3, i=5, m=9, s=15, u=17, v=18, y=21.
private static final int[] FLAG_LOOKUP = {
94+
HAS_INDICES, 0, 0, GLOBAL, 0, IGNORE_CASE, 0, 0, 0, MULTILINE, 0, 0, 0, 0, 0, DOT_ALL, 0, UNICODE, UNICODE_SETS, 0, 0, STICKY
95+
};
96+
8997
public static final RegexFlags DEFAULT = new RegexFlags("", NONE);
9098

9199
private final String source;
@@ -96,6 +104,17 @@ private RegexFlags(String source, int value) {
96104
this.value = value;
97105
}
98106

107+
/**
 * Creates a flags object from a bit mask alone. The textual form stored in {@code source} is
 * derived from the mask via {@code generateSource(value)}, so it follows that method's fixed
 * letter order.
 *
 * @param value bitwise OR of the flag mask constants declared in this class
 */
private RegexFlags(int value) {
108+
this.source = generateSource(value);
109+
this.value = value;
110+
}
111+
112+
/**
 * Returns the bit mask for a flag character by indexing {@code FLAG_LOOKUP} with
 * {@code flagChar - 'd'}.
 *
 * NOTE(review): validity is checked by an {@code assert} only. With assertions disabled, a
 * character outside 'd'..'y' makes the array access throw, and a non-flag letter inside that
 * range returns 0. Callers should check {@code isValidFlagChar} first.
 *
 * @param flagChar a character contained in {@code ALL_FLAG_CHARS}
 * @return the mask constant for {@code flagChar}
 */
private static int maskForFlag(char flagChar) {
113+
assert ALL_FLAG_CHARS.get(flagChar);
114+
// flagChar must be one of [d-y].
115+
return FLAG_LOOKUP[flagChar - 'd'];
116+
}
117+
99118
public static Builder builder() {
100119
return new Builder();
101120
}
@@ -109,51 +128,21 @@ public static RegexFlags parseFlags(RegexSource source) throws RegexSyntaxExcept
109128
int flags = NONE;
110129
for (int i = 0; i < flagsStr.length(); i++) {
111130
char ch = flagsStr.charAt(i);
112-
switch (ch) {
113-
case 'i':
114-
flags = addFlag(source, flags, i, IGNORE_CASE);
115-
break;
116-
case 'm':
117-
flags = addFlag(source, flags, i, MULTILINE);
118-
break;
119-
case 'g':
120-
flags = addFlag(source, flags, i, GLOBAL);
121-
break;
122-
case 'y':
123-
flags = addFlag(source, flags, i, STICKY);
124-
break;
125-
case 'u':
126-
if ((flags & UNICODE_SETS) != 0) {
127-
throw RegexSyntaxException.createFlags(source, JsErrorMessages.BOTH_FLAGS_SET_U_V, i);
128-
}
129-
flags = addFlag(source, flags, i, UNICODE);
130-
break;
131-
case 's':
132-
flags = addFlag(source, flags, i, DOT_ALL);
133-
break;
134-
case 'd':
135-
flags = addFlag(source, flags, i, HAS_INDICES);
136-
break;
137-
case 'v':
138-
if ((flags & UNICODE) != 0) {
139-
throw RegexSyntaxException.createFlags(source, JsErrorMessages.BOTH_FLAGS_SET_U_V, i);
140-
}
141-
flags = addFlag(source, flags, i, UNICODE_SETS);
142-
break;
143-
default:
144-
throw RegexSyntaxException.createFlags(source, JsErrorMessages.UNSUPPORTED_FLAG, i);
131+
if (!isValidFlagChar(ch)) {
132+
throw RegexSyntaxException.createFlags(source, JsErrorMessages.UNSUPPORTED_FLAG, i);
133+
}
134+
int flag = maskForFlag(ch);
135+
if ((flags & flag) != 0) {
136+
throw RegexSyntaxException.createFlags(source, JsErrorMessages.REPEATED_FLAG, i);
137+
}
138+
flags |= flag;
139+
if ((flags & (UNICODE | UNICODE_SETS)) == (UNICODE | UNICODE_SETS)) {
140+
throw RegexSyntaxException.createFlags(source, JsErrorMessages.BOTH_FLAGS_SET_U_V, i);
145141
}
146142
}
147143
return new RegexFlags(flagsStr, flags);
148144
}
149145

150-
private static int addFlag(RegexSource source, int flags, int i, int flag) {
151-
if ((flags & flag) != 0) {
152-
throw RegexSyntaxException.createFlags(source, JsErrorMessages.REPEATED_FLAG, i);
153-
}
154-
return flags | flag;
155-
}
156-
157146
public String getSource() {
158147
return source;
159148
}
@@ -202,6 +191,34 @@ private boolean isSet(int flag) {
202191
return (value & flag) != NONE;
203192
}
204193

194+
/**
 * Tells whether {@code candidateChar} is a member of {@code ALL_FLAG_CHARS}
 * (d, g, i, m, s, u, v, y).
 */
public static boolean isValidFlagChar(char candidateChar) {
195+
return ALL_FLAG_CHARS.get(candidateChar);
196+
}
197+
198+
/**
 * Tells whether {@code candidateChar} is a member of {@code LOCAL_FLAG_CHARS} (i, m, s).
 */
public static boolean isValidLocalFlagChar(char candidateChar) {
199+
return LOCAL_FLAG_CHARS.get(candidateChar);
200+
}
201+
202+
/**
 * Returns a new flags object equal to this one with the flag for {@code flagChar} added.
 * The receiver's fields are not assigned.
 *
 * This method does not check that {@code flagChar} is a valid (or local) flag character
 * beyond the assertion inside {@code maskForFlag}; callers are expected to do that first.
 *
 * @param regexSource source passed to the exception factory when the flag is a duplicate
 * @param flagChar flag character to add
 * @return a new {@code RegexFlags} with the additional bit set
 * @throws RegexSyntaxException with {@code JsErrorMessages.REPEATED_FLAG_IN_MODIFIER} if the
 *             flag is already set in this object
 */
public RegexFlags addNewFlagModifier(RegexSource regexSource, char flagChar) {
203+
int flag = maskForFlag(flagChar);
204+
if (isSet(flag)) {
205+
throw RegexSyntaxException.createFlags(regexSource, JsErrorMessages.REPEATED_FLAG_IN_MODIFIER);
206+
}
207+
return new RegexFlags(this.value | flag);
208+
}
209+
210+
/**
 * Returns a new flags object whose mask is the bitwise OR (union) of this object's mask and
 * {@code otherFlags}' mask.
 */
public RegexFlags addFlags(RegexFlags otherFlags) {
211+
return new RegexFlags(this.value | otherFlags.value);
212+
}
213+
214+
/**
 * Returns a new flags object containing the flags of this object with every flag that is set
 * in {@code otherFlags} cleared.
 */
public RegexFlags delFlags(RegexFlags otherFlags) {
215+
return new RegexFlags(this.value & ~otherFlags.value);
216+
}
217+
218+
/**
 * Tells whether this object and {@code otherFlags} have at least one flag in common.
 */
public boolean overlaps(RegexFlags otherFlags) {
219+
return (this.value & otherFlags.value) != 0;
220+
}
221+
205222
@Override
206223
public String toString() {
207224
return source;
@@ -287,6 +304,35 @@ public Object toDisplayString(@SuppressWarnings("unused") boolean allowSideEffec
287304
return "TRegexJSFlags{flags=" + toString() + '}';
288305
}
289306

307+
/**
 * Builds the flag string for a bit mask. Letters are emitted in the fixed order
 * i, m, y, g, u, s, d, v, one per bit set in {@code value}; an empty mask yields "".
 *
 * @param value bitwise OR of the flag mask constants declared in this class
 * @return the concatenated flag letters for the bits set in {@code value}
 */
private static String generateSource(int value) {
308+
StringBuilder sb = new StringBuilder(8);
309+
if ((value & IGNORE_CASE) != 0) {
310+
sb.append("i");
311+
}
312+
if ((value & MULTILINE) != 0) {
313+
sb.append("m");
314+
}
315+
if ((value & STICKY) != 0) {
316+
sb.append("y");
317+
}
318+
if ((value & GLOBAL) != 0) {
319+
sb.append("g");
320+
}
321+
if ((value & UNICODE) != 0) {
322+
sb.append("u");
323+
}
324+
if ((value & DOT_ALL) != 0) {
325+
sb.append("s");
326+
}
327+
if ((value & HAS_INDICES) != 0) {
328+
sb.append("d");
329+
}
330+
if ((value & UNICODE_SETS) != 0) {
331+
sb.append("v");
332+
}
333+
return sb.toString();
334+
}
335+
290336
public static final class Builder {
291337

292338
private int value;
@@ -342,7 +388,7 @@ public Builder unicodeSets(boolean enabled) {
342388

343389
@TruffleBoundary
344390
public RegexFlags build() {
345-
return new RegexFlags(generateSource(), this.value);
391+
return new RegexFlags(generateSource(this.value), this.value);
346392
}
347393

348394
private void updateFlag(boolean enabled, int bitMask) {
@@ -352,38 +398,5 @@ private void updateFlag(boolean enabled, int bitMask) {
352398
this.value &= ~bitMask;
353399
}
354400
}
355-
356-
private boolean isSet(int flag) {
357-
return (value & flag) != NONE;
358-
}
359-
360-
private String generateSource() {
361-
StringBuilder sb = new StringBuilder(7);
362-
if (isSet(IGNORE_CASE)) {
363-
sb.append("i");
364-
}
365-
if (isSet(MULTILINE)) {
366-
sb.append("m");
367-
}
368-
if (isSet(STICKY)) {
369-
sb.append("y");
370-
}
371-
if (isSet(GLOBAL)) {
372-
sb.append("g");
373-
}
374-
if (isSet(UNICODE)) {
375-
sb.append("u");
376-
}
377-
if (isSet(DOT_ALL)) {
378-
sb.append("s");
379-
}
380-
if (isSet(HAS_INDICES)) {
381-
sb.append("d");
382-
}
383-
if (isSet(UNICODE_SETS)) {
384-
sb.append("v");
385-
}
386-
return sb.toString();
387-
}
388401
}
389402
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/errors/JsErrorMessages.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -50,30 +50,36 @@ public class JsErrorMessages {
5050
public static final String CHAR_CLASS_RANGE_OUT_OF_ORDER = "Range out of order in character class";
5151
public static final String COMPLEMENT_OF_STRING_SET = "Negated character class may contain strings";
5252
public static final String EMPTY_GROUP_NAME = "Empty named capture group name";
53+
public static final String EMPTY_MODIFIER = "No flags in modifier";
5354
public static final String ENDS_WITH_UNFINISHED_ESCAPE_SEQUENCE = "Ends with an unfinished escape sequence";
5455
public static final String ENDS_WITH_UNFINISHED_UNICODE_PROPERTY = "Ends with an unfinished Unicode property escape";
5556
public static final String INCOMPLETE_QUANTIFIER = "Incomplete quantifier";
57+
public static final String INCOMPLETE_MODIFIER = "Incomplete modifier";
5658
public static final String INVALID_CHARACTER_CLASS = "Invalid character class";
5759
public static final String INVALID_CHARACTER_IN_CHARACTER_CLASS = "Invalid character in character class";
5860
public static final String INVALID_CONTROL_CHAR_ESCAPE = "Invalid control char escape";
5961
public static final String INVALID_ESCAPE = "Invalid escape";
6062
public static final String INVALID_GROUP = "Invalid group";
6163
public static final String INVALID_GROUP_NAME_PART = "Invalid character in group name";
6264
public static final String INVALID_GROUP_NAME_START = "Invalid character at start of group name";
65+
public static final String INVALID_MODIFIER = "Invalid modifier";
6366
public static final String INVALID_UNICODE_ESCAPE = "Invalid Unicode escape";
6467
public static final String INVALID_UNICODE_PROPERTY = "Invalid Unicode property escape";
6568
public static final String MISSING_GROUP_FOR_BACKREFERENCE = "Missing capture group for backreference";
6669
public static final String MISSING_GROUP_NAME = "Missing group name in named capture group reference";
70+
public static final String MODIFIER_BOTH_ADDING_AND_REMOVING_FLAG = "Modifier is both adding and removing the same flag";
6771
public static final String MULTIPLE_GROUPS_SAME_NAME = "Multiple named capture groups with the same name";
6872
public static final String QUANTIFIER_ON_LOOKAHEAD_ASSERTION = "Quantifier on lookahead assertion";
6973
public static final String QUANTIFIER_ON_LOOKBEHIND_ASSERTION = "Quantifier on lookbehind assertion";
7074
public static final String QUANTIFIER_ON_QUANTIFIER = "Quantifier on quantifier";
7175
public static final String QUANTIFIER_OUT_OF_ORDER = "Numbers out of order in {} quantifier";
7276
public static final String QUANTIFIER_WITHOUT_TARGET = "Quantifier without target";
77+
public static final String REPEATED_FLAG_IN_MODIFIER = "Repeated regex flag in modifier";
7378
public static final String UNMATCHED_LEFT_BRACKET = "Unterminated character class";
7479
public static final String UNMATCHED_RIGHT_BRACKET = "Unmatched ']'";
7580
public static final String UNMATCHED_RIGHT_PARENTHESIS = "Unmatched ')'";
7681
public static final String UNMATCHED_RIGHT_BRACE = "Unmatched '}'";
82+
public static final String UNSUPPORTED_FLAG_IN_MODIFIER = "Invalid regular expression flag in modifier";
7783
public static final String UNTERMINATED_GROUP = "Unterminated group";
7884
public static final String UNTERMINATED_GROUP_NAME = "Unterminated group name";
7985
public static final String UNTERMINATED_STRING_SET = "Unterminated string set";
@@ -103,6 +109,10 @@ public static String invalidRegularExpression(RegexSource source, String message
103109
return String.format("Invalid regular expression: %s: %s", source, message);
104110
}
105111

112+
/**
 * Formats the error message for a flag character that may not be used in a modifier.
 *
 * @param flagChar the offending flag character, inserted between single quotes
 * @return the formatted message text
 */
public static String flagNotAllowedInModifier(char flagChar) {
113+
return String.format("Flag '%s' not allowed in modifier", flagChar);
114+
}
115+
106116
/* flag related errors */
107117

108118
public static final String REPEATED_FLAG = "Repeated regex flag";

0 commit comments

Comments
 (0)