Skip to content

Commit a6a86b6

Browse files
committed
TRegex: add support for Ruby-specific Unicode property name mangling.
1 parent 6055f0a commit a6a86b6

File tree

8 files changed

+110
-98
lines changed

8 files changed

+110
-98
lines changed

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,4 +643,14 @@ public void gh3167() {
643643
public void gr52472() {
644644
test("(|a+?){0,4}b", "", "aaab", 0, true, 0, 4, 1, 3);
645645
}
646+
647+
@Test
648+
public void unicodePropertyNameMangling() {
649+
test("\\p{private_use}", "", "\ue000", 0, true, 0, 1);
650+
test("\\p{private-use}", "", "\ue000", 0, true, 0, 1);
651+
test("\\p{PRIVATE use}", "", "\ue000", 0, true, 0, 1);
652+
test("\\p{private use}", "", "\ue000", 0, true, 0, 1);
653+
test("\\p{privateuse}", "", "\ue000", 0, true, 0, 1);
654+
test("\\p{p R iV__- --_aTe Us e___}", "", "\ue000", 0, true, 0, 1);
655+
}
646656
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/UnicodeProperties.java

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -63,35 +63,43 @@ public class UnicodeProperties {
6363
OTHER_PROPERTIES_NAMES_SET.addAll(List.of(OTHER_PROPERTIES_NAMES));
6464
}
6565

66-
/**
67-
* Match all unicode property names in case-insensitive mode.
68-
*/
69-
public static final int CASE_INSENSITIVE = 1;
66+
/**
 * Strategy used to match a user-supplied Unicode property name against the known aliases.
 */
public enum NameMatchingMode {
    /** The name is used verbatim. */
    exact,
    /** The name is lower-cased before comparison. */
    ignoreCase,
    /** Ruby-style mangling: every '-', '_' and ' ' is removed, then the name is lower-cased. */
    ruby;

    /**
     * Converts {@code name} to the form used as lookup key under this matching mode.
     *
     * Lower-casing is done with {@link java.util.Locale#ROOT} so that the result does not depend
     * on the JVM default locale (e.g. under a Turkish locale {@code "I"} would otherwise become a
     * dotless {@code "ı"} and ASCII property names would no longer match).
     *
     * @param name the property name or alias to normalize
     * @return {@code name} itself for {@link #exact}, otherwise the normalized name
     */
    public String normalize(String name) {
        return switch (this) {
            case exact -> name;
            case ignoreCase -> name.toLowerCase(java.util.Locale.ROOT);
            case ruby -> name.replaceAll("[-_ ]", "").toLowerCase(java.util.Locale.ROOT);
        };
    }
}
79+
7080
/**
7181
* Expose {@code blk=} unicode block ranges.
7282
*/
73-
public static final int BLOCKS = 1 << 1;
83+
public static final int BLOCKS = 1;
7484
/**
7585
* Expose "Other" unicode properties, see {@code OTHER_PROPERTIES_NAMES}.
7686
*/
77-
public static final int OTHER_PROPERTIES = 1 << 2;
87+
public static final int OTHER_PROPERTIES = 1 << 1;
7888

7989
private final UnicodePropertyData data;
8090
private final int flags;
91+
private final NameMatchingMode nameMatchingMode;
8192

82-
public UnicodeProperties(UnicodePropertyData data, int flags) {
93+
public UnicodeProperties(UnicodePropertyData data, int flags, NameMatchingMode nameMatchingMode) {
8394
this.data = data;
8495
this.flags = flags;
96+
this.nameMatchingMode = nameMatchingMode;
8597
}
8698

8799
private boolean isFlagSet(int flag) {
88100
return (flags & flag) != 0;
89101
}
90102

91-
private boolean isCaseInsensitive() {
92-
return isFlagSet(CASE_INSENSITIVE);
93-
}
94-
95103
private boolean withBlocks() {
96104
return isFlagSet(BLOCKS);
97105
}
@@ -161,43 +169,43 @@ private String normalizePropertySpec(String propertySpec) {
161169
}
162170

163171
private String normalizePropertyName(String propertyName) {
164-
String name = returnOrThrow(propertyName, "character property", data.lookupPropertyAlias(propertyName, isCaseInsensitive()));
172+
String name = returnOrThrow(propertyName, "character property", data.lookupPropertyAlias(propertyName, nameMatchingMode));
165173
if (!withOtherProperties() && OTHER_PROPERTIES_NAMES_SET.contains(name)) {
166174
throw new IllegalArgumentException(String.format("Unsupported Unicode character property '%s'", propertyName));
167175
}
168176
return name;
169177
}
170178

171179
private String normalizeGeneralCategoryName(String generalCategoryName) {
172-
return returnOrThrow(generalCategoryName, "character general category", data.lookupGeneralCategoryAlias(generalCategoryName, isCaseInsensitive()));
180+
return returnOrThrow(generalCategoryName, "character general category", data.lookupGeneralCategoryAlias(generalCategoryName, nameMatchingMode));
173181
}
174182

175183
private String normalizeScriptName(String scriptName) {
176-
return returnOrThrow(scriptName, "script name", data.lookupScriptAlias(scriptName, isCaseInsensitive()));
184+
return returnOrThrow(scriptName, "script name", data.lookupScriptAlias(scriptName, nameMatchingMode));
177185
}
178186

179187
private String normalizeBlockName(String blockName) {
180188
if (!withBlocks()) {
181189
throw new IllegalArgumentException("Unsupported Unicode character property escape");
182190
}
183-
return returnOrThrow(blockName, "block name", data.lookupBlockAlias(blockName, isCaseInsensitive()));
191+
return returnOrThrow(blockName, "block name", data.lookupBlockAlias(blockName, nameMatchingMode));
184192
}
185193

186194
public boolean isSupportedProperty(String propertyName) {
187-
return data.lookupPropertyAlias(propertyName, isCaseInsensitive()) != null;
195+
return data.lookupPropertyAlias(propertyName, nameMatchingMode) != null;
188196
}
189197

190198
public boolean isSupportedGeneralCategory(String generalCategoryName) {
191-
return data.lookupGeneralCategoryAlias(generalCategoryName, isCaseInsensitive()) != null;
199+
return data.lookupGeneralCategoryAlias(generalCategoryName, nameMatchingMode) != null;
192200
}
193201

194202
public boolean isSupportedScript(String scriptName) {
195-
return data.lookupScriptAlias(scriptName, isCaseInsensitive()) != null;
203+
return data.lookupScriptAlias(scriptName, nameMatchingMode) != null;
196204
}
197205

198206
public boolean isSupportedBlock(String blockName) {
199207
assert withBlocks();
200-
return data.lookupBlockAlias(blockName, isCaseInsensitive()) != null;
208+
return data.lookupBlockAlias(blockName, nameMatchingMode) != null;
201209
}
202210

203211
private static String returnOrThrow(String propertyName, String errorName, String name) {

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/UnicodePropertyData.java

Lines changed: 55 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,41 @@
4949
import org.graalvm.collections.EconomicSet;
5050
import org.graalvm.collections.MapCursor;
5151

52+
import com.oracle.truffle.regex.charset.UnicodeProperties.NameMatchingMode;
5253
import com.oracle.truffle.regex.charset.UnicodePropertyDataDiff.CodePointSetDiff;
5354

5455
public class UnicodePropertyData {
5556

57+
private record Aliases(
58+
EconomicMap<String, String> prop,
59+
EconomicMap<String, String> gc,
60+
EconomicMap<String, String> sc,
61+
EconomicMap<String, String> blk) {
62+
63+
private Aliases transform(NameMatchingMode nameMatchingMode) {
64+
return new Aliases(
65+
transformMap(prop, nameMatchingMode),
66+
transformMap(gc, nameMatchingMode),
67+
transformMap(sc, nameMatchingMode),
68+
transformMap(blk, nameMatchingMode));
69+
}
70+
71+
private static EconomicMap<String, String> transformMap(EconomicMap<String, String> source, NameMatchingMode nameMatchingMode) {
72+
EconomicMap<String, String> target = EconomicMap.create(source.size());
73+
MapCursor<String, String> cursor = source.getEntries();
74+
while (cursor.advance()) {
75+
String transformedKey = nameMatchingMode.normalize(cursor.getKey());
76+
assert !target.containsKey(transformedKey) || target.get(transformedKey).equals(cursor.getValue());
77+
target.put(transformedKey, cursor.getValue());
78+
}
79+
return target;
80+
}
81+
}
82+
5683
private final EconomicMap<String, CodePointSet> properties;
5784
protected final EconomicMap<String, ClassSetContents> emoji;
58-
protected final EconomicMap<String, String> propAliases;
59-
protected final EconomicMap<String, String> gcAliases;
60-
protected final EconomicMap<String, String> scAliases;
61-
protected final EconomicMap<String, String> blkAliases;
6285
private ClassSetContents rgiEmoji;
63-
private EconomicMap<String, String> propAliasesCaseInsensitive;
64-
private EconomicMap<String, String> gcAliasesCaseInsensitive;
65-
private EconomicMap<String, String> scAliasesCaseInsensitive;
66-
private EconomicMap<String, String> blkAliasesCaseInsensitive;
86+
private final Aliases[] aliases = new Aliases[NameMatchingMode.values().length];
6787

6888
UnicodePropertyData(
6989
EconomicMap<String, CodePointSet> properties,
@@ -74,10 +94,7 @@ public class UnicodePropertyData {
7494
EconomicMap<String, String> blkAliases) {
7595
this.properties = properties;
7696
this.emoji = emoji;
77-
this.propAliases = propAliases;
78-
this.gcAliases = gcAliases;
79-
this.scAliases = scAliases;
80-
this.blkAliases = blkAliases;
97+
aliases[NameMatchingMode.exact.ordinal()] = new Aliases(propAliases, gcAliases, scAliases, blkAliases);
8198
}
8299

83100
CodePointSet retrieveProperty(String propertySpec) {
@@ -118,64 +135,50 @@ protected ClassSetContents getRGIEmoji() {
118135
return rgiEmoji;
119136
}
120137

121-
String lookupPropertyAlias(String alias, boolean caseInsensitive) {
122-
String name = propAliases.get(alias);
123-
if (name == null && caseInsensitive) {
124-
return lookupPropertyAliasCaseInsensitive(alias);
125-
}
126-
return name;
138+
Aliases getExactAliases() {
139+
return aliases[NameMatchingMode.exact.ordinal()];
127140
}
128141

129-
private String lookupPropertyAliasCaseInsensitive(String alias) {
130-
if (propAliasesCaseInsensitive == null) {
131-
propAliasesCaseInsensitive = createCaseInsensitiveMap(propAliases);
142+
Aliases getAliases(NameMatchingMode nameMatchingMode) {
143+
Aliases lookup = aliases[nameMatchingMode.ordinal()];
144+
if (lookup == null) {
145+
assert nameMatchingMode != NameMatchingMode.exact;
146+
lookup = getExactAliases().transform(nameMatchingMode);
147+
aliases[nameMatchingMode.ordinal()] = lookup;
132148
}
133-
return propAliasesCaseInsensitive.get(alias.toLowerCase());
149+
return lookup;
134150
}
135151

136-
String lookupGeneralCategoryAlias(String alias, boolean caseInsensitive) {
137-
String name = gcAliases.get(alias);
138-
if (name == null && caseInsensitive) {
139-
return lookupGcAliasCaseInsensitive(alias);
152+
String lookupPropertyAlias(String alias, NameMatchingMode nameMatchingMode) {
153+
String name = getExactAliases().prop.get(alias);
154+
if (name == null && nameMatchingMode != NameMatchingMode.exact) {
155+
return getAliases(nameMatchingMode).prop.get(nameMatchingMode.normalize(alias));
140156
}
141157
return name;
142158
}
143159

144-
private String lookupGcAliasCaseInsensitive(String alias) {
145-
if (gcAliasesCaseInsensitive == null) {
146-
gcAliasesCaseInsensitive = createCaseInsensitiveMap(gcAliases);
147-
}
148-
return gcAliasesCaseInsensitive.get(alias.toLowerCase());
149-
}
150-
151-
String lookupScriptAlias(String alias, boolean caseInsensitive) {
152-
String name = scAliases.get(alias);
153-
if (name == null && caseInsensitive) {
154-
return lookupScAliasCaseInsensitive(alias);
160+
String lookupGeneralCategoryAlias(String alias, NameMatchingMode nameMatchingMode) {
161+
String name = getExactAliases().gc.get(alias);
162+
if (name == null && nameMatchingMode != NameMatchingMode.exact) {
163+
return getAliases(nameMatchingMode).gc.get(nameMatchingMode.normalize(alias));
155164
}
156165
return name;
157166
}
158167

159-
private String lookupScAliasCaseInsensitive(String alias) {
160-
if (scAliasesCaseInsensitive == null) {
161-
scAliasesCaseInsensitive = createCaseInsensitiveMap(scAliases);
162-
}
163-
return scAliasesCaseInsensitive.get(alias.toLowerCase());
164-
}
165-
166-
String lookupBlockAlias(String alias, boolean caseInsensitive) {
167-
String name = blkAliases.get(alias);
168-
if (name == null && caseInsensitive) {
169-
return lookupBlkAliasCaseInsensitive(alias);
168+
String lookupScriptAlias(String alias, NameMatchingMode nameMatchingMode) {
169+
String name = getExactAliases().sc.get(alias);
170+
if (name == null && nameMatchingMode != NameMatchingMode.exact) {
171+
return getAliases(nameMatchingMode).sc.get(nameMatchingMode.normalize(alias));
170172
}
171173
return name;
172174
}
173175

174-
private String lookupBlkAliasCaseInsensitive(String alias) {
175-
if (blkAliasesCaseInsensitive == null) {
176-
blkAliasesCaseInsensitive = createCaseInsensitiveMap(blkAliases);
176+
String lookupBlockAlias(String alias, NameMatchingMode nameMatchingMode) {
177+
String name = getExactAliases().blk.get(alias);
178+
if (name == null && nameMatchingMode != NameMatchingMode.exact) {
179+
return getAliases(nameMatchingMode).blk.get(nameMatchingMode.normalize(alias));
177180
}
178-
return blkAliasesCaseInsensitive.get(alias.toLowerCase());
181+
return name;
179182
}
180183

181184
private static EconomicSet<String> stringSet(String... strings) {
@@ -186,17 +189,6 @@ private static EconomicSet<String> stringSet(String... strings) {
186189
return set;
187190
}
188191

189-
private static EconomicMap<String, String> createCaseInsensitiveMap(EconomicMap<String, String> source) {
190-
EconomicMap<String, String> target = EconomicMap.create(source.size());
191-
MapCursor<String, String> cursor = source.getEntries();
192-
while (cursor.advance()) {
193-
String lowerCaseKey = cursor.getKey().toLowerCase();
194-
assert !target.containsKey(lowerCaseKey);
195-
target.put(lowerCaseKey, cursor.getValue());
196-
}
197-
return target;
198-
}
199-
200192
/* GENERATED CODE BEGIN - KEEP THIS MARKER FOR AUTOMATIC UPDATES */
201193

202194
private static final EconomicMap<String, String> PROPERTY_ALIASES = EconomicMap.create(124);

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/charset/UnicodePropertyDataDiff.java

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242

4343
import org.graalvm.collections.EconomicMap;
4444

45+
import com.oracle.truffle.regex.charset.UnicodeProperties.NameMatchingMode;
46+
4547
final class UnicodePropertyDataDiff extends UnicodePropertyData {
4648

4749
static final class CodePointSetDiff {
@@ -115,37 +117,37 @@ ClassSetContents retrievePropertyOfStrings(String propertySpec) {
115117
}
116118

117119
@Override
118-
String lookupPropertyAlias(String alias, boolean caseInsensitive) {
119-
String name = super.lookupPropertyAlias(alias, caseInsensitive);
120+
String lookupPropertyAlias(String alias, NameMatchingMode nameMatchingMode) {
121+
String name = super.lookupPropertyAlias(alias, nameMatchingMode);
120122
if (name == null) {
121-
return parent.lookupPropertyAlias(alias, caseInsensitive);
123+
return parent.lookupPropertyAlias(alias, nameMatchingMode);
122124
}
123125
return name;
124126
}
125127

126128
@Override
127-
String lookupGeneralCategoryAlias(String alias, boolean caseInsensitive) {
128-
String name = super.lookupGeneralCategoryAlias(alias, caseInsensitive);
129+
String lookupGeneralCategoryAlias(String alias, NameMatchingMode nameMatchingMode) {
130+
String name = super.lookupGeneralCategoryAlias(alias, nameMatchingMode);
129131
if (name == null) {
130-
return parent.lookupGeneralCategoryAlias(alias, caseInsensitive);
132+
return parent.lookupGeneralCategoryAlias(alias, nameMatchingMode);
131133
}
132134
return name;
133135
}
134136

135137
@Override
136-
String lookupScriptAlias(String alias, boolean caseInsensitive) {
137-
String name = super.lookupScriptAlias(alias, caseInsensitive);
138+
String lookupScriptAlias(String alias, NameMatchingMode nameMatchingMode) {
139+
String name = super.lookupScriptAlias(alias, nameMatchingMode);
138140
if (name == null) {
139-
return parent.lookupScriptAlias(alias, caseInsensitive);
141+
return parent.lookupScriptAlias(alias, nameMatchingMode);
140142
}
141143
return name;
142144
}
143145

144146
@Override
145-
String lookupBlockAlias(String alias, boolean caseInsensitive) {
146-
String name = super.lookupBlockAlias(alias, caseInsensitive);
147+
String lookupBlockAlias(String alias, NameMatchingMode nameMatchingMode) {
148+
String name = super.lookupBlockAlias(alias, nameMatchingMode);
147149
if (name == null) {
148-
return parent.lookupBlockAlias(alias, caseInsensitive);
150+
return parent.lookupBlockAlias(alias, nameMatchingMode);
149151
}
150152
return name;
151153
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/ECMAScriptFlavor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
public final class ECMAScriptFlavor extends RegexFlavor {
5757

5858
public static final ECMAScriptFlavor INSTANCE = new ECMAScriptFlavor();
59-
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_16_0_0, 0);
59+
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_16_0_0, 0, UnicodeProperties.NameMatchingMode.exact);
6060

6161
private ECMAScriptFlavor() {
6262
super(0);

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/PythonFlavor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
public final class PythonFlavor extends RegexFlavor {
6464

6565
public static final PythonFlavor INSTANCE = new PythonFlavor();
66-
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_15_1_0, 0);
66+
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_15_1_0, 0, UnicodeProperties.NameMatchingMode.exact);
6767

6868
private PythonFlavor() {
6969
super(BACKREFERENCES_TO_UNMATCHED_GROUPS_FAIL | NESTED_CAPTURE_GROUPS_KEPT_ON_LOOP_REENTRY | FAILING_EMPTY_CHECKS_DONT_BACKTRACK | USES_LAST_GROUP_RESULT_FIELD |

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@
224224
public final class RubyFlavor extends RegexFlavor {
225225

226226
public static final RubyFlavor INSTANCE = new RubyFlavor();
227-
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_15_1_0, UnicodeProperties.CASE_INSENSITIVE);
227+
public static final UnicodeProperties UNICODE = new UnicodeProperties(UnicodePropertyDataVersion.UNICODE_15_1_0, 0, UnicodeProperties.NameMatchingMode.ruby);
228228

229229
private RubyFlavor() {
230230
super(BACKREFERENCES_TO_UNMATCHED_GROUPS_FAIL | EMPTY_CHECKS_MONITOR_CAPTURE_GROUPS | NESTED_CAPTURE_GROUPS_KEPT_ON_LOOP_REENTRY | FAILING_EMPTY_CHECKS_DONT_BACKTRACK |

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/java/JavaUnicodeProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ static JavaUnicodeProperties create(RegexOptions options) {
242242
if (cached != null) {
243243
return cached;
244244
}
245-
UnicodeProperties unicode = new UnicodeProperties(unicodePropertyData, UnicodeProperties.CASE_INSENSITIVE | UnicodeProperties.BLOCKS | UnicodeProperties.OTHER_PROPERTIES);
245+
UnicodeProperties unicode = new UnicodeProperties(unicodePropertyData, UnicodeProperties.BLOCKS | UnicodeProperties.OTHER_PROPERTIES, UnicodeProperties.NameMatchingMode.ignoreCase);
246246
JavaUnicodeProperties ret = new JavaUnicodeProperties(unicode);
247247
CACHE[cacheIndex] = ret;
248248
return ret;

0 commit comments

Comments
 (0)