@@ -18,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
18
18
exists ( int code | code = ascii ( c ) | code + 1 = ascii ( result ) )
19
19
}
20
20
21
/**
 * Gets the `i`th (0-based) codepoint in `s`, if `s` has that many codepoints.
 *
 * The pattern is `(?s).` rather than `(.|\\s)`: without the DOTALL flag `.` skips
 * every line terminator, and `\\s` only adds back `\n` and `\r`, so U+0085, U+2028
 * and U+2029 would not be matched and all later codepoints would shift down an index.
 */
bindingset[s]
private string getCodepointAt(string s, int i) { result = s.regexpFind("(?s).", i, _) }

/**
 * Gets the length of `str` in codepoints.
 *
 * Uses the same pattern as `getCodepointAt`, so the result is `n` exactly when
 * `getCodepointAt(str, 0 .. n - 1)` all exist. The `m = 0` disjunct makes the
 * empty string yield 0 instead of having no result.
 */
bindingset[str]
private int getCodepointLength(string str) {
  result = max(int m | exists(str.regexpFind("(?s).", m - 1, _)) or m = 0)
}
34
+
21
35
/**
22
36
* Gets an approximation for the ASCII code for `char`.
23
37
* Only the easily printable chars are included (so no newline, tab, null, etc).
@@ -190,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
190
204
/** An input symbol corresponding to character `c`. */
191
205
Char ( string c ) {
192
206
c =
193
- any ( RegexpCharacterConstant cc |
194
- cc instanceof RelevantRegExpTerm and
195
- not isIgnoreCase ( cc .getRootTerm ( ) )
196
- ) .getValue ( ) . charAt ( _)
207
+ getCodepointAt ( any ( RegexpCharacterConstant cc |
208
+ cc instanceof RelevantRegExpTerm and
209
+ not isIgnoreCase ( cc .getRootTerm ( ) )
210
+ ) .getValue ( ) , _)
197
211
or
198
212
// normalize everything to lower case if the regexp is case insensitive
199
213
c =
200
214
any ( RegexpCharacterConstant cc , string char |
201
215
cc instanceof RelevantRegExpTerm and
202
216
isIgnoreCase ( cc .getRootTerm ( ) ) and
203
- char = cc .getValue ( ) . charAt ( _)
217
+ char = getCodepointAt ( cc .getValue ( ) , _)
204
218
|
205
219
char .toLowerCase ( )
206
220
)
@@ -396,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
396
410
string getARelevantChar() {
  // any char for which `ascii` yields a code
  exists(ascii(result))
  or
  // any codepoint occurring in the value of some character constant
  result = getCodepointAt(any(RegexpCharacterConstant cc).getValue(), _)
  or
  // any char that `classEscapeMatches` relates to some class escape
  classEscapeMatches(_, result)
}
@@ -702,16 +716,16 @@ module Make<RegexTreeViewSig TreeImpl> {
702
716
q1 = Match ( s , i ) and
703
717
(
704
718
not isIgnoreCase ( s .getRootTerm ( ) ) and
705
- lbl = Char ( s .getValue ( ) . charAt ( i ) )
719
+ lbl = Char ( getCodepointAt ( s .getValue ( ) , i ) )
706
720
or
707
721
// normalize everything to lower case if the regexp is case insensitive
708
722
isIgnoreCase ( s .getRootTerm ( ) ) and
709
- exists ( string c | c = s .getValue ( ) . charAt ( i ) | lbl = Char ( c .toLowerCase ( ) ) )
723
+ exists ( string c | c = getCodepointAt ( s .getValue ( ) , i ) | lbl = Char ( c .toLowerCase ( ) ) )
710
724
) and
711
725
(
712
726
q2 = Match ( s , i + 1 )
713
727
or
714
- s .getValue ( ) . length ( ) = i + 1 and
728
+ getCodepointLength ( s .getValue ( ) ) = i + 1 and
715
729
q2 = after ( s )
716
730
)
717
731
)
@@ -812,7 +826,7 @@ module Make<RegexTreeViewSig TreeImpl> {
812
826
Match ( RelevantRegExpTerm t , int i ) {
813
827
i = 0
814
828
or
815
- exists ( t .( RegexpCharacterConstant ) .getValue ( ) . charAt ( i ) )
829
+ exists ( getCodepointAt ( t .( RegexpCharacterConstant ) .getValue ( ) , i ) )
816
830
} or
817
831
/**
818
832
* An accept state, where exactly the given input string is accepted.
@@ -1105,7 +1119,9 @@ module Make<RegexTreeViewSig TreeImpl> {
1105
1119
*/
1106
1120
predicate reachesOnlyRejectableSuffixes(State fork, string w) {
  isReDoSCandidate(fork, w) and
  exists(int last | last = getCodepointLength(w) - 1 |
    // `forex`: some state is reached after the last codepoint of `w`,
    // and every such state is likely rejectable.
    forex(State reached | reached = process(fork, w, last) | isLikelyRejectable(reached))
  ) and
  // we stop `process(..)` early if we can, check here if it happened.
  not acceptsAnySuffix() = getProcessPrevious(fork, _, w)
}
1111
1127
@@ -1215,6 +1231,12 @@ module Make<RegexTreeViewSig TreeImpl> {
1215
1231
exists ( string char | char = [ "|" , "\n" , "Z" ] | not deltaClosedChar ( s , char , _) )
1216
1232
}
1217
1233
1234
/**
 * Gets the `i`th codepoint of `str`, where `str` is a string for which
 * `getProcessPrevious` holds.
 *
 * `process` can't use pragma[inline] predicates, so this is a materialized
 * version of `getCodepointAt`; the `getProcessPrevious` conjunct is what
 * bounds `str`.
 */
private string getCodePointAtForProcess(string str, int i) {
  exists(getProcessPrevious(_, _, str)) and
  result = getCodepointAt(str, i)
}
1239
+
1218
1240
/**
1219
1241
* Gets a state that can be reached from pumpable `fork` consuming all
1220
1242
* chars in `w` any number of times followed by the first `i+1` characters of `w`.
@@ -1224,7 +1246,7 @@ module Make<RegexTreeViewSig TreeImpl> {
1224
1246
exists ( State prev | prev = getProcessPrevious ( fork , i , w ) |
1225
1247
not prev = acceptsAnySuffix ( ) and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
1226
1248
exists ( string char , InputSymbol sym |
1227
- char = w . charAt ( i ) and
1249
+ char = getCodePointAtForProcess ( w , i ) and
1228
1250
deltaClosed ( prev , sym , result ) and
1229
1251
// noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
1230
1252
// Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
@@ -1246,7 +1268,7 @@ module Make<RegexTreeViewSig TreeImpl> {
1246
1268
or
1247
1269
// repeat until fixpoint
1248
1270
i = 0 and
1249
- result = process ( fork , w , w . length ( ) - 1 )
1271
+ result = process ( fork , w , getCodepointLength ( w ) - 1 )
1250
1272
)
1251
1273
}
1252
1274
@@ -1262,7 +1284,9 @@ module Make<RegexTreeViewSig TreeImpl> {
1262
1284
/**
 * Gets a codepoint that occurs in some `pump` string, i.e. in a string `s`
 * for which `isReDoSCandidate(_, s)` holds.
 */
private string getAProcessChar() {
  exists(string pump | isReDoSCandidate(_, pump) | result = getCodepointAt(pump, _))
}
1266
1290
}
1267
1291
1268
1292
/**
@@ -1317,7 +1341,8 @@ module Make<RegexTreeViewSig TreeImpl> {
1317
1341
*/
1318
1342
bindingset[s]
private string escapeUnicodeString(string s) {
  // Escape each codepoint of `s` on its own and join the pieces in index order.
  result =
    concat(int pos, string escaped |
      exists(string cp | cp = getCodepointAt(s, pos) | escaped = escapeUnicodeChar(cp))
    |
      escaped order by pos
    )
}
1322
1347
1323
1348
/**
@@ -1328,7 +1353,10 @@ module Make<RegexTreeViewSig TreeImpl> {
1328
1353
private string escapeUnicodeChar(string char) {
  if isPrintable(char)
  then result = char
  else
    // look up the numeric code of `char` once and pick the escape form from it
    exists(int code | code.toUnicode() = char |
      if exists(to4digitHex(code))
      then result = "\\u" + to4digitHex(code)
      else
        // no 4-digit form available: fall back to the braced `\u{...}` form
        result = "\\u{" + toHex(code) + "}"
    )
}
1333
1361
1334
1362
/** Holds if `char` is an easily printable char, or whitespace. */
0 commit comments