@@ -121,8 +121,57 @@ deprecated string mode_from_mode_object(Value obj) {
121
121
abstract class RegexString extends Expr {
122
122
RegexString ( ) { ( this instanceof Bytes or this instanceof Unicode ) }
123
123
124
+ override string toString ( ) { // Gets the text of this regex string, read from whichever of `Bytes` or `Unicode` this expression is.
125
+ result = this .( Bytes ) .getText ( )
126
+ or
127
+ result = this .( Unicode ) .getText ( )
128
+ }
129
+
130
+ /** Gets `true` if the unescaped `[` at `pos` actually marks the start of a char set, and `false` if it does not. Has no result when `pos` is not the position of an unescaped `[`. */
131
+ boolean char_set_start ( int pos ) {
132
+ exists ( int index |
133
+ char_set_delimiter ( index , pos ) = true and // `pos` holds the `index`th unescaped bracket, and that bracket is a '['
134
+ (
135
+ index = 1 and result = true // if a '[' is first in the string (among brackets), it starts a char set
136
+ or
137
+ index > 1 and
138
+ not char_set_delimiter ( index - 1 , _) = false and
139
+ result = false // the previous bracket is not a ']', so this '[' does not start a char set
140
+ or
141
+ exists ( int p1 |
142
+ char_set_delimiter ( index - 1 , p1 ) = false and // if it is preceded by a closing bracket, it starts a char set
143
+ if
144
+ exists ( int p2 |
145
+ p1 = p2 + 1 // the ']' at p1 comes directly after the bracket at p2
146
+ or
147
+ this .getChar ( p2 + 1 ) = "^" and
148
+ p1 = p2 + 2 // or directly after a '^' that follows the bracket at p2
149
+ |
150
+ char_set_delimiter ( index - 2 , p2 ) = true // but the closing bracket only closes...
151
+ )
152
+ then
153
+ exists ( int p2 | char_set_delimiter ( index - 2 , p2 ) = true |
154
+ result = char_set_start ( p2 ) .booleanNot ( ) // ...if it is not the first in a char set
155
+ )
156
+ else result = true // the ']' does not sit right after a '[' (or '[^'), so this '[' starts a char set
157
+ )
158
+ )
159
+ )
160
+ }
161
+
162
+ /** Gets `true` if the `index`th unescaped bracket of this regex (ranked by position), found at `pos`, is a `[`, and `false` if it is a `]`. */
163
+ boolean char_set_delimiter ( int index , int pos ) {
164
+ pos = rank [ index ] ( int p | this .nonEscapedCharAt ( p ) = "[" or this .nonEscapedCharAt ( p ) = "]" ) and // rank every unescaped '[' and ']' by its position
165
+ (
166
+ this .nonEscapedCharAt ( pos ) = "[" and result = true
167
+ or
168
+ this .nonEscapedCharAt ( pos ) = "]" and result = false
169
+ )
170
+ }
171
+
172
+ /** Holds if a character set starts between `start` and `end`. */
124
173
predicate char_set_start ( int start , int end ) {
125
- this .nonEscapedCharAt ( start ) = "[" and
174
+ this .char_set_start ( start ) = true and
126
175
(
127
176
this .getChar ( start + 1 ) = "^" and end = start + 2
128
177
or
@@ -143,23 +192,80 @@ abstract class RegexString extends Expr {
143
192
)
144
193
}
145
194
195
+ /** An indexed version of `char_set_token/3`: holds if the `index`th token (ranked by start position) of the char set at `charset_start` is found between `token_start` and `token_end`. */
196
+ private predicate char_set_token ( int charset_start , int index , int token_start , int token_end ) {
197
+ token_start =
198
+ rank [ index ] ( int start , int end | this .char_set_token ( charset_start , start , end ) | start ) and // order the tokens of this char set by where they start
199
+ this .char_set_token ( charset_start , token_start , token_end ) // recover the end that belongs to that start
200
+ }
201
+
202
+ /** Holds if `start` to `end` is one token of the char set at `charset_start`: either an escape sequence or a single unescaped character (either a char or a -). */
203
+ private predicate char_set_token ( int charset_start , int start , int end ) {
204
+ this .char_set_start ( charset_start , start ) and // first token: begins where the span found by `char_set_start/2` ends
205
+ (
206
+ this .escapedCharacter ( start , end )
207
+ or
208
+ exists ( this .nonEscapedCharAt ( start ) ) and end = start + 1 // any unescaped character is accepted here, including ']'
209
+ )
210
+ or
211
+ this .char_set_token ( charset_start , _, start ) and // later tokens: begin where a previous token of the same char set ended
212
+ (
213
+ this .escapedCharacter ( start , end )
214
+ or
215
+ exists ( this .nonEscapedCharAt ( start ) ) and
216
+ end = start + 1 and
217
+ not this .getChar ( start ) = "]" // after the first token, an unescaped ']' is not a token, so the chain stops there
218
+ )
219
+ }
220
+
221
+ /**
222
+ * Holds if the character set starting at `charset_start` contains either
223
+ * a character or a range found between `start` and `end`.
224
+ */
225
+ predicate char_set_child ( int charset_start , int start , int end ) {
226
+ this .char_set_token ( charset_start , start , end ) and // a single token of this char set...
227
+ not exists ( int range_start , int range_end | // ...that does not lie within any range of the same char set
228
+ this .charRange ( charset_start , range_start , _, _, range_end ) and
229
+ range_start <= start and
230
+ range_end >= end
231
+ )
232
+ or
233
+ this .charRange ( charset_start , start , _, _, end ) // a whole range, from the start of its lower bound to the end of its upper bound
234
+ }
235
+
146
236
/**
147
237
* Holds if the character set starting at `charset_start` contains a character range
148
238
* with lower bound found between `start` and `lower_end`
149
239
* and upper bound found between `upper_start` and `end`.
150
240
*/
151
241
predicate charRange ( int charset_start , int start , int lower_end , int upper_start , int end ) {
152
- // mirror logic from `simpleCharacter`
153
- exists ( int x , int y |
154
- this .charSet ( charset_start , y ) and
155
- this .char_set_start ( charset_start , x )
156
- |
157
- x <= start and
158
- this .simpleCharacter ( start , lower_end ) and
159
- this .nonEscapedCharAt ( lower_end ) = "-" and
160
- lower_end + 1 = upper_start and
161
- this .simpleCharacter ( upper_start , end ) and
162
- end < y
242
+ exists ( int index |
243
+ this .charRangeEnd ( charset_start , index ) = true and
244
+ this .char_set_token ( charset_start , index - 2 , start , lower_end ) and
245
+ this .char_set_token ( charset_start , index , upper_start , end )
246
+ )
247
+ }
248
+
249
+ private boolean charRangeEnd ( int charset_start , int index ) { // Gets `true` if the `index`th token of the char set at `charset_start` is the upper bound of a range (as used by `charRange`), and `false` otherwise.
250
+ this .char_set_token ( charset_start , index , _, _) and // only defined for indices that name an existing token
251
+ (
252
+ index in [ 1 , 2 ] and result = false // a range needs a lower bound and a '-' before its upper bound, so the first two tokens cannot end one
253
+ or
254
+ index > 2 and
255
+ exists ( int connector_start |
256
+ this .char_set_token ( charset_start , index - 1 , connector_start , _) and
257
+ this .nonEscapedCharAt ( connector_start ) = "-" and // the token just before this one is an unescaped '-'
258
+ result =
259
+ this .charRangeEnd ( charset_start , index - 2 )
260
+ .booleanNot ( )
261
+ .booleanAnd ( this .charRangeEnd ( charset_start , index - 1 ) .booleanNot ( ) ) // true only if neither of the two preceding tokens is itself the end of a range
262
+ )
263
+ or
264
+ not exists ( int connector_start |
265
+ this .char_set_token ( charset_start , index - 1 , connector_start , _) and
266
+ this .nonEscapedCharAt ( connector_start ) = "-"
267
+ ) and
268
+ result = false // no unescaped '-' directly before this token, so it cannot end a range
163
269
)
164
270
}
165
271
@@ -184,14 +290,14 @@ abstract class RegexString extends Expr {
184
290
185
291
string nonEscapedCharAt ( int i ) {
186
292
result = this .getText ( ) .charAt ( i ) and
187
- not this .escapingChar ( i - 1 )
293
+ not exists ( int x , int y | this .escapedCharacter ( x , y ) and i in [ x .. y - 1 ] )
188
294
}
189
295
190
296
private predicate isOptionDivider ( int i ) { this .nonEscapedCharAt ( i ) = "|" }
191
297
192
- private predicate isGroupEnd ( int i ) { this .nonEscapedCharAt ( i ) = ")" }
298
+ private predicate isGroupEnd ( int i ) { this .nonEscapedCharAt ( i ) = ")" and not this . inCharSet ( i ) }
193
299
194
- private predicate isGroupStart ( int i ) { this .nonEscapedCharAt ( i ) = "(" }
300
+ private predicate isGroupStart ( int i ) { this .nonEscapedCharAt ( i ) = "(" and not this . inCharSet ( i ) }
195
301
196
302
predicate failedToParse ( int i ) {
197
303
exists ( this .getChar ( i ) ) and
@@ -219,14 +325,18 @@ abstract class RegexString extends Expr {
219
325
*/
220
326
predicate escapedCharacter ( int start , int end ) {
221
327
this .escapingChar ( start ) and
222
- not exists ( this .getText ( ) . substring ( start + 1 , end + 1 ) . toInt ( ) ) and
328
+ not this .numbered_backreference ( start , _ , _ ) and
223
329
(
224
330
// hex value \xhh
225
331
this .getChar ( start + 1 ) = "x" and end = start + 4
226
332
or
227
333
// octal value \ooo
228
334
end in [ start + 2 .. start + 4 ] and
229
- exists ( this .getText ( ) .substring ( start + 1 , end ) .toInt ( ) )
335
+ this .getText ( ) .substring ( start + 1 , end ) .toInt ( ) >= 0 and
336
+ not (
337
+ end < start + 4 and
338
+ exists ( this .getText ( ) .substring ( start + 1 , end + 1 ) .toInt ( ) )
339
+ )
230
340
or
231
341
// 16-bit hex value \uhhhh
232
342
this .getChar ( start + 1 ) = "u" and end = start + 6
@@ -238,11 +348,13 @@ abstract class RegexString extends Expr {
238
348
or
239
349
// escape not handled above, update when adding a new case
240
350
not this .getChar ( start + 1 ) in [ "x" , "u" , "U" , "N" ] and
351
+ not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
241
352
end = start + 2
242
353
)
243
354
}
244
355
245
- private predicate inCharSet ( int index ) {
356
+ /** Holds if `index` is inside a character set. */
357
+ predicate inCharSet ( int index ) {
246
358
exists ( int x , int y | this .charSet ( x , y ) and index in [ x + 1 .. y - 2 ] )
247
359
}
248
360
@@ -262,7 +374,7 @@ abstract class RegexString extends Expr {
262
374
or
263
375
start = z - 2
264
376
or
265
- start > y and start < z - 2 and not c = "-"
377
+ start > y and start < z - 2 and not this . charRange ( _ , _ , start , end , _ )
266
378
)
267
379
or
268
380
not this .inCharSet ( start ) and
@@ -281,7 +393,8 @@ abstract class RegexString extends Expr {
281
393
or
282
394
this .escapedCharacter ( start , end )
283
395
) and
284
- not exists ( int x , int y | this .group_start ( x , y ) and x <= start and y >= end )
396
+ not exists ( int x , int y | this .group_start ( x , y ) and x <= start and y >= end ) and
397
+ not exists ( int x , int y | this .backreference ( x , y ) and x <= start and y >= end )
285
398
}
286
399
287
400
predicate normalCharacter ( int start , int end ) {
@@ -326,12 +439,13 @@ abstract class RegexString extends Expr {
326
439
or
327
440
this .negativeAssertionGroup ( start , end )
328
441
or
329
- positiveLookaheadAssertionGroup ( start , end )
442
+ this . positiveLookaheadAssertionGroup ( start , end )
330
443
or
331
444
this .positiveLookbehindAssertionGroup ( start , end )
332
445
}
333
446
334
- private predicate emptyGroup ( int start , int end ) {
447
+ /** Holds if an empty group is found between `start` and `end`. */
448
+ predicate emptyGroup ( int start , int end ) {
335
449
exists ( int endm1 | end = endm1 + 1 |
336
450
this .group_start ( start , endm1 ) and
337
451
this .isGroupEnd ( endm1 )
@@ -364,13 +478,29 @@ abstract class RegexString extends Expr {
364
478
)
365
479
}
366
480
367
- private predicate positiveLookaheadAssertionGroup ( int start , int end ) {
481
+ /** Holds if a negative lookahead assertion group is found between `start` and `end`. */
482
+ predicate negativeLookaheadAssertionGroup ( int start , int end ) {
483
+ exists ( int in_start | this .negative_lookahead_assertion_start ( start , in_start ) | // NOTE(review): `in_start` is presumably where the group's contents begin; confirm against `negative_lookahead_assertion_start`
484
+ this .groupContents ( start , end , in_start , _)
485
+ )
486
+ }
487
+
488
+ /** Holds if a negative lookbehind assertion group is found between `start` and `end`. */
489
+ predicate negativeLookbehindAssertionGroup ( int start , int end ) {
490
+ exists ( int in_start | this .negative_lookbehind_assertion_start ( start , in_start ) | // NOTE(review): `in_start` is presumably where the group's contents begin; confirm against `negative_lookbehind_assertion_start`
491
+ this .groupContents ( start , end , in_start , _)
492
+ )
493
+ }
494
+
495
+ /** Holds if a positive lookahead is found between `start` and `end` */
496
+ predicate positiveLookaheadAssertionGroup ( int start , int end ) {
368
497
exists ( int in_start | this .lookahead_assertion_start ( start , in_start ) |
369
498
this .groupContents ( start , end , in_start , _)
370
499
)
371
500
}
372
501
373
- private predicate positiveLookbehindAssertionGroup ( int start , int end ) {
502
+ /** Holds if a positive lookbehind is found between `start` and `end` */
503
+ predicate positiveLookbehindAssertionGroup ( int start , int end ) {
374
504
exists ( int in_start | this .lookbehind_assertion_start ( start , in_start ) |
375
505
this .groupContents ( start , end , in_start , _)
376
506
)
@@ -429,6 +559,8 @@ abstract class RegexString extends Expr {
429
559
this .getChar ( start + 1 ) = "?" and
430
560
this .getChar ( start + 2 ) = "P" and
431
561
this .getChar ( start + 3 ) = "=" and
562
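+ // NOTE(review): the line below searches for "?", but a named backreference `(?P=name)` is closed by ")"; should this be looking for an unescaped ")" instead?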
563
+ // TODO: test this
432
564
end = min ( int i | i > start + 4 and this .getChar ( i ) = "?" )
433
565
}
434
566
@@ -519,6 +651,7 @@ abstract class RegexString extends Expr {
519
651
520
652
private predicate numbered_backreference ( int start , int end , int value ) {
521
653
this .escapingChar ( start ) and
654
+ not this .getChar ( start + 1 ) = "0" and
522
655
exists ( string text , string svalue , int len |
523
656
end = start + len and
524
657
text = this .getText ( ) and
@@ -527,7 +660,7 @@ abstract class RegexString extends Expr {
527
660
svalue = text .substring ( start + 1 , start + len ) and
528
661
value = svalue .toInt ( ) and
529
662
not exists ( text .substring ( start + 1 , start + len + 1 ) .toInt ( ) ) and
530
- value != 0
663
+ value > 0
531
664
)
532
665
}
533
666
@@ -551,6 +684,8 @@ abstract class RegexString extends Expr {
551
684
this .group ( start , end )
552
685
or
553
686
this .charSet ( start , end )
687
+ or
688
+ this .backreference ( start , end )
554
689
}
555
690
556
691
private predicate qualifier ( int start , int end , boolean maybe_empty ) {
0 commit comments