@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
199
199
/**
200
200
* Holds if `a` and `b` are input symbols from the same regexp.
201
201
*/
202
- private predicate sharesRoot ( TInputSymbol a , TInputSymbol b ) {
202
+ private predicate sharesRoot ( InputSymbol a , InputSymbol b ) {
203
203
exists ( RegExpRoot root |
204
204
belongsTo ( a , root ) and
205
205
belongsTo ( b , root )
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
209
209
/**
210
210
* Holds if the `a` is an input symbol from a regexp that has root `root`.
211
211
*/
212
- private predicate belongsTo ( TInputSymbol a , RegExpRoot root ) {
212
+ private predicate belongsTo ( InputSymbol a , RegExpRoot root ) {
213
213
exists ( State s | getRoot ( s .getRepr ( ) ) = root |
214
214
delta ( s , a , _)
215
215
or
@@ -378,6 +378,13 @@ private module CharacterClasses {
378
378
)
379
379
}
380
380
381
+ bindingset [ char, cc]
382
+ private string caseNormalize ( string char , RegExpTerm cc ) {
383
+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
384
+ then result = char .toLowerCase ( )
385
+ else result = char
386
+ }
387
+
381
388
/**
382
389
* An implementation of `CharacterClass` for positive (non inverted) character classes.
383
390
*/
@@ -386,7 +393,7 @@ private module CharacterClasses {
386
393
387
394
PositiveCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and not cc .isInverted ( ) }
388
395
389
- override string getARelevantChar ( ) { result = getAMentionedChar ( cc ) }
396
+ override string getARelevantChar ( ) { result = caseNormalize ( getAMentionedChar ( cc ) , cc ) }
390
397
391
398
override predicate matches ( string char ) { hasChildThatMatches ( cc , char ) }
392
399
}
@@ -400,8 +407,8 @@ private module CharacterClasses {
400
407
InvertedCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and cc .isInverted ( ) }
401
408
402
409
override string getARelevantChar ( ) {
403
- result = nextChar ( getAMentionedChar ( cc ) ) or
404
- nextChar ( result ) = getAMentionedChar ( cc )
410
+ result = nextChar ( caseNormalize ( getAMentionedChar ( cc ) , cc ) ) or
411
+ nextChar ( result ) = caseNormalize ( getAMentionedChar ( cc ) , cc )
405
412
}
406
413
407
414
bindingset [ char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
428
435
*/
429
436
private class PositiveCharacterClassEscape extends CharacterClass {
430
437
string charClass ;
438
+ RegExpTerm cc ;
431
439
432
440
PositiveCharacterClassEscape ( ) {
433
- exists ( RegExpTerm cc |
434
- isEscapeClass ( cc , charClass ) and
435
- this = getCanonicalCharClass ( cc ) and
436
- charClass = [ "d" , "s" , "w" ]
437
- )
441
+ isEscapeClass ( cc , charClass ) and
442
+ this = getCanonicalCharClass ( cc ) and
443
+ charClass = [ "d" , "s" , "w" ]
438
444
}
439
445
440
446
override string getARelevantChar ( ) {
@@ -445,7 +451,9 @@ private module CharacterClasses {
445
451
result = " "
446
452
or
447
453
charClass = "w" and
448
- result = [ "a" , "Z" , "_" , "0" , "9" ]
454
+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
455
+ then result = [ "a" , "z" , "_" , "0" , "9" ]
456
+ else result = [ "a" , "Z" , "_" , "0" , "9" ]
449
457
}
450
458
451
459
override predicate matches ( string char ) { classEscapeMatches ( charClass , char ) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
492
500
not classEscapeMatches ( charClass .toLowerCase ( ) , char )
493
501
}
494
502
}
503
+
504
+ /** Gets a representative for all char classes that match the same chars as `c`. */
505
+ CharacterClass normalize ( CharacterClass c ) {
506
+ exists ( string normalization |
507
+ normalization = getMormalizationString ( c ) and
508
+ result =
509
+ min ( CharacterClass cc , string raw |
510
+ getMormalizationString ( cc ) = normalization and cc = CharClass ( raw )
511
+ |
512
+ cc order by raw
513
+ )
514
+ )
515
+ }
516
+
517
+ /** Gets a string representing all the chars matched by `c` */
518
+ private string getMormalizationString ( CharacterClass c ) {
519
+ ( c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape ) and
520
+ result = concat ( string char | c .matches ( char ) and char = CharacterClasses:: getARelevantChar ( ) )
521
+ or
522
+ ( c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape ) and
523
+ // the string produced by the concat can not contain repeated chars
524
+ // so by starting the below with "nn" we can guarantee that
525
+ // it will not overlap with the above case.
526
+ // and a negative char class can never match the same chars as a positive one, so we don't miss any results from this.
527
+ result =
528
+ "nn:" +
529
+ concat ( string char | not c .matches ( char ) and char = CharacterClasses:: getARelevantChar ( ) )
530
+ }
495
531
}
496
532
497
533
private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
620
656
cc .isUniversalClass ( ) and q1 = before ( cc ) and lbl = Any ( ) and q2 = after ( cc )
621
657
or
622
658
q1 = before ( cc ) and
623
- lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) and
659
+ lbl =
660
+ CharacterClasses:: normalize ( CharClass ( cc .getRawValue ( ) + "|" +
661
+ getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) ) and
624
662
q2 = after ( cc )
625
663
)
626
664
or
627
665
exists ( RegExpTerm cc | isEscapeClass ( cc , _) |
628
666
q1 = before ( cc ) and
629
- lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) and
667
+ lbl =
668
+ CharacterClasses:: normalize ( CharClass ( cc .getRawValue ( ) + "|" +
669
+ getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) ) and
630
670
q2 = after ( cc )
631
671
)
632
672
or
0 commit comments