Skip to content

Commit 7fb3d81

Browse files
committed
add further normalization of char classses
1 parent 3be4a86 commit 7fb3d81

File tree

9 files changed

+415
-252
lines changed

9 files changed

+415
-252
lines changed

java/ql/lib/semmle/code/java/security/performance/ReDoSUtil.qll

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
199199
/**
200200
* Holds if `a` and `b` are input symbols from the same regexp.
201201
*/
202-
private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
202+
private predicate sharesRoot(InputSymbol a, InputSymbol b) {
203203
exists(RegExpRoot root |
204204
belongsTo(a, root) and
205205
belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
209209
/**
210210
* Holds if the `a` is an input symbol from a regexp that has root `root`.
211211
*/
212-
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
212+
private predicate belongsTo(InputSymbol a, RegExpRoot root) {
213213
exists(State s | getRoot(s.getRepr()) = root |
214214
delta(s, a, _)
215215
or
@@ -378,6 +378,13 @@ private module CharacterClasses {
378378
)
379379
}
380380

381+
bindingset[char, cc]
382+
private string caseNormalize(string char, RegExpTerm cc) {
383+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
384+
then result = char.toLowerCase()
385+
else result = char
386+
}
387+
381388
/**
382389
* An implementation of `CharacterClass` for positive (non inverted) character classes.
383390
*/
@@ -386,7 +393,7 @@ private module CharacterClasses {
386393

387394
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
388395

389-
override string getARelevantChar() { result = getAMentionedChar(cc) }
396+
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
390397

391398
override predicate matches(string char) { hasChildThatMatches(cc, char) }
392399
}
@@ -400,8 +407,8 @@ private module CharacterClasses {
400407
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
401408

402409
override string getARelevantChar() {
403-
result = nextChar(getAMentionedChar(cc)) or
404-
nextChar(result) = getAMentionedChar(cc)
410+
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
411+
nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
405412
}
406413

407414
bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
428435
*/
429436
private class PositiveCharacterClassEscape extends CharacterClass {
430437
string charClass;
438+
RegExpTerm cc;
431439

432440
PositiveCharacterClassEscape() {
433-
exists(RegExpTerm cc |
434-
isEscapeClass(cc, charClass) and
435-
this = getCanonicalCharClass(cc) and
436-
charClass = ["d", "s", "w"]
437-
)
441+
isEscapeClass(cc, charClass) and
442+
this = getCanonicalCharClass(cc) and
443+
charClass = ["d", "s", "w"]
438444
}
439445

440446
override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
445451
result = " "
446452
or
447453
charClass = "w" and
448-
result = ["a", "Z", "_", "0", "9"]
454+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
455+
then result = ["a", "z", "_", "0", "9"]
456+
else result = ["a", "Z", "_", "0", "9"]
449457
}
450458

451459
override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
492500
not classEscapeMatches(charClass.toLowerCase(), char)
493501
}
494502
}
503+
504+
/** Gets a representative for all char classes that match the same chars as `c`. */
505+
CharacterClass normalize(CharacterClass c) {
506+
exists(string normalization |
507+
normalization = getMormalizationString(c) and
508+
result =
509+
min(CharacterClass cc, string raw |
510+
getMormalizationString(cc) = normalization and cc = CharClass(raw)
511+
|
512+
cc order by raw
513+
)
514+
)
515+
}
516+
517+
/** Gets a string representing all the chars matched by `c` */
518+
private string getMormalizationString(CharacterClass c) {
519+
(c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
520+
result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
521+
or
522+
(c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
523+
// the string produced by the concat can not contain repeated chars
524+
// so by starting the below with "nn" we can guarantee that
525+
// it will not overlap with the above case.
526+
// and a negative char class can never match the same chars as a positive one, so we don't miss any results from this.
527+
result =
528+
"nn:" +
529+
concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
530+
}
495531
}
496532

497533
private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
620656
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
621657
or
622658
q1 = before(cc) and
623-
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
659+
lbl =
660+
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
661+
getCanonicalizationFlags(cc.getRootTerm()))) and
624662
q2 = after(cc)
625663
)
626664
or
627665
exists(RegExpTerm cc | isEscapeClass(cc, _) |
628666
q1 = before(cc) and
629-
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
667+
lbl =
668+
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
669+
getCanonicalizationFlags(cc.getRootTerm()))) and
630670
q2 = after(cc)
631671
)
632672
or

javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
199199
/**
200200
* Holds if `a` and `b` are input symbols from the same regexp.
201201
*/
202-
private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
202+
private predicate sharesRoot(InputSymbol a, InputSymbol b) {
203203
exists(RegExpRoot root |
204204
belongsTo(a, root) and
205205
belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
209209
/**
210210
* Holds if the `a` is an input symbol from a regexp that has root `root`.
211211
*/
212-
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
212+
private predicate belongsTo(InputSymbol a, RegExpRoot root) {
213213
exists(State s | getRoot(s.getRepr()) = root |
214214
delta(s, a, _)
215215
or
@@ -378,6 +378,13 @@ private module CharacterClasses {
378378
)
379379
}
380380

381+
bindingset[char, cc]
382+
private string caseNormalize(string char, RegExpTerm cc) {
383+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
384+
then result = char.toLowerCase()
385+
else result = char
386+
}
387+
381388
/**
382389
* An implementation of `CharacterClass` for positive (non inverted) character classes.
383390
*/
@@ -386,7 +393,7 @@ private module CharacterClasses {
386393

387394
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
388395

389-
override string getARelevantChar() { result = getAMentionedChar(cc) }
396+
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
390397

391398
override predicate matches(string char) { hasChildThatMatches(cc, char) }
392399
}
@@ -400,8 +407,8 @@ private module CharacterClasses {
400407
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
401408

402409
override string getARelevantChar() {
403-
result = nextChar(getAMentionedChar(cc)) or
404-
nextChar(result) = getAMentionedChar(cc)
410+
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
411+
nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
405412
}
406413

407414
bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
428435
*/
429436
private class PositiveCharacterClassEscape extends CharacterClass {
430437
string charClass;
438+
RegExpTerm cc;
431439

432440
PositiveCharacterClassEscape() {
433-
exists(RegExpTerm cc |
434-
isEscapeClass(cc, charClass) and
435-
this = getCanonicalCharClass(cc) and
436-
charClass = ["d", "s", "w"]
437-
)
441+
isEscapeClass(cc, charClass) and
442+
this = getCanonicalCharClass(cc) and
443+
charClass = ["d", "s", "w"]
438444
}
439445

440446
override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
445451
result = " "
446452
or
447453
charClass = "w" and
448-
result = ["a", "Z", "_", "0", "9"]
454+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
455+
then result = ["a", "z", "_", "0", "9"]
456+
else result = ["a", "Z", "_", "0", "9"]
449457
}
450458

451459
override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
492500
not classEscapeMatches(charClass.toLowerCase(), char)
493501
}
494502
}
503+
504+
/** Gets a representative for all char classes that match the same chars as `c`. */
505+
CharacterClass normalize(CharacterClass c) {
506+
exists(string normalization |
507+
normalization = getMormalizationString(c) and
508+
result =
509+
min(CharacterClass cc, string raw |
510+
getMormalizationString(cc) = normalization and cc = CharClass(raw)
511+
|
512+
cc order by raw
513+
)
514+
)
515+
}
516+
517+
/** Gets a string representing all the chars matched by `c` */
518+
private string getMormalizationString(CharacterClass c) {
519+
(c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
520+
result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
521+
or
522+
(c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
523+
// the string produced by the concat can not contain repeated chars
524+
// so by starting the below with "nn" we can guarantee that
525+
// it will not overlap with the above case.
526+
// and a negative char class can never match the same chars as a positive one, so we don't miss any results from this.
527+
result =
528+
"nn:" +
529+
concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
530+
}
495531
}
496532

497533
private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
620656
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
621657
or
622658
q1 = before(cc) and
623-
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
659+
lbl =
660+
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
661+
getCanonicalizationFlags(cc.getRootTerm()))) and
624662
q2 = after(cc)
625663
)
626664
or
627665
exists(RegExpTerm cc | isEscapeClass(cc, _) |
628666
q1 = before(cc) and
629-
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
667+
lbl =
668+
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
669+
getCanonicalizationFlags(cc.getRootTerm()))) and
630670
q2 = after(cc)
631671
)
632672
or

0 commit comments

Comments
 (0)