Skip to content

Commit d67235b

Browse files
authored
Merge pull request github#11071 from erik-krogh/fixCanon
ReDoS: fix canonicalization in NfaUtils
2 parents eb365c1 + c15f63c commit d67235b

File tree

6 files changed

+102
-76
lines changed

6 files changed

+102
-76
lines changed

java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
129129
min(RelevantRegExpTerm t, Location loc, File file |
130130
loc = t.getLocation() and
131131
file = t.getFile() and
132-
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
132+
str = getCanonicalizationString(t)
133133
|
134134
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
135135
)
136136
}
137137

138138
/**
139-
* Gets a string representation of the flags used with the regular expression.
140-
* Only the flags that are relevant for the canonicalization are included.
139+
* Gets a string representation of `term` that is used for canonicalization.
141140
*/
142-
string getCanonicalizationFlags(RegExpTerm root) {
143-
root.isRootTerm() and
144-
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
141+
private string getCanonicalizationString(RelevantRegExpTerm term) {
142+
exists(string ignoreCase |
143+
(if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
144+
result = term.getRawValue() + "|" + ignoreCase
145+
)
145146
}
146147

147148
/**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
186187
Epsilon()
187188

188189
/**
189-
* Gets the canonical CharClass for `term`.
190+
* Gets the the CharClass corresponding to the canonical representative `term`.
190191
*/
191-
CharClass getCanonicalCharClass(RegExpTerm term) {
192+
private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
192193
exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
193194
}
194195

196+
/**
197+
* Gets a char class that represents `term`, even when `term` is not the canonical representative.
198+
*/
199+
CharacterClass getCanonicalCharClass(RegExpTerm term) {
200+
exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
201+
}
202+
195203
/**
196204
* Holds if `a` and `b` are input symbols from the same regexp.
197205
*/
@@ -284,7 +292,7 @@ private module CharacterClasses {
284292
*/
285293
pragma[noinline]
286294
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
287-
exists(getCanonicalCharClass(cc)) and
295+
exists(getCharClassForCanonicalTerm(cc)) and
288296
exists(RegExpTerm child | child = cc.getAChild() |
289297
char = child.(RegexpCharacterConstant).getValue()
290298
or
@@ -387,7 +395,7 @@ private module CharacterClasses {
387395
private class PositiveCharacterClass extends CharacterClass {
388396
RegExpCharacterClass cc;
389397

390-
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
398+
PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
391399

392400
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
393401

@@ -400,7 +408,7 @@ private module CharacterClasses {
400408
private class InvertedCharacterClass extends CharacterClass {
401409
RegExpCharacterClass cc;
402410

403-
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
411+
InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
404412

405413
override string getARelevantChar() {
406414
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
435443

436444
PositiveCharacterClassEscape() {
437445
isEscapeClass(cc, charClass) and
438-
this = getCanonicalCharClass(cc) and
446+
this = getCharClassForCanonicalTerm(cc) and
439447
charClass = ["d", "s", "w"]
440448
}
441449

@@ -475,7 +483,7 @@ private module CharacterClasses {
475483
NegativeCharacterClassEscape() {
476484
exists(RegExpTerm cc |
477485
isEscapeClass(cc, charClass) and
478-
this = getCanonicalCharClass(cc) and
486+
this = getCharClassForCanonicalTerm(cc) and
479487
charClass = ["D", "S", "W"]
480488
)
481489
}
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
652660
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
653661
or
654662
q1 = before(cc) and
655-
lbl =
656-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
657-
getCanonicalizationFlags(cc.getRootTerm()))) and
663+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
658664
q2 = after(cc)
659665
)
660666
or
661667
exists(RegExpTerm cc | isEscapeClass(cc, _) |
662668
q1 = before(cc) and
663-
lbl =
664-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
665-
getCanonicalizationFlags(cc.getRootTerm()))) and
669+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
666670
q2 = after(cc)
667671
)
668672
or

javascript/ql/lib/semmle/javascript/security/regexp/NfaUtils.qll

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
129129
min(RelevantRegExpTerm t, Location loc, File file |
130130
loc = t.getLocation() and
131131
file = t.getFile() and
132-
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
132+
str = getCanonicalizationString(t)
133133
|
134134
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
135135
)
136136
}
137137

138138
/**
139-
* Gets a string representation of the flags used with the regular expression.
140-
* Only the flags that are relevant for the canonicalization are included.
139+
* Gets a string representation of `term` that is used for canonicalization.
141140
*/
142-
string getCanonicalizationFlags(RegExpTerm root) {
143-
root.isRootTerm() and
144-
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
141+
private string getCanonicalizationString(RelevantRegExpTerm term) {
142+
exists(string ignoreCase |
143+
(if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
144+
result = term.getRawValue() + "|" + ignoreCase
145+
)
145146
}
146147

147148
/**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
186187
Epsilon()
187188

188189
/**
189-
* Gets the canonical CharClass for `term`.
190+
* Gets the the CharClass corresponding to the canonical representative `term`.
190191
*/
191-
CharClass getCanonicalCharClass(RegExpTerm term) {
192+
private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
192193
exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
193194
}
194195

196+
/**
197+
* Gets a char class that represents `term`, even when `term` is not the canonical representative.
198+
*/
199+
CharacterClass getCanonicalCharClass(RegExpTerm term) {
200+
exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
201+
}
202+
195203
/**
196204
* Holds if `a` and `b` are input symbols from the same regexp.
197205
*/
@@ -284,7 +292,7 @@ private module CharacterClasses {
284292
*/
285293
pragma[noinline]
286294
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
287-
exists(getCanonicalCharClass(cc)) and
295+
exists(getCharClassForCanonicalTerm(cc)) and
288296
exists(RegExpTerm child | child = cc.getAChild() |
289297
char = child.(RegexpCharacterConstant).getValue()
290298
or
@@ -387,7 +395,7 @@ private module CharacterClasses {
387395
private class PositiveCharacterClass extends CharacterClass {
388396
RegExpCharacterClass cc;
389397

390-
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
398+
PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
391399

392400
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
393401

@@ -400,7 +408,7 @@ private module CharacterClasses {
400408
private class InvertedCharacterClass extends CharacterClass {
401409
RegExpCharacterClass cc;
402410

403-
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
411+
InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
404412

405413
override string getARelevantChar() {
406414
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
435443

436444
PositiveCharacterClassEscape() {
437445
isEscapeClass(cc, charClass) and
438-
this = getCanonicalCharClass(cc) and
446+
this = getCharClassForCanonicalTerm(cc) and
439447
charClass = ["d", "s", "w"]
440448
}
441449

@@ -475,7 +483,7 @@ private module CharacterClasses {
475483
NegativeCharacterClassEscape() {
476484
exists(RegExpTerm cc |
477485
isEscapeClass(cc, charClass) and
478-
this = getCanonicalCharClass(cc) and
486+
this = getCharClassForCanonicalTerm(cc) and
479487
charClass = ["D", "S", "W"]
480488
)
481489
}
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
652660
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
653661
or
654662
q1 = before(cc) and
655-
lbl =
656-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
657-
getCanonicalizationFlags(cc.getRootTerm()))) and
663+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
658664
q2 = after(cc)
659665
)
660666
or
661667
exists(RegExpTerm cc | isEscapeClass(cc, _) |
662668
q1 = before(cc) and
663-
lbl =
664-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
665-
getCanonicalizationFlags(cc.getRootTerm()))) and
669+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
666670
q2 = after(cc)
667671
)
668672
or

javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,5 @@
3737
| tst-multi-character-sanitization.js:143:13:143:56 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:143:30:143:30 | < | <script |
3838
| tst-multi-character-sanitization.js:144:13:144:91 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:144:30:144:30 | < | <script |
3939
| tst-multi-character-sanitization.js:145:13:145:90 | content ... /g, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:145:30:145:30 | < | <script |
40+
| tst-multi-character-sanitization.js:148:3:148:99 | n.clone ... gi, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:148:41:148:41 | < | <script |
41+
| tst-multi-character-sanitization.js:152:3:152:99 | n.clone ... gi, '') | This string may still contain $@, which may cause an HTML element injection vulnerability. | tst-multi-character-sanitization.js:152:41:152:41 | < | <script |

javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/tst-multi-character-sanitization.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,12 @@
144144
content = content.replace(/<(script|iframe|video)[\s\S]*?<\/(script|iframe|video)>/g, '') // NOT OK
145145
content = content.replace(/<(script|iframe|video)(.|\s)*?\/(script|iframe|video)>/g, '') // NOT OK
146146
content = content.replace(/<[^<]*>/g, ""); // OK
147+
148+
n.cloneNode(false).outerHTML.replace(/<\/?[\w:\-]+ ?|=[\"][^\"]+\"|=\'[^\']+\'|=[\w\-]+|>/gi, '').replace(/[\w:\-]+/gi, function(a) { // NOT OK
149+
o.push({specified : 1, nodeName : a});
150+
});
151+
152+
n.cloneNode(false).outerHTML.replace(/<\/?[\w:\-]+ ?|=[\"][^\"]+\"|=\'[^\']+\'|=[\w\-]+|>/gi, '').replace(/[\w:\-]+/gi, function(a) { // NOT OK
153+
o.push({specified : 1, nodeName : a});
154+
});
147155
});

python/ql/lib/semmle/python/security/regexp/NfaUtils.qll

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -129,19 +129,20 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
129129
min(RelevantRegExpTerm t, Location loc, File file |
130130
loc = t.getLocation() and
131131
file = t.getFile() and
132-
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
132+
str = getCanonicalizationString(t)
133133
|
134134
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
135135
)
136136
}
137137

138138
/**
139-
* Gets a string representation of the flags used with the regular expression.
140-
* Only the flags that are relevant for the canonicalization are included.
139+
* Gets a string representation of `term` that is used for canonicalization.
141140
*/
142-
string getCanonicalizationFlags(RegExpTerm root) {
143-
root.isRootTerm() and
144-
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
141+
private string getCanonicalizationString(RelevantRegExpTerm term) {
142+
exists(string ignoreCase |
143+
(if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and
144+
result = term.getRawValue() + "|" + ignoreCase
145+
)
145146
}
146147

147148
/**
@@ -186,12 +187,19 @@ private newtype TInputSymbol =
186187
Epsilon()
187188

188189
/**
189-
* Gets the canonical CharClass for `term`.
190+
* Gets the the CharClass corresponding to the canonical representative `term`.
190191
*/
191-
CharClass getCanonicalCharClass(RegExpTerm term) {
192+
private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
192193
exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
193194
}
194195

196+
/**
197+
* Gets a char class that represents `term`, even when `term` is not the canonical representative.
198+
*/
199+
CharacterClass getCanonicalCharClass(RegExpTerm term) {
200+
exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
201+
}
202+
195203
/**
196204
* Holds if `a` and `b` are input symbols from the same regexp.
197205
*/
@@ -284,7 +292,7 @@ private module CharacterClasses {
284292
*/
285293
pragma[noinline]
286294
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
287-
exists(getCanonicalCharClass(cc)) and
295+
exists(getCharClassForCanonicalTerm(cc)) and
288296
exists(RegExpTerm child | child = cc.getAChild() |
289297
char = child.(RegexpCharacterConstant).getValue()
290298
or
@@ -387,7 +395,7 @@ private module CharacterClasses {
387395
private class PositiveCharacterClass extends CharacterClass {
388396
RegExpCharacterClass cc;
389397

390-
PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }
398+
PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() }
391399

392400
override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }
393401

@@ -400,7 +408,7 @@ private module CharacterClasses {
400408
private class InvertedCharacterClass extends CharacterClass {
401409
RegExpCharacterClass cc;
402410

403-
InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }
411+
InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() }
404412

405413
override string getARelevantChar() {
406414
result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
@@ -435,7 +443,7 @@ private module CharacterClasses {
435443

436444
PositiveCharacterClassEscape() {
437445
isEscapeClass(cc, charClass) and
438-
this = getCanonicalCharClass(cc) and
446+
this = getCharClassForCanonicalTerm(cc) and
439447
charClass = ["d", "s", "w"]
440448
}
441449

@@ -475,7 +483,7 @@ private module CharacterClasses {
475483
NegativeCharacterClassEscape() {
476484
exists(RegExpTerm cc |
477485
isEscapeClass(cc, charClass) and
478-
this = getCanonicalCharClass(cc) and
486+
this = getCharClassForCanonicalTerm(cc) and
479487
charClass = ["D", "S", "W"]
480488
)
481489
}
@@ -652,17 +660,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
652660
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
653661
or
654662
q1 = before(cc) and
655-
lbl =
656-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
657-
getCanonicalizationFlags(cc.getRootTerm()))) and
663+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
658664
q2 = after(cc)
659665
)
660666
or
661667
exists(RegExpTerm cc | isEscapeClass(cc, _) |
662668
q1 = before(cc) and
663-
lbl =
664-
CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
665-
getCanonicalizationFlags(cc.getRootTerm()))) and
669+
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
666670
q2 = after(cc)
667671
)
668672
or

0 commit comments

Comments
 (0)