Skip to content

Commit 38477d7

Browse files
authored
Merge pull request github#6462 from erik-krogh/repeat
JS: support more regular expressions in js/incomplete-multi-character-sanitization
2 parents 0c0f335 + 5fe6671 commit 38477d7

File tree

6 files changed

+107
-46
lines changed

6 files changed

+107
-46
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lgtm,codescanning
2+
* The `js/incomplete-multi-character-sanitization` query now flags more regular expressions that can result in bad sanitization.

javascript/ql/src/Security/CWE-116/IncompleteMultiCharacterSanitization.ql

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -56,31 +56,69 @@ DangerousPrefix getADangerousMatchedPrefix(EmptyReplaceRegExpTerm t) {
5656
not exists(EmptyReplaceRegExpTerm pred | pred = t.getPredecessor+() and not pred.isNullable())
5757
}
5858

59+
private import semmle.javascript.security.performance.ReDoSUtil as ReDoSUtil
60+
61+
/**
62+
* Gets a char from a dangerous prefix that is matched by `t`.
63+
*/
64+
pragma[noinline]
65+
DangerousPrefixSubstring getADangerousMatchedChar(EmptyReplaceRegExpTerm t) {
66+
t.isNullable() and result = ""
67+
or
68+
t.getAMatchedString() = result
69+
or
70+
// A substring matched by some character class. This is only used to match the "word" part of a HTML tag (e.g. "iframe" in "<iframe").
71+
exists(ReDoSUtil::CharacterClass cc |
72+
cc = ReDoSUtil::getCanonicalCharClass(t) and
73+
cc.matches(result) and
74+
result.regexpMatch("\\w") and
75+
// excluding character classes that match ">" (e.g. /<[^<]*>/), as these might consume nested HTML tags, and thus prevent the dangerous pattern this query is looking for.
76+
not cc.matches(">")
77+
)
78+
or
79+
t instanceof RegExpDot and
80+
result.length() = 1
81+
or
82+
(
83+
t instanceof RegExpOpt or
84+
t instanceof RegExpStar or
85+
t instanceof RegExpPlus or
86+
t instanceof RegExpGroup or
87+
t instanceof RegExpAlt
88+
) and
89+
result = getADangerousMatchedChar(t.getAChild())
90+
}
91+
5992
/**
6093
* Gets a substring of a dangerous prefix that is in the language starting at `t` (ignoring lookarounds).
6194
*
6295
* Note that the language of `t` is slightly restricted as not all RegExpTerm types are supported.
6396
*/
6497
DangerousPrefixSubstring getADangerousMatchedPrefixSubstring(EmptyReplaceRegExpTerm t) {
65-
exists(string left |
66-
t.isNullable() and left = ""
67-
or
68-
t.getAMatchedString() = left
69-
or
70-
(
71-
t instanceof RegExpOpt or
72-
t instanceof RegExpStar or
73-
t instanceof RegExpPlus or
74-
t instanceof RegExpGroup or
75-
t instanceof RegExpAlt
76-
) and
77-
left = getADangerousMatchedPrefixSubstring(t.getAChild())
78-
|
79-
result = left + getADangerousMatchedPrefixSubstring(t.getSuccessor()) or
80-
result = left
98+
result = getADangerousMatchedChar(t) + getADangerousMatchedPrefixSubstring(t.getSuccessor())
99+
or
100+
result = getADangerousMatchedChar(t)
101+
or
102+
// loop around for repetitions (only considering alphanumeric characters in the repetition)
103+
exists(RepetitionMatcher repetition | t = repetition |
104+
result = getADangerousMatchedPrefixSubstring(repetition) + repetition.getAChar()
81105
)
82106
}
83107

108+
class RepetitionMatcher extends EmptyReplaceRegExpTerm {
109+
string char;
110+
111+
pragma[noinline]
112+
RepetitionMatcher() {
113+
(this instanceof RegExpPlus or this instanceof RegExpStar) and
114+
char = getADangerousMatchedChar(this.getAChild()) and
115+
char.regexpMatch("\\w")
116+
}
117+
118+
pragma[noinline]
119+
string getAChar() { result = char }
120+
}
121+
84122
/**
85123
* Holds if `t` may match the dangerous `prefix` and some suffix, indicating intent to prevent a vulnerablity of kind `kind`.
86124
*/
@@ -151,7 +189,7 @@ where
151189
// skip leading optional elements
152190
not dangerous.isNullable() and
153191
// only warn about the longest match (presumably the most descriptive)
154-
prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length()) and
192+
prefix = max(string m | matchesDangerousPrefix(dangerous, m, kind) | m order by m.length(), m) and
155193
// only warn once per kind
156194
not exists(EmptyReplaceRegExpTerm other |
157195
other = dangerous.getAChild+() or other = dangerous.getPredecessor+()

javascript/ql/src/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -155,15 +155,22 @@ private class RegexpCharacterConstant extends RegExpConstant {
155155
RegexpCharacterConstant() { this.isCharacter() }
156156
}
157157

158+
/**
159+
* A regexp term that is relevant for this ReDoS analysis.
160+
*/
161+
class RelevantRegExpTerm extends RegExpTerm {
162+
RelevantRegExpTerm() { getRoot(this).isRelevant() }
163+
}
164+
158165
/**
159166
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
160167
*
161168
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
162169
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
163170
*/
164-
private predicate isCanonicalTerm(RegExpTerm term, string str) {
171+
private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
165172
term =
166-
rank[1](RegExpTerm t, Location loc, File file |
173+
min(RelevantRegExpTerm t, Location loc, File file |
167174
loc = t.getLocation() and
168175
file = t.getFile() and
169176
str = t.getRawValue()
@@ -178,15 +185,15 @@ private predicate isCanonicalTerm(RegExpTerm term, string str) {
178185
private newtype TInputSymbol =
179186
/** An input symbol corresponding to character `c`. */
180187
Char(string c) {
181-
c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
188+
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
182189
} or
183190
/**
184191
* An input symbol representing all characters matched by
185192
* a (non-universal) character class that has string representation `charClassString`.
186193
*/
187194
CharClass(string charClassString) {
188-
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
189-
exists(RegExpTerm recc | isCanonicalTerm(recc, charClassString) |
195+
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
196+
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
190197
recc instanceof RegExpCharacterClass and
191198
not recc.(RegExpCharacterClass).isUniversalClass()
192199
or
@@ -251,7 +258,7 @@ class InputSymbol extends TInputSymbol {
251258
/**
252259
* An abstract input symbol that represents a character class.
253260
*/
254-
abstract private class CharacterClass extends InputSymbol {
261+
abstract class CharacterClass extends InputSymbol {
255262
/**
256263
* Gets a character that is relevant for intersection-tests involving this
257264
* character class.
@@ -626,13 +633,10 @@ RegExpRoot getRoot(RegExpTerm term) {
626633
}
627634

628635
private newtype TState =
629-
Match(RegExpTerm t, int i) {
630-
getRoot(t).isRelevant() and
631-
(
632-
i = 0
633-
or
634-
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
635-
)
636+
Match(RelevantRegExpTerm t, int i) {
637+
i = 0
638+
or
639+
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
636640
} or
637641
Accept(RegExpRoot l) { l.isRelevant() } or
638642
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/IncompleteMultiCharacterSanitization.expected

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
| tst-multi-character-sanitization.js:3:13:3:57 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:3:30:3:30 | < | <cript |
1+
| tst-multi-character-sanitization.js:3:13:3:57 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:3:30:3:30 | < | <script |
22
| tst-multi-character-sanitization.js:4:13:4:47 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:4:30:4:40 | on\\w+=".*" | on |
33
| tst-multi-character-sanitization.js:5:13:5:49 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:5:30:5:42 | on\\w+=\\'.*\\' | on |
4-
| tst-multi-character-sanitization.js:9:13:9:47 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:9:30:9:30 | < | <cript |
4+
| tst-multi-character-sanitization.js:9:13:9:47 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:9:30:9:30 | < | <script |
55
| tst-multi-character-sanitization.js:10:13:10:49 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:10:30:10:42 | .on\\w+=.*".*" | on |
66
| tst-multi-character-sanitization.js:11:13:11:51 | content ... /g, "") | This string may still contain $@, which may cause a HTML attribute injection vulnerability. | tst-multi-character-sanitization.js:11:30:11:44 | .on\\w+=.*\\'.*\\' | on |
77
| tst-multi-character-sanitization.js:19:3:19:35 | respons ... pt, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:18:18:18:24 | <script | <script |
@@ -31,4 +31,9 @@
3131
| tst-multi-character-sanitization.js:126:7:129:34 | x\\n . ... //, "") | This string may still contain $@, which may cause a path injection vulnerability. | tst-multi-character-sanitization.js:129:21:129:22 | \\/ | /.. |
3232
| tst-multi-character-sanitization.js:135:2:135:44 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:135:19:135:25 | <script | <script |
3333
| tst-multi-character-sanitization.js:136:2:136:46 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:136:19:136:19 | < | <script |
34+
| tst-multi-character-sanitization.js:137:2:137:48 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:137:19:137:20 | .+ | <script |
3435
| tst-multi-character-sanitization.js:138:2:138:48 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:138:21:138:21 | < | <script |
36+
| tst-multi-character-sanitization.js:142:13:142:62 | content ... gi, "") | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:142:30:142:36 | <script | <script |
37+
| tst-multi-character-sanitization.js:143:13:143:56 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:143:30:143:30 | < | <script |
38+
| tst-multi-character-sanitization.js:144:13:144:91 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:144:30:144:30 | < | <script |
39+
| tst-multi-character-sanitization.js:145:13:145:90 | content ... /g, '') | This string may still contain $@, which may cause a HTML element injection vulnerability. | tst-multi-character-sanitization.js:145:30:145:30 | < | <script |

javascript/ql/test/query-tests/Security/CWE-116/IncompleteSanitization/tst-multi-character-sanitization.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,14 @@
134134
(function (content) {
135135
content.replace(/<script.*\/script>/gi, ""); // NOT OK
136136
content.replace(/<(script).*\/script>/gi, ""); // NOT OK
137-
content.replace(/.+<(script).*\/script>/gi, ""); // OK
137+
content.replace(/.+<(script).*\/script>/gi, ""); // NOT OK
138138
content.replace(/.*<(script).*\/script>/gi, ""); // NOT OK
139139
});
140+
141+
(function (content) {
142+
content = content.replace(/<script[\s\S]*?<\/script>/gi, ""); // NOT OK
143+
content = content.replace(/<[a-zA-Z\/](.|\n)*?>/g, '') || ' '; // NOT OK
144+
content = content.replace(/<(script|iframe|video)[\s\S]*?<\/(script|iframe|video)>/g, '') // NOT OK
145+
content = content.replace(/<(script|iframe|video)(.|\s)*?\/(script|iframe|video)>/g, '') // NOT OK
146+
content = content.replace(/<[^<]*>/g, ""); // OK
147+
});

python/ql/src/semmle/python/security/performance/ReDoSUtil.qll

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -155,15 +155,22 @@ private class RegexpCharacterConstant extends RegExpConstant {
155155
RegexpCharacterConstant() { this.isCharacter() }
156156
}
157157

158+
/**
159+
* A regexp term that is relevant for this ReDoS analysis.
160+
*/
161+
class RelevantRegExpTerm extends RegExpTerm {
162+
RelevantRegExpTerm() { getRoot(this).isRelevant() }
163+
}
164+
158165
/**
159166
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
160167
*
161168
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
162169
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
163170
*/
164-
private predicate isCanonicalTerm(RegExpTerm term, string str) {
171+
private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
165172
term =
166-
rank[1](RegExpTerm t, Location loc, File file |
173+
min(RelevantRegExpTerm t, Location loc, File file |
167174
loc = t.getLocation() and
168175
file = t.getFile() and
169176
str = t.getRawValue()
@@ -178,15 +185,15 @@ private predicate isCanonicalTerm(RegExpTerm term, string str) {
178185
private newtype TInputSymbol =
179186
/** An input symbol corresponding to character `c`. */
180187
Char(string c) {
181-
c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
188+
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
182189
} or
183190
/**
184191
* An input symbol representing all characters matched by
185192
* a (non-universal) character class that has string representation `charClassString`.
186193
*/
187194
CharClass(string charClassString) {
188-
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
189-
exists(RegExpTerm recc | isCanonicalTerm(recc, charClassString) |
195+
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
196+
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
190197
recc instanceof RegExpCharacterClass and
191198
not recc.(RegExpCharacterClass).isUniversalClass()
192199
or
@@ -251,7 +258,7 @@ class InputSymbol extends TInputSymbol {
251258
/**
252259
* An abstract input symbol that represents a character class.
253260
*/
254-
abstract private class CharacterClass extends InputSymbol {
261+
abstract class CharacterClass extends InputSymbol {
255262
/**
256263
* Gets a character that is relevant for intersection-tests involving this
257264
* character class.
@@ -626,13 +633,10 @@ RegExpRoot getRoot(RegExpTerm term) {
626633
}
627634

628635
private newtype TState =
629-
Match(RegExpTerm t, int i) {
630-
getRoot(t).isRelevant() and
631-
(
632-
i = 0
633-
or
634-
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
635-
)
636+
Match(RelevantRegExpTerm t, int i) {
637+
i = 0
638+
or
639+
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
636640
} or
637641
Accept(RegExpRoot l) { l.isRelevant() } or
638642
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

0 commit comments

Comments
 (0)