Skip to content

Commit 6a3b9e1

Browse files
authored
Merge pull request github#13914 from erik-krogh/escape-unicode
ReDoS: escape unicode chars in the output for the ReDoS queries
2 parents fec9626 + fe54256 commit 6a3b9e1

File tree

8 files changed

+135
-34
lines changed

8 files changed

+135
-34
lines changed

javascript/ql/test/query-tests/Security/CWE-400/ReDoS/PolynomialBackTracking.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@
445445
| tst.js:146:15:146:21 | (\\d\|5)* | Strings with many repetitions of '0' can start matching anywhere after the start of the preceeding ((\\d\|5)*)" |
446446
| tst.js:149:15:149:24 | (\\s\|[\\f])* | Strings with many repetitions of '\\t' can start matching anywhere after the start of the preceeding ((\\s\|[\\f])*)" |
447447
| tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | Strings with many repetitions of '\\t' can start matching anywhere after the start of the preceeding ((\\s\|[\\v]\|\\\\v)*)" |
448-
| tst.js:155:15:155:24 | (\\f\|[\\f])* | Strings with many repetitions of '\u000c' can start matching anywhere after the start of the preceeding ((\\f\|[\\f])*)" |
448+
| tst.js:155:15:155:24 | (\\f\|[\\f])* | Strings with many repetitions of '\\u000c' can start matching anywhere after the start of the preceeding ((\\f\|[\\f])*)" |
449449
| tst.js:158:15:158:22 | (\\W\|\\D)* | Strings with many repetitions of '/' can start matching anywhere after the start of the preceeding ((\\W\|\\D)*)" |
450450
| tst.js:161:15:161:22 | (\\S\|\\w)* | Strings with many repetitions of '!' can start matching anywhere after the start of the preceeding ((\\S\|\\w)*)" |
451451
| tst.js:164:15:164:24 | (\\S\|[\\w])* | Strings with many repetitions of '!' can start matching anywhere after the start of the preceeding ((\\S\|[\\w])*)" |

javascript/ql/test/query-tests/Security/CWE-400/ReDoS/ReDoS.expected

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@
123123
| tst.js:137:15:137:21 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
124124
| tst.js:143:15:143:22 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
125125
| tst.js:146:15:146:21 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
126-
| tst.js:149:15:149:24 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
127-
| tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
128-
| tst.js:155:15:155:24 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
126+
| tst.js:149:15:149:24 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
127+
| tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
128+
| tst.js:155:15:155:24 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
129129
| tst.js:158:15:158:22 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '/'. |
130130
| tst.js:161:15:161:22 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
131131
| tst.js:164:15:164:24 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
@@ -199,3 +199,5 @@
199199
| tst.js:404:6:405:7 | (g\|gg)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'gg'. |
200200
| tst.js:407:125:407:127 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '0/*' and containing many repetitions of ' ;0'. |
201201
| tst.js:411:15:411:19 | a{1,} | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
202+
| tst.js:413:25:413:35 | (\\u0000\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u0000' and containing many repetitions of '\\u0000'. |
203+
| tst.js:415:44:415:57 | (\ud83d\ude80\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u{1f680}' and containing many repetitions of '\\u{1f680}'. |

javascript/ql/test/query-tests/Security/CWE-400/ReDoS/tst.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,4 +408,8 @@ var bad98 = /^(?:\*\/\*|[a-zA-Z0-9][a-zA-Z0-9!\#\$&\-\^_\.\+]{0,126}\/(?:\*|[a-z
408408

409409
var good48 = /(\/(?:\/[\w.-]*)*){0,1}:([\w.-]+)/;
410410

411-
var bad99 = /(a{1,})*b/;
411+
var bad99 = /(a{1,})*b/;
412+
413+
var unicode = /^\n\u0000(\u0000|.)+$/;
414+
415+
var largeUnicode = new RegExp("^\n\u{1F680}(\u{1F680}|.)+X$");

python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@
3535
| redos.py:139:25:139:31 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
3636
| redos.py:145:25:145:32 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
3737
| redos.py:148:25:148:31 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
38-
| redos.py:151:25:151:34 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
39-
| redos.py:154:25:154:38 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
40-
| redos.py:157:25:157:34 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
38+
| redos.py:151:25:151:34 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
39+
| redos.py:154:25:154:38 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
40+
| redos.py:157:25:157:34 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
4141
| redos.py:160:25:160:32 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' '. |
4242
| redos.py:163:25:163:32 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
4343
| redos.py:166:25:166:34 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
@@ -67,8 +67,8 @@
6767
| redos.py:259:24:259:126 | (.thisisagoddamnlongstringforstresstestingthequery\|\\sthisisagoddamnlongstringforstresstestingthequery)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\tthisisagoddamnlongstringforstresstestingthequery'. |
6868
| redos.py:262:24:262:87 | (thisisagoddamnlongstringforstresstestingthequery\|this\\w+query)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'thisisagoddamnlongstringforstresstestingthequery'. |
6969
| redos.py:262:78:262:80 | \\w+ | This part of the regular expression may cause exponential backtracking on strings starting with 'this' and containing many repetitions of '0querythis'. |
70-
| redos.py:268:28:268:39 | ([\ufffd\ufffd]\|[\ufffd\ufffd])* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. |
71-
| redos.py:271:28:271:41 | ((\ufffd\|\ufffd)\|(\ufffd\|\ufffd))* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. |
70+
| redos.py:268:28:268:39 | ([\ufffd\ufffd]\|[\ufffd\ufffd])* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\\ufffd'. |
71+
| redos.py:271:28:271:41 | ((\ufffd\|\ufffd)\|(\ufffd\|\ufffd))* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\\ufffd'. |
7272
| redos.py:274:31:274:32 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |
7373
| redos.py:277:48:277:50 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '<0\\t0=' and containing many repetitions of '""\\t0='. |
7474
| redos.py:283:26:283:27 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
@@ -103,5 +103,5 @@
103103
| redos.py:385:24:385:30 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
104104
| redos.py:386:26:386:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
105105
| redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
106-
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
106+
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\\u00c6'. |
107107
| unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |

ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@
3333
| tst.rb:137:11:137:17 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
3434
| tst.rb:143:11:143:18 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
3535
| tst.rb:146:11:146:17 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
36-
| tst.rb:149:11:149:20 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
37-
| tst.rb:152:11:152:24 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
38-
| tst.rb:155:11:155:20 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
36+
| tst.rb:149:11:149:20 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
37+
| tst.rb:152:11:152:24 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
38+
| tst.rb:155:11:155:20 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
3939
| tst.rb:158:11:158:18 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' '. |
4040
| tst.rb:161:11:161:18 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
4141
| tst.rb:164:11:164:20 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |

shared/regex/codeql/regex/nfa/NfaUtils.qll

Lines changed: 86 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*/
44

55
private import codeql.regex.RegexTreeView
6+
private import codeql.util.Numbers
67

78
/**
89
* Classes and predicates that create an NFA and various algorithms for working with it.
@@ -17,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
1718
exists(int code | code = ascii(c) | code + 1 = ascii(result))
1819
}
1920

21+
/**
22+
* Gets the `i`th codepoint in `s`.
23+
*/
24+
bindingset[s]
25+
private string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
26+
27+
/**
28+
* Gets the length of `s` in codepoints.
29+
*/
30+
bindingset[str]
31+
private int getCodepointLength(string str) {
32+
result = str.regexpReplaceAll("(.|\\s)", "x").length()
33+
}
34+
2035
/**
2136
* Gets an approximation for the ASCII code for `char`.
2237
* Only the easily printable chars are included (so no newline, tab, null, etc).
@@ -189,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
189204
/** An input symbol corresponding to character `c`. */
190205
Char(string c) {
191206
c =
192-
any(RegexpCharacterConstant cc |
193-
cc instanceof RelevantRegExpTerm and
194-
not isIgnoreCase(cc.getRootTerm())
195-
).getValue().charAt(_)
207+
getCodepointAt(any(RegexpCharacterConstant cc |
208+
cc instanceof RelevantRegExpTerm and
209+
not isIgnoreCase(cc.getRootTerm())
210+
).getValue(), _)
196211
or
197212
// normalize everything to lower case if the regexp is case insensitive
198213
c =
199214
any(RegexpCharacterConstant cc, string char |
200215
cc instanceof RelevantRegExpTerm and
201216
isIgnoreCase(cc.getRootTerm()) and
202-
char = cc.getValue().charAt(_)
217+
char = getCodepointAt(cc.getValue(), _)
203218
|
204219
char.toLowerCase()
205220
)
@@ -395,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
395410
string getARelevantChar() {
396411
exists(ascii(result))
397412
or
398-
exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
413+
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
399414
or
400415
classEscapeMatches(_, result)
401416
}
@@ -693,6 +708,12 @@ module Make<RegexTreeViewSig TreeImpl> {
693708
)
694709
}
695710

711+
pragma[noinline]
712+
private int getCodepointLengthForState(string s) {
713+
result = getCodepointLength(s) and
714+
s = any(RegexpCharacterConstant reg).getValue()
715+
}
716+
696717
/**
697718
* Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
698719
*/
@@ -701,16 +722,16 @@ module Make<RegexTreeViewSig TreeImpl> {
701722
q1 = Match(s, i) and
702723
(
703724
not isIgnoreCase(s.getRootTerm()) and
704-
lbl = Char(s.getValue().charAt(i))
725+
lbl = Char(getCodepointAt(s.getValue(), i))
705726
or
706727
// normalize everything to lower case if the regexp is case insensitive
707728
isIgnoreCase(s.getRootTerm()) and
708-
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
729+
exists(string c | c = getCodepointAt(s.getValue(), i) | lbl = Char(c.toLowerCase()))
709730
) and
710731
(
711732
q2 = Match(s, i + 1)
712733
or
713-
s.getValue().length() = i + 1 and
734+
getCodepointLengthForState(s.getValue()) = i + 1 and
714735
q2 = after(s)
715736
)
716737
)
@@ -811,7 +832,7 @@ module Make<RegexTreeViewSig TreeImpl> {
811832
Match(RelevantRegExpTerm t, int i) {
812833
i = 0
813834
or
814-
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
835+
exists(getCodepointAt(t.(RegexpCharacterConstant).getValue(), i))
815836
} or
816837
/**
817838
* An accept state, where exactly the given input string is accepted.
@@ -1104,7 +1125,9 @@ module Make<RegexTreeViewSig TreeImpl> {
11041125
*/
11051126
predicate reachesOnlyRejectableSuffixes(State fork, string w) {
11061127
isReDoSCandidate(fork, w) and
1107-
forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and
1128+
forex(State next | next = process(fork, w, getCodepointLengthForCandidate(w) - 1) |
1129+
isLikelyRejectable(next)
1130+
) and
11081131
not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened.
11091132
}
11101133

@@ -1214,6 +1237,13 @@ module Make<RegexTreeViewSig TreeImpl> {
12141237
exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _))
12151238
}
12161239

1240+
// `process` can't use pragma[inline] predicates, so a materialized version of `getCodepointAt` is needed.
1241+
pragma[noinline]
1242+
private string getCodePointAtForProcess(string str, int i) {
1243+
result = getCodepointAt(str, i) and
1244+
isReDoSCandidate(_, str)
1245+
}
1246+
12171247
/**
12181248
* Gets a state that can be reached from pumpable `fork` consuming all
12191249
* chars in `w` any number of times followed by the first `i+1` characters of `w`.
@@ -1223,7 +1253,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12231253
exists(State prev | prev = getProcessPrevious(fork, i, w) |
12241254
not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
12251255
exists(string char, InputSymbol sym |
1226-
char = w.charAt(i) and
1256+
char = getCodePointAtForProcess(w, i) and
12271257
deltaClosed(prev, sym, result) and
12281258
// noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
12291259
// Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
@@ -1232,6 +1262,12 @@ module Make<RegexTreeViewSig TreeImpl> {
12321262
)
12331263
}
12341264

1265+
pragma[noinline]
1266+
private int getCodepointLengthForCandidate(string s) {
1267+
result = getCodepointLength(s) and
1268+
isReDoSCandidate(_, s)
1269+
}
1270+
12351271
/**
12361272
* Gets a state that can be reached from pumpable `fork` consuming all
12371273
* chars in `w` any number of times followed by the first `i` characters of `w`.
@@ -1245,7 +1281,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12451281
or
12461282
// repeat until fixpoint
12471283
i = 0 and
1248-
result = process(fork, w, w.length() - 1)
1284+
result = process(fork, w, getCodepointLengthForCandidate(w) - 1)
12491285
)
12501286
}
12511287

@@ -1261,7 +1297,9 @@ module Make<RegexTreeViewSig TreeImpl> {
12611297
/**
12621298
* Gets a `char` that occurs in a `pump` string.
12631299
*/
1264-
private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) }
1300+
private string getAProcessChar() {
1301+
result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
1302+
}
12651303
}
12661304

12671305
/**
@@ -1305,10 +1343,40 @@ module Make<RegexTreeViewSig TreeImpl> {
13051343
bindingset[s]
13061344
private string escape(string s) {
13071345
result =
1308-
s.replaceAll("\\", "\\\\")
1309-
.replaceAll("\n", "\\n")
1310-
.replaceAll("\r", "\\r")
1311-
.replaceAll("\t", "\\t")
1346+
escapeUnicodeString(s.replaceAll("\\", "\\\\")
1347+
.replaceAll("\n", "\\n")
1348+
.replaceAll("\r", "\\r")
1349+
.replaceAll("\t", "\\t"))
1350+
}
1351+
1352+
/**
1353+
* Gets a string where every character in `s` that is not easily printable has been replaced by a unicode escape.
1354+
*/
1355+
bindingset[s]
1356+
private string escapeUnicodeString(string s) {
1357+
result =
1358+
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
1359+
}
1360+
1361+
/**
1362+
* Gets a unicode escaped string for `char`.
1363+
* If `char` is a printable char, then `char` is returned.
1364+
*/
1365+
bindingset[char]
1366+
private string escapeUnicodeChar(string char) {
1367+
if isPrintable(char)
1368+
then result = char
1369+
else
1370+
if exists(to4digitHex(any(int i | i.toUnicode() = char)))
1371+
then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
1372+
else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
1373+
}
1374+
1375+
/** Holds if `char` is an easily printable char, or whitespace. */
1376+
private predicate isPrintable(string char) {
1377+
exists(ascii(char))
1378+
or
1379+
char = "\n\r\t".charAt(_)
13121380
}
13131381

13141382
/**

shared/regex/qlpack.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ version: 0.1.3-dev
33
groups: shared
44
library: true
55
dependencies:
6+
codeql/util: ${workspace}
67
warnOnImplicitThis: true

0 commit comments

Comments
 (0)