Skip to content

Commit 0bce424

Browse files
committed
support arbitrary codepoints in NfaUtils.qll
1 parent 859e1bf commit 0bce424

File tree

4 files changed

+66
-23
lines changed

4 files changed

+66
-23
lines changed

javascript/ql/test/query-tests/Security/CWE-400/ReDoS/ReDoS.expected

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,3 +200,4 @@
200200
| tst.js:407:125:407:127 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '0/*' and containing many repetitions of ' ;0'. |
201201
| tst.js:411:15:411:19 | a{1,} | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
202202
| tst.js:413:25:413:35 | (\\u0000\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u0000' and containing many repetitions of '\\u0000'. |
203+
| tst.js:415:44:415:57 | (\ud83d\ude80\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u{1f680}' and containing many repetitions of '\\u{1f680}'. |

javascript/ql/test/query-tests/Security/CWE-400/ReDoS/tst.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,4 +410,6 @@ var good48 = /(\/(?:\/[\w.-]*)*){0,1}:([\w.-]+)/;
410410

411411
var bad99 = /(a{1,})*b/;
412412

413-
var unicode = /^\n\u0000(\u0000|.)+$/;
413+
var unicode = /^\n\u0000(\u0000|.)+$/;
414+
415+
var largeUnicode = new RegExp("^\n\u{1F680}(\u{1F680}|.)+X$");

shared/regex/codeql/regex/nfa/NfaUtils.qll

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
1818
exists(int code | code = ascii(c) | code + 1 = ascii(result))
1919
}
2020

21+
/**
22+
* Gets the `i`th codepoint in `s`.
23+
*/
24+
bindingset[s]
25+
private string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
26+
27+
/**
28+
* Gets the length of `s` in codepoints.
29+
*/
30+
bindingset[str]
31+
private int getCodepointLength(string str) {
32+
result = max(int m | exists(str.regexpFind("(.|\\s)", m - 1, _)) or m = 0)
33+
}
34+
2135
/**
2236
* Gets an approximation for the ASCII code for `char`.
2337
* Only the easily printable chars are included (so no newline, tab, null, etc).
@@ -190,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
190204
/** An input symbol corresponding to character `c`. */
191205
Char(string c) {
192206
c =
193-
any(RegexpCharacterConstant cc |
194-
cc instanceof RelevantRegExpTerm and
195-
not isIgnoreCase(cc.getRootTerm())
196-
).getValue().charAt(_)
207+
getCodepointAt(any(RegexpCharacterConstant cc |
208+
cc instanceof RelevantRegExpTerm and
209+
not isIgnoreCase(cc.getRootTerm())
210+
).getValue(), _)
197211
or
198212
// normalize everything to lower case if the regexp is case insensitive
199213
c =
200214
any(RegexpCharacterConstant cc, string char |
201215
cc instanceof RelevantRegExpTerm and
202216
isIgnoreCase(cc.getRootTerm()) and
203-
char = cc.getValue().charAt(_)
217+
char = getCodepointAt(cc.getValue(), _)
204218
|
205219
char.toLowerCase()
206220
)
@@ -396,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
396410
string getARelevantChar() {
397411
exists(ascii(result))
398412
or
399-
exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
413+
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
400414
or
401415
classEscapeMatches(_, result)
402416
}
@@ -702,16 +716,16 @@ module Make<RegexTreeViewSig TreeImpl> {
702716
q1 = Match(s, i) and
703717
(
704718
not isIgnoreCase(s.getRootTerm()) and
705-
lbl = Char(s.getValue().charAt(i))
719+
lbl = Char(getCodepointAt(s.getValue(), i))
706720
or
707721
// normalize everything to lower case if the regexp is case insensitive
708722
isIgnoreCase(s.getRootTerm()) and
709-
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
723+
exists(string c | c = getCodepointAt(s.getValue(), i) | lbl = Char(c.toLowerCase()))
710724
) and
711725
(
712726
q2 = Match(s, i + 1)
713727
or
714-
s.getValue().length() = i + 1 and
728+
getCodepointLength(s.getValue()) = i + 1 and
715729
q2 = after(s)
716730
)
717731
)
@@ -812,7 +826,7 @@ module Make<RegexTreeViewSig TreeImpl> {
812826
Match(RelevantRegExpTerm t, int i) {
813827
i = 0
814828
or
815-
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
829+
exists(getCodepointAt(t.(RegexpCharacterConstant).getValue(), i))
816830
} or
817831
/**
818832
* An accept state, where exactly the given input string is accepted.
@@ -1105,7 +1119,9 @@ module Make<RegexTreeViewSig TreeImpl> {
11051119
*/
11061120
predicate reachesOnlyRejectableSuffixes(State fork, string w) {
11071121
isReDoSCandidate(fork, w) and
1108-
forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and
1122+
forex(State next | next = process(fork, w, getCodepointLength(w) - 1) |
1123+
isLikelyRejectable(next)
1124+
) and
11091125
not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened.
11101126
}
11111127

@@ -1215,6 +1231,12 @@ module Make<RegexTreeViewSig TreeImpl> {
12151231
exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _))
12161232
}
12171233

1234+
// `process` can't use pragma[inline] predicates. So a materialized version of `getCodepointAt` is needed.
1235+
private string getCodePointAtForProcess(string str, int i) {
1236+
result = getCodepointAt(str, i) and
1237+
exists(getProcessPrevious(_, _, str))
1238+
}
1239+
12181240
/**
12191241
* Gets a state that can be reached from pumpable `fork` consuming all
12201242
* chars in `w` any number of times followed by the first `i+1` characters of `w`.
@@ -1224,7 +1246,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12241246
exists(State prev | prev = getProcessPrevious(fork, i, w) |
12251247
not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
12261248
exists(string char, InputSymbol sym |
1227-
char = w.charAt(i) and
1249+
char = getCodePointAtForProcess(w, i) and
12281250
deltaClosed(prev, sym, result) and
12291251
// noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
12301252
// Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
@@ -1246,7 +1268,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12461268
or
12471269
// repeat until fixpoint
12481270
i = 0 and
1249-
result = process(fork, w, w.length() - 1)
1271+
result = process(fork, w, getCodepointLength(w) - 1)
12501272
)
12511273
}
12521274

@@ -1262,7 +1284,9 @@ module Make<RegexTreeViewSig TreeImpl> {
12621284
/**
12631285
* Gets a `char` that occurs in a `pump` string.
12641286
*/
1265-
private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) }
1287+
private string getAProcessChar() {
1288+
result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
1289+
}
12661290
}
12671291

12681292
/**
@@ -1317,7 +1341,8 @@ module Make<RegexTreeViewSig TreeImpl> {
13171341
*/
13181342
bindingset[s]
13191343
private string escapeUnicodeString(string s) {
1320-
result = concat(int i, string char | char = escapeUnicodeChar(s.charAt(i)) | char order by i)
1344+
result =
1345+
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
13211346
}
13221347

13231348
/**
@@ -1328,7 +1353,10 @@ module Make<RegexTreeViewSig TreeImpl> {
13281353
private string escapeUnicodeChar(string char) {
13291354
if isPrintable(char)
13301355
then result = char
1331-
else result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
1356+
else
1357+
if exists(to4digitHex(any(int i | i.toUnicode() = char)))
1358+
then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
1359+
else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
13321360
}
13331361

13341362
/** Holds if `char` is easily printable char, or whitespace. */

shared/util/codeql/util/Numbers.qll

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ int parseHexInt(string hex) {
5050
sum(int index, string c |
5151
c = stripped.charAt(index)
5252
|
53-
sixteenToThe(stripped.length() - 1 - index) * toHex(c)
53+
sixteenToThe(stripped.length() - 1 - index) * charToHex(c)
5454
)
5555
)
5656
}
@@ -83,7 +83,7 @@ int parseOctalInt(string octal) {
8383
}
8484

8585
/** Gets the integer value of the `hex` char. */
86-
private int toHex(string hex) {
86+
private int charToHex(string hex) {
8787
hex = [0 .. 9].toString() and
8888
result = hex.toInt()
8989
or
@@ -107,11 +107,23 @@ bindingset[i]
107107
string to4digitHex(int i) {
108108
i >= 0 and
109109
i <= 65535 and
110+
exists(string hex | hex = toHex(i) |
111+
result = concat(int zeroes | zeroes = [1 .. 4 - hex.length()] | "0") + hex
112+
)
113+
}
114+
115+
/**
116+
* Gets a hex representation of `i`.
117+
*/
118+
bindingset[i]
119+
string toHex(int i) {
110120
result =
111-
"0123456789abcdef".charAt(i.bitShiftRight(12).bitAnd(15)) +
112-
"0123456789abcdef".charAt(i.bitShiftRight(8).bitAnd(15)) +
113-
"0123456789abcdef".charAt(i.bitShiftRight(4).bitAnd(15)) +
114-
"0123456789abcdef".charAt(i.bitAnd(15))
121+
// make the number with lots of preceding zeroes, then remove all preceding zeroes in a post-processing step
122+
concat(int shift |
123+
shift in [28, 24, 20, 16, 12, 8, 4, 0]
124+
|
125+
"0123456789abcdef".charAt(i.bitShiftRight(shift).bitAnd(15)) order by shift desc
126+
).regexpReplaceAll("^0*", "")
115127
}
116128

117129
/**

0 commit comments

Comments
 (0)