Skip to content

Commit b1ad61e

Browse files
authored
Merge pull request github#14481 from erik-krogh/proper-codepoints
ReDoS: use the new codePointAt and codePointCount methods instead of regex hacks
2 parents 2ddcd1d + fa1e8ee commit b1ad61e

File tree

3 files changed

+34
-9
lines changed

3 files changed

+34
-9
lines changed

docs/codeql/ql-language-reference/ql-language-specification.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,6 +1993,10 @@ The following built-in predicates are members of type ``int``:
19931993
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
19941994
| ``toUnicode`` | string | | The result is the unicode character for the receiver seen as a unicode code point. |
19951995
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
1996+
| ``codePointAt`` | int | int | The result is the unicode code point at the index given by the argument. |
1997+
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
1998+
| ``codePointCount`` | int | int, int | The result is the number of unicode code points in the receiver between the given indices. |
1999+
+-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
19962000

19972001
The leftmost bit after ``bitShiftRightSigned`` depends on sign extension, whereas after ``bitShiftRight`` it is zero.
19982002

shared/regex/codeql/regex/nfa/NfaUtils.qll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,17 +164,17 @@ module Make<RegexTreeViewSig TreeImpl> {
164164
/** An input symbol corresponding to character `c`. */
165165
Char(string c) {
166166
c =
167-
getCodepointAt(any(RegexpCharacterConstant cc |
167+
getACodepoint(any(RegexpCharacterConstant cc |
168168
cc instanceof RelevantRegExpTerm and
169169
not isIgnoreCase(cc.getRootTerm())
170-
).getValue(), _)
170+
).getValue())
171171
or
172172
// normalize everything to lower case if the regexp is case insensitive
173173
c =
174174
any(RegexpCharacterConstant cc, string char |
175175
cc instanceof RelevantRegExpTerm and
176176
isIgnoreCase(cc.getRootTerm()) and
177-
char = getCodepointAt(cc.getValue(), _)
177+
char = getACodepoint(cc.getValue())
178178
|
179179
char.toLowerCase()
180180
)
@@ -370,7 +370,7 @@ module Make<RegexTreeViewSig TreeImpl> {
370370
string getARelevantChar() {
371371
exists(asciiPrintable(result))
372372
or
373-
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
373+
exists(RegexpCharacterConstant c | result = getACodepoint(c.getValue()))
374374
or
375375
classEscapeMatches(_, result)
376376
}
@@ -1258,7 +1258,7 @@ module Make<RegexTreeViewSig TreeImpl> {
12581258
* Gets a `char` that occurs in a `pump` string.
12591259
*/
12601260
private string getAProcessChar() {
1261-
result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
1261+
result = getACodepoint(any(string s | isReDoSCandidate(_, s)))
12621262
}
12631263
}
12641264

shared/util/codeql/util/Strings.qll

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ string escape(string s) {
1818
bindingset[s]
1919
private string escapeUnicodeString(string s) {
2020
result =
21-
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
21+
concat(int i, string char |
22+
char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
23+
|
24+
char order by i
25+
)
2226
}
2327

2428
/**
@@ -44,15 +48,32 @@ private predicate isPrintable(string char) {
4448

4549
/**
4650
* Gets the `i`th codepoint in `s`.
51+
* Unpaired surrogates are skipped.
52+
*/
53+
bindingset[s]
54+
string getCodepointAt(string s, int i) {
55+
// codePointAt returns the integer codePoint, so we need to convert to a string.
56+
// codePointAt yields an integer at the index of both the high and the low surrogate of a pair. The values that are not valid characters are filtered out by `toUnicode`, which leaves gaps in the indices, so we use rank to re-number the remaining characters.
57+
// rank is 1-indexed, so we need to offset for that to make this predicate 0-indexed.
58+
result =
59+
rank[i + 1](string char, int charIndex |
60+
char = s.codePointAt(charIndex).toUnicode()
61+
|
62+
char order by charIndex
63+
)
64+
}
65+
66+
/**
67+
* Gets any Unicode character that appears in `s`.
4768
*/
4869
bindingset[s]
49-
string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
70+
string getACodepoint(string s) { result = s.codePointAt(_).toUnicode() }
5071

5172
/**
52-
* Gets the length of `s` in codepoints.
73+
* Gets the number of Unicode code points in `str`, not counting unpaired surrogates.
5374
*/
5475
bindingset[str]
55-
int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
76+
int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }
5677

5778
/**
5879
* Gets the ASCII code for `char`.

0 commit comments

Comments
 (0)