Skip to content

Commit 5d4b542

Browse files
committed
escape unicode chars in overly-large-range
1 parent 7f5f25c commit 5d4b542

File tree

5 files changed

+87
-76
lines changed

5 files changed

+87
-76
lines changed

python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
| test.py:25:32:25:34 | 7-F | Suspicious character range that is equivalent to [7-9:;<=>?@A-F]. |
99
| test.py:27:36:27:38 | 0-9 | Suspicious character range that overlaps with \\d in the same character class. |
1010
| test.py:29:39:29:41 | .-? | Suspicious character range that overlaps with \\w in the same character class, and is equivalent to [.\\/0-9:;<=>?]. |
11+
| test.py:31:30:31:32 | \ufffd-\ufffd | Suspicious character range that overlaps with \\ufffd-\\ufffd in the same character class. |

python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,5 @@
2727
overlapsWithClass1 = re.compile(r'[0-9\d]') # NOT OK
2828

2929
overlapsWithClass2 = re.compile(r'[\w,.-?:*+]') # NOT OK
30+
31+
unicodeStuff = re.compile('[\U0001D173-\U0001D17A\U000E0020-\U000E007F\U000e0001]') # NOT OK

shared/regex/codeql/regex/OverlyLargeRangeQuery.qll

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ private import RegexTreeView
99
*/
1010
module Make<RegexTreeViewSig TreeImpl> {
1111
private import TreeImpl
12+
private import codeql.util.Strings as Strings
1213

1314
/**
1415
* Gets a rank for `range` that is unique for ranges in the same file.
@@ -213,7 +214,7 @@ module Make<RegexTreeViewSig TreeImpl> {
213214
bindingset[char]
214215
private string escape(string char) {
215216
exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" |
216-
if char.regexpMatch(reg) then result = "\\" + char else result = char
217+
if char.regexpMatch(reg) then result = "\\" + char else result = Strings::escape(char)
217218
)
218219
}
219220

@@ -257,14 +258,15 @@ module Make<RegexTreeViewSig TreeImpl> {
257258
or
258259
priority = 1 and
259260
exists(RegExpCharacterRange other |
260-
reason = "overlaps with " + other + " in the same character class" and
261+
reason = "overlaps with " + Strings::escape(other.toString()) + " in the same character class" and
261262
rankRange(result) < rankRange(other) and
262263
overlap(result, other)
263264
)
264265
or
265266
priority = 2 and
266267
exists(RegExpCharacterClassEscape escape |
267-
reason = "overlaps with " + escape + " in the same character class" and
268+
reason =
269+
"overlaps with " + escapeRegExpCharacterClassEscape(escape) + " in the same character class" and
268270
overlapsWithCharEscape(result, escape)
269271
)
270272
or
@@ -276,6 +278,13 @@ module Make<RegexTreeViewSig TreeImpl> {
276278
)
277279
}
278280

281+
pragma[inline]
282+
private string escapeRegExpCharacterClassEscape(RegExpCharacterClassEscape escape) {
283+
if escape.toString().matches("%-%")
284+
then result = Strings::escape(escape.toString()) // might contain unicode characters
285+
else result = escape.toString() // just a plain `\d` or `\w` etc. Those are already escaped.
286+
}
287+
279288
/** Holds if `range` matches suspiciously many characters. */
280289
predicate problem(RegExpCharacterRange range, string reason) {
281290
reason =

shared/regex/codeql/regex/nfa/NfaUtils.qll

Lines changed: 4 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
private import codeql.regex.RegexTreeView
66
private import codeql.util.Numbers
7+
private import codeql.util.Strings
78

89
/**
910
* Classes and predicates that create an NFA and various algorithms for working with it.
@@ -15,34 +16,7 @@ module Make<RegexTreeViewSig TreeImpl> {
1516
* Gets the char after `c` (from a simplified ASCII table).
1617
*/
1718
private string nextChar(string c) {
18-
exists(int code | code = ascii(c) | code + 1 = ascii(result))
19-
}
20-
21-
/**
22-
* Gets the `i`th codepoint in `s`.
23-
*/
24-
bindingset[s]
25-
private string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
26-
27-
/**
28-
* Gets the length of `s` in codepoints.
29-
*/
30-
bindingset[str]
31-
private int getCodepointLength(string str) {
32-
result = str.regexpReplaceAll("(.|\\s)", "x").length()
33-
}
34-
35-
/**
36-
* Gets an approximation for the ASCII code for `char`.
37-
* Only the easily printable chars are included (so no newline, tab, null, etc).
38-
*/
39-
private int ascii(string char) {
40-
char =
41-
rank[result](string c |
42-
c =
43-
"! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
44-
.charAt(_)
45-
)
19+
exists(int code | code = asciiPrintable(c) | code + 1 = asciiPrintable(result))
4620
}
4721

4822
/**
@@ -394,7 +368,7 @@ module Make<RegexTreeViewSig TreeImpl> {
394368
* Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`.
395369
*/
396370
string getARelevantChar() {
397-
exists(ascii(result))
371+
exists(asciiPrintable(result))
398372
or
399373
exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
400374
or
@@ -1191,7 +1165,7 @@ module Make<RegexTreeViewSig TreeImpl> {
11911165
private string relevant(RegExpRoot root) {
11921166
root = relevantRoot() and
11931167
(
1194-
exists(ascii(result)) and exists(root)
1168+
exists(asciiPrintable(result)) and exists(root)
11951169
or
11961170
exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _))
11971171
or
@@ -1322,49 +1296,6 @@ module Make<RegexTreeViewSig TreeImpl> {
13221296
)
13231297
}
13241298

1325-
/**
1326-
* Gets the result of backslash-escaping newlines, carriage-returns and
1327-
* backslashes in `s`.
1328-
*/
1329-
bindingset[s]
1330-
private string escape(string s) {
1331-
result =
1332-
escapeUnicodeString(s.replaceAll("\\", "\\\\")
1333-
.replaceAll("\n", "\\n")
1334-
.replaceAll("\r", "\\r")
1335-
.replaceAll("\t", "\\t"))
1336-
}
1337-
1338-
/**
1339-
* Gets a string where the unicode characters in `s` have been escaped.
1340-
*/
1341-
bindingset[s]
1342-
private string escapeUnicodeString(string s) {
1343-
result =
1344-
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
1345-
}
1346-
1347-
/**
1348-
* Gets a unicode escaped string for `char`.
1349-
* If `char` is a printable char, then `char` is returned.
1350-
*/
1351-
bindingset[char]
1352-
private string escapeUnicodeChar(string char) {
1353-
if isPrintable(char)
1354-
then result = char
1355-
else
1356-
if exists(to4digitHex(any(int i | i.toUnicode() = char)))
1357-
then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
1358-
else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
1359-
}
1360-
1361-
/** Holds if `char` is easily printable char, or whitespace. */
1362-
private predicate isPrintable(string char) {
1363-
exists(ascii(char))
1364-
or
1365-
char = "\n\r\t".charAt(_)
1366-
}
1367-
13681299
/**
13691300
* Gets `str` with the last `i` characters moved to the front.
13701301
*

shared/util/codeql/util/Strings.qll

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
private import Numbers
2+
3+
/**
4+
* Gets the result of backslash-escaping newlines, carriage-returns, backslashes, and unicode characters in `s`.
5+
*/
6+
bindingset[s]
7+
string escape(string s) {
8+
result =
9+
escapeUnicodeString(s.replaceAll("\\", "\\\\")
10+
.replaceAll("\n", "\\n")
11+
.replaceAll("\r", "\\r")
12+
.replaceAll("\t", "\\t"))
13+
}
14+
15+
/**
16+
* Gets a string where the unicode characters in `s` have been escaped.
17+
*/
18+
bindingset[s]
19+
private string escapeUnicodeString(string s) {
20+
result =
21+
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
22+
}
23+
24+
/**
25+
* Gets a unicode escaped string for `char`.
26+
* If `char` is a printable char, then `char` is returned.
27+
*/
28+
bindingset[char]
29+
private string escapeUnicodeChar(string char) {
30+
if isPrintable(char)
31+
then result = char
32+
else
33+
if exists(to4digitHex(any(int i | i.toUnicode() = char)))
34+
then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
35+
else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
36+
}
37+
38+
/** Holds if `char` is easily printable char, or whitespace. */
39+
private predicate isPrintable(string char) {
40+
exists(asciiPrintable(char))
41+
or
42+
char = "\n\r\t".charAt(_)
43+
}
44+
45+
/**
46+
* Gets the `i`th codepoint in `s`.
47+
*/
48+
bindingset[s]
49+
string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
50+
51+
/**
52+
* Gets the length of `s` in codepoints.
53+
*/
54+
bindingset[str]
55+
int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
56+
57+
/**
58+
* Gets the ASCII code for `char`.
59+
* Only the easily printable chars are included (so no newline, tab, null, etc).
60+
*/
61+
int asciiPrintable(string char) {
62+
char =
63+
rank[result](string c |
64+
c =
65+
"! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
66+
.charAt(_)
67+
)
68+
}

0 commit comments

Comments
 (0)