Skip to content

Commit 363dc49

Browse files
authored
Merge pull request github#14292 from github/max-schaefer/fix-python-regex-locations
Python: Improve source-location information for RegExpTerms.
2 parents 8f189cb + dfec162 commit 363dc49

File tree

11 files changed

+176
-41
lines changed

11 files changed

+176
-41
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: fix
3+
---
4+
* Subterms of regular expressions encoded as single-line string literals now have better source-location information.

python/ql/lib/semmle/python/regexp/RegexTreeView.qll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,10 +227,11 @@ module Impl implements RegexTreeViewSig {
227227
predicate hasLocationInfo(
228228
string filepath, int startline, int startcolumn, int endline, int endcolumn
229229
) {
230-
exists(int re_start |
230+
exists(int re_start, int prefix_len | prefix_len = re.getPrefix().length() |
231231
re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, _) and
232-
startcolumn = re_start + start + 4 and
233-
endcolumn = re_start + end + 3
232+
startcolumn = re_start + start + prefix_len and
233+
endcolumn = re_start + end + prefix_len - 1
234+
/* inclusive vs exclusive */
234235
)
235236
}
236237

python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ private module FindRegexMode {
101101
}
102102

103103
/**
104-
* DEPRECATED: Use `Regex` instead.
104+
* DEPRECATED: Use `RegExp` instead.
105105
*/
106106
deprecated class Regex = RegExp;
107107

@@ -327,6 +327,17 @@ class RegExp extends Expr instanceof StrConst {
327327
/** Gets the text of this regex */
328328
string getText() { result = super.getText() }
329329

330+
/**
331+
* Gets the prefix of this regex
332+
*
333+
* Examples:
334+
*
335+
* - The prefix of `'x*y'` is `'`.
336+
* - The prefix of `r''` is `r'`.
337+
* - The prefix of `r"""x*y"""` is `r"""`.
338+
*/
339+
string getPrefix() { result = super.getPrefix() }
340+
330341
/** Gets the `i`th character of this regex */
331342
string getChar(int i) { result = this.getText().charAt(i) }
332343

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
| locations.py | 14 | 2 |
2+
| locations.py | 19 | 3 |
3+
| locations.py | 24 | 3 |
4+
| locations.py | 29 | 4 |
5+
| locations.py | 34 | 4 |
6+
| locations.py | 39 | 5 |
7+
| locations.py | 44 | 5 |
8+
| locations.py | 49 | 6 |
9+
| locations.py | 54 | 2 |
10+
| locations.py | 54 | 23 |
11+
| locations.py | 59 | 2 |
12+
| locations.py | 59 | 23 |
13+
| locations.py | 65 | 2 |
14+
| locations.py | 65 | 23 |
15+
| locations.py | 72 | 6 |
16+
| locations.py | 72 | 27 |
17+
| locations.py | 80 | 3 |
18+
| locations.py | 85 | 5 |
19+
| locations.py | 90 | 2 |
20+
| locations.py | 90 | 23 |
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import semmle.python.regexp.RegexTreeView::RegexTreeView
2+
3+
from RegExpTerm t, string file, int line, int column
4+
where
5+
t.toString() = "[this]" and
6+
t.hasLocationInfo(file, line, column, _, _)
7+
select file, line, column
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import re
2+
3+
# This file contains tests for the regexp parser's location tracking.
4+
5+
# To keep the expected results manageable, we only test the locations of the
6+
# regexp term `[this]`, appearing in various kinds of regexps.
7+
#
8+
# To make the location information easier to understand, we generally put each
9+
# regexp on its own line, even though this is not idiomatic Python.
10+
# Comments indicate cases we currently do not handle correctly.
11+
12+
# plain string
13+
re.compile(
14+
'[this] is a test'
15+
)
16+
17+
# raw string
18+
re.compile(
19+
r'[this] is a test'
20+
)
21+
22+
# byte string
23+
re.compile(
24+
b'[this] is a test'
25+
)
26+
27+
# byte raw string
28+
re.compile(
29+
br'[this] is a test'
30+
)
31+
32+
# multiline string
33+
re.compile(
34+
'''[this] is a test'''
35+
)
36+
37+
# multiline raw string
38+
re.compile(
39+
r'''[this] is a test'''
40+
)
41+
42+
# multiline byte string
43+
re.compile(
44+
b'''[this] is a test'''
45+
)
46+
47+
# multiline byte raw string
48+
re.compile(
49+
br'''[this] is a test'''
50+
)
51+
52+
# plain string with multiple parts (second [this] gets wrong column: 23 instead of 26)
53+
re.compile(
54+
'[this] is a test' ' and [this] is another test'
55+
)
56+
57+
# plain string with multiple parts across lines (second [this] gets wrong location: 59:23 instead of 60:7)
58+
re.compile(
59+
'[this] is a test'
60+
' and [this] is another test'
61+
)
62+
63+
# plain string with multiple parts across lines and comments (second [this] gets wrong location: 65:23 instead of 67:7)
64+
re.compile(
65+
'[this] is a test'
66+
# comment
67+
' and [this] is another test'
68+
)
69+
70+
# actual multiline string (both [this]s get wrong location: 72:6 and 72:27 instead of 73:1 and 74:5)
71+
re.compile(
72+
r'''
73+
[this] is a test
74+
and [this] is another test
75+
'''
76+
)
77+
78+
# plain string with escape sequences ([this] gets wrong location: 80:3 instead of 80:4)
79+
re.compile(
80+
'\t[this] is a test'
81+
)
82+
83+
# raw string with escape sequences
84+
re.compile(
85+
r'\A[this] is a test'
86+
)
87+
88+
# plain string with escaped newline (second [this] gets wrong location: 90:23 instead of 91:6)
89+
re.compile(
90+
'[this] is a test\
91+
and [this] is another test'
92+
)
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
| hosttest.py:6:31:6:53 | (www\|beta).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | hosttest.py:6:27:6:51 | ControlFlowNode for Str | here |
1+
| hosttest.py:6:28:6:50 | (www\|beta).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | hosttest.py:6:27:6:51 | ControlFlowNode for Str | here |
Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
| test.py:3:29:3:31 | 0-9 | Suspicious character range that overlaps with 3-5 in the same character class. |
2-
| test.py:5:31:5:33 | A-z | Suspicious character range that overlaps with A-Z in the same character class, and is equivalent to [A-Z\\[\\\\\\]^_`a-z]. |
3-
| test.py:7:28:7:30 | z-a | Suspicious character range that is empty. |
4-
| test.py:17:38:17:40 | A-f | Suspicious character range that overlaps with a-f in the same character class, and is equivalent to [A-Z\\[\\\\\\]^_`a-f]. |
5-
| test.py:19:30:19:32 | $-` | Suspicious character range that is equivalent to [$%&'()*+,\\-.\\/0-9:;<=>?@A-Z\\[\\\\\\]^_`]. |
6-
| test.py:21:43:21:45 | +-< | Suspicious character range that is equivalent to [+,\\-.\\/0-9:;<]. |
7-
| test.py:23:47:23:49 | .-_ | Suspicious character range that overlaps with 1-9 in the same character class, and is equivalent to [.\\/0-9:;<=>?@A-Z\\[\\\\\\]^_]. |
8-
| test.py:25:34:25:36 | 7-F | Suspicious character range that is equivalent to [7-9:;<=>?@A-F]. |
9-
| test.py:27:38:27:40 | 0-9 | Suspicious character range that overlaps with \\d in the same character class. |
10-
| test.py:29:41:29:43 | .-? | Suspicious character range that overlaps with \\w in the same character class, and is equivalent to [.\\/0-9:;<=>?]. |
1+
| test.py:3:27:3:29 | 0-9 | Suspicious character range that overlaps with 3-5 in the same character class. |
2+
| test.py:5:29:5:31 | A-z | Suspicious character range that overlaps with A-Z in the same character class, and is equivalent to [A-Z\\[\\\\\\]^_`a-z]. |
3+
| test.py:7:26:7:28 | z-a | Suspicious character range that is empty. |
4+
| test.py:17:36:17:38 | A-f | Suspicious character range that overlaps with a-f in the same character class, and is equivalent to [A-Z\\[\\\\\\]^_`a-f]. |
5+
| test.py:19:28:19:30 | $-` | Suspicious character range that is equivalent to [$%&'()*+,\\-.\\/0-9:;<=>?@A-Z\\[\\\\\\]^_`]. |
6+
| test.py:21:41:21:43 | +-< | Suspicious character range that is equivalent to [+,\\-.\\/0-9:;<]. |
7+
| test.py:23:45:23:47 | .-_ | Suspicious character range that overlaps with 1-9 in the same character class, and is equivalent to [.\\/0-9:;<=>?@A-Z\\[\\\\\\]^_]. |
8+
| test.py:25:32:25:34 | 7-F | Suspicious character range that is equivalent to [7-9:;<=>?@A-F]. |
9+
| test.py:27:36:27:38 | 0-9 | Suspicious character range that overlaps with \\d in the same character class. |
10+
| test.py:29:39:29:41 | .-? | Suspicious character range that overlaps with \\w in the same character class, and is equivalent to [.\\/0-9:;<=>?]. |
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
| test.py:8:12:8:23 | Str | test.py:8:21:8:23 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
2-
| test.py:9:14:9:29 | Str | test.py:9:27:9:29 | \\d+ | Strings starting with '0.9' and with many repetitions of '99' can start matching anywhere after the start of the preceeding \\d+ |
3-
| test.py:11:22:11:33 | Str | test.py:11:31:11:33 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
4-
| test.py:18:14:18:25 | Str | test.py:18:23:18:25 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
5-
| test.py:20:23:20:274 | Str | test.py:20:273:20:274 | .* | Strings starting with 'AAAAAAAAAAAAAAAAAAAABBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' and with many repetitions of 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' can start matching anywhere after the start of the preceeding (AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)C.*Y |
1+
| test.py:8:12:8:23 | Str | test.py:8:19:8:21 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
2+
| test.py:9:14:9:29 | Str | test.py:9:25:9:27 | \\d+ | Strings starting with '0.9' and with many repetitions of '99' can start matching anywhere after the start of the preceeding \\d+ |
3+
| test.py:11:22:11:33 | Str | test.py:11:29:11:31 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
4+
| test.py:18:14:18:25 | Str | test.py:18:21:18:23 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ |
5+
| test.py:20:23:20:274 | Str | test.py:20:271:20:272 | .* | Strings starting with 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' and with many repetitions of 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' can start matching anywhere after the start of the preceeding (AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)(AA\|BB)C.*Y |

python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialReDoS.expected

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ nodes
2727
| test.py:21:18:21:21 | ControlFlowNode for text | semmle.label | ControlFlowNode for text |
2828
subpaths
2929
#select
30-
| test.py:8:30:8:33 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:8:30:8:33 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:8:21:8:23 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
31-
| test.py:9:32:9:35 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:9:32:9:35 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings starting with '0.9' and with many repetitions of '99'. | test.py:9:27:9:29 | \\d+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
32-
| test.py:12:17:12:20 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:12:17:12:20 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:11:31:11:33 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
33-
| test.py:16:24:16:30 | ControlFlowNode for my_text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:16:24:16:30 | ControlFlowNode for my_text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:18:23:18:25 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
34-
| test.py:21:18:21:21 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:21:18:21:21 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings starting with 'AAAAAAAAAAAAAAAAAAAABBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' and with many repetitions of 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC'. | test.py:20:273:20:274 | .* | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
30+
| test.py:8:30:8:33 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:8:30:8:33 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:8:19:8:21 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
31+
| test.py:9:32:9:35 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:9:32:9:35 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings starting with '0.9' and with many repetitions of '99'. | test.py:9:25:9:27 | \\d+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
32+
| test.py:12:17:12:20 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:12:17:12:20 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:11:29:11:31 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
33+
| test.py:16:24:16:30 | ControlFlowNode for my_text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:16:24:16:30 | ControlFlowNode for my_text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:18:21:18:23 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |
34+
| test.py:21:18:21:21 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:21:18:21:21 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings starting with 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC' and with many repetitions of 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC'. | test.py:20:271:20:272 | .* | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value |

0 commit comments

Comments
 (0)