Skip to content

Commit e5f07cc

Browse files
committed
Python: inline test of regex components
- Added naive implementation of `charRange` so the test can run. - Made predicates public as needed.
1 parent a1c38b7 commit e5f07cc

File tree

7 files changed

+190
-3
lines changed

7 files changed

+190
-3
lines changed

python/ql/src/semmle/python/regex.qll

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,26 @@ abstract class RegexString extends Expr {
143143
)
144144
}
145145

146+
/**
147+
* Holds if the character set starting at `charset_start` contains a character range
148+
* with lower bound found between `start` and `lower_end`
149+
* and upper bound found between `upper_start` and `end`.
150+
*/
151+
predicate charRange(int charset_start, int start, int lower_end, int upper_start, int end) {
152+
// mirror logic from `simpleCharacter`
153+
exists(int x, int y |
154+
this.charSet(charset_start, y) and
155+
this.char_set_start(charset_start, x)
156+
|
157+
x <= start and
158+
this.simpleCharacter(start, lower_end) and
159+
this.nonEscapedCharAt(lower_end) = "-" and
160+
lower_end + 1 = upper_start and
161+
this.simpleCharacter(upper_start, end) and
162+
end < y
163+
)
164+
}
165+
146166
predicate escapingChar(int pos) { this.escaping(pos) = true }
147167

148168
private boolean escaping(int pos) {
@@ -192,7 +212,12 @@ abstract class RegexString extends Expr {
192212
not exists(int i | start + 2 < i and i < end - 1 | this.getChar(i) = "}")
193213
}
194214

195-
private predicate escapedCharacter(int start, int end) {
215+
/**
216+
* Holds if an escaped character is found between `start` and `end`.
217+
* Escaped characters include hex values, octal values and named escapes,
218+
* but exclude backreferences.
219+
*/
220+
predicate escapedCharacter(int start, int end) {
196221
this.escapingChar(start) and
197222
not exists(this.getText().substring(start + 1, end + 1).toInt()) and
198223
(
@@ -221,10 +246,9 @@ abstract class RegexString extends Expr {
221246
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
222247
}
223248

224-
/*
249+
/**
225250
* 'simple' characters are any that don't alter the parsing of the regex.
226251
*/
227-
228252
private predicate simpleCharacter(int start, int end) {
229253
end = start + 1 and
230254
not this.charSet(start, _) and

python/ql/test/library-tests/regex/SubstructureTests.expected

Whitespace-only changes.
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import python
2+
import TestUtilities.InlineExpectationsTest
3+
private import semmle.python.regex
4+
5+
class CharacterSetTest extends InlineExpectationsTest {
6+
CharacterSetTest() { this = "CharacterSetTest" }
7+
8+
override string getARelevantTag() { result = "charSet" }
9+
10+
override predicate hasActualResult(Location location, string element, string tag, string value) {
11+
exists(location.getFile().getRelativePath()) and
12+
location.getFile().getBaseName() = "charSetTest.py" and
13+
exists(Regex re, int start, int end |
14+
re.charSet(start, end) and
15+
location = re.getLocation() and
16+
element = re.toString().substring(start, end) and
17+
value = start + ":" + end and
18+
tag = "charSet"
19+
)
20+
}
21+
}
22+
23+
class CharacterRangeTest extends InlineExpectationsTest {
24+
CharacterRangeTest() { this = "CharacterRangeTest" }
25+
26+
override string getARelevantTag() { result = "charRange" }
27+
28+
override predicate hasActualResult(Location location, string element, string tag, string value) {
29+
exists(location.getFile().getRelativePath()) and
30+
location.getFile().getBaseName() = "charRangeTest.py" and
31+
exists(Regex re, int start, int lower_end, int upper_start, int end |
32+
re.charRange(_, start, lower_end, upper_start, end) and
33+
location = re.getLocation() and
34+
element = re.toString().substring(start, end) and
35+
value = start + ":" + lower_end + "-" + upper_start + ":" + end and
36+
tag = "charRange"
37+
)
38+
}
39+
}
40+
41+
class EscapeTest extends InlineExpectationsTest {
42+
EscapeTest() { this = "EscapeTest" }
43+
44+
override string getARelevantTag() { result = "escapedCharacter" }
45+
46+
override predicate hasActualResult(Location location, string element, string tag, string value) {
47+
exists(location.getFile().getRelativePath()) and
48+
location.getFile().getBaseName() = "escapedCharacterTest.py" and
49+
exists(Regex re, int start, int end |
50+
re.escapedCharacter(start, end) and
51+
location = re.getLocation() and
52+
element = re.toString().substring(start, end) and
53+
value = start + ":" + end and
54+
tag = "escapedCharacter"
55+
)
56+
}
57+
}
58+
59+
class GroupTest extends InlineExpectationsTest {
60+
GroupTest() { this = "GroupTest" }
61+
62+
override string getARelevantTag() { result = "group" }
63+
64+
override predicate hasActualResult(Location location, string element, string tag, string value) {
65+
exists(location.getFile().getRelativePath()) and
66+
location.getFile().getBaseName() = "groupTest.py" and
67+
exists(Regex re, int start, int end |
68+
re.group(start, end) and
69+
location = re.getLocation() and
70+
element = re.toString().substring(start, end) and
71+
value = start + ":" + end and
72+
tag = "group"
73+
)
74+
}
75+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import re
2+
3+
re.compile(r'[]-[]') #$ MISSING: charRange=1:2-3:4
4+
re.compile(r'[---]') #$ MISSING: charRange=1:2-3:4
5+
re.compile(r'[\---]') #$ MISSING: charRange=1:3-4:5
6+
re.compile(r'[--\-]') #$ MISSING: charRange=1:2-3:5
7+
re.compile(r'[\--\-]') #$ MISSING: charRange=1:3-4:6
8+
re.compile(r'[0-9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=5:6-7:8
9+
re.compile(r'[0\-9-A-Z]') #$ MISSING: charRange=4:5-6:7
10+
re.compile(r'[0--9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=4:5-6:7
11+
12+
re.compile(r'[^A-Z]') #$ MISSING: charRange=2:3-4:5
13+
14+
re.compile(r'[\0-\09]') #$ MISSING: charRange=1:3-4:7
15+
16+
re.compile(r'[\0123-5]') #$ MISSING: charRange=5:6-7:8
17+
18+
19+
#Negative lookahead
20+
re.compile(r'(?!not-this)^[A-Z_]+$') #$ MISSING: charRange=14:15-16:17
21+
#Negative lookbehind
22+
re.compile(r'^[A-Z_]+$(?<!not-this)') #$ MISSING: charRange=2:3-4:5
23+
24+
25+
#OK -- ODASA-ODASA-3968
26+
re.compile('(?:[^%]|^)?%\((\w*)\)[a-z]') #$ MISSING: charRange=22:23-24:25
27+
28+
#ODASA-3985
29+
#Half Surrogate pairs
30+
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ MISSING: charRange=1:2-3:4 charRange=6:7-8:9
31+
#Outside BMP
32+
re.compile(u'[\U00010000-\U0010ffff]') #$ MISSING: charRange=1:2-3:4
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import re
2+
re.compile(r'\A[+-]?\d+') #$ MISSING: charSet=2:6
3+
re.compile(r'(?P<name>[\w]+)|') #$ MISSING: charSet=9:13
4+
re.compile(r'\|\[\][123]|\{\}') #$ MISSING: charSet=6:11
5+
re.compile(r'[^A-Z]') #$ MISSING: charSet=0:6
6+
re.compile("[]]") #$ charSet=0:3
7+
re.compile("[][]") #$ MISSING: charSet=0:4
8+
re.compile("[^][^]") #$ MISSING: charSet=0:6
9+
re.compile("[.][.]") #$ charSet=0:3 MISSING: charSet=3:6
10+
re.compile("[[]]") #$ charSet=0:3
11+
re.compile("[^]]") #$ MISSING: charSet=0:4
12+
re.compile("[^-]") #$ MISSING: charSet=0:4
13+
re.compile("[]-[]") #$ MISSING: charSet=0:5
14+
re.compile("[^]-[]") #$ MISSING: charSet=0:6
15+
16+
re.compile("]]][[[[]") #$ MISSING: charSet=3:8
17+
18+
19+
#ODASA-3985
20+
#Half Surrogate pairs
21+
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ MISSING: charSet=0:5 charSet=5:10
22+
#Outside BMP
23+
re.compile(u'[\U00010000-\U0010ffff]') #$ MISSING: charSet=0:5
24+
25+
#Misparsed on LGTM
26+
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ MISSING: charSet=10:14 charSet=28:32
27+
28+
# parses wrongly, sees this \|/ as a char set start
29+
re.compile(r'''(?:[\s;,"'<>(){}|[\]@=+*]|:(?![/\\]))+''') #$ MISSING: charSet=3:25 charSet=30:35
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import re
2+
3+
re.compile(r'\b') #$ escapedCharacter=0:2
4+
re.compile(r'''\b''') #$ escapedCharacter=0:2
5+
re.compile(r"\b") #$ escapedCharacter=0:2
6+
re.compile(u"\b") # not escape
7+
re.compile("\b") # not escape
8+
re.compile(r'\\\b') #$ escapedCharacter=0:2 MISSING: escapedCharacter=2:4
9+
re.compile(r'[\---]') #$ escapedCharacter=1:3
10+
re.compile(r'[--\-]') #$ MISSING: escapedCharacter=3:5
11+
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 MISSING: escapedCharacter=4:6
12+
re.compile(r'[0\-9-A-Z]') #$ MISSING: escapedCharacter=2:4
13+
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 MISSING: escapedCharacter=4:7
14+
re.compile(r'[\0123-5]') #$ MISSING: escapedCharacter=1:5
15+
16+
#ODASA-3985
17+
#Half Surrogate pairs
18+
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') # not escapes
19+
#Outside BMP
20+
re.compile(u'[\U00010000-\U0010ffff]') # not escapes
21+
22+
#Misparsed on LGTM
23+
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 MISSING: escapedCharacter=16:18 escapedCharacter=18:20
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import re
2+
3+
re.compile(r'(?P<first>\w+) (?P<second>\w+)') #$ MISSING: group=0:14 group=15:30
4+
re.compile(r'([)(])') #$ MISSING: group=0:6

0 commit comments

Comments
 (0)