Skip to content

Commit b99c075

Browse files
authored
Merge pull request github#6460 from yoff/python-regex-parsing-consistency-checks
Python: Add regex parsing consistency checks
2 parents 1dc712f + a01fca5 commit b99c075

17 files changed

+583
-17
lines changed

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,29 @@ private import semmle.python.regex
77
* An element containing a regular expression term, that is, either
88
* a string literal (parsed as a regular expression)
99
* or another regular expression term.
10+
*
11+
* For sequences and alternations, we require at least one child.
12+
* Otherwise, we wish to represent the term differently.
13+
* This avoids multiple representations of the same term.
1014
*/
1115
newtype TRegExpParent =
1216
/** A string literal used as a regular expression */
1317
TRegExpLiteral(Regex re) or
1418
/** A quantified term */
1519
TRegExpQuantifier(Regex re, int start, int end) { re.qualifiedItem(start, end, _, _) } or
1620
/** A sequence term */
17-
TRegExpSequence(Regex re, int start, int end) { re.sequence(start, end) } or
18-
/** An alternatio term */
19-
TRegExpAlt(Regex re, int start, int end) { re.alternation(start, end) } or
21+
TRegExpSequence(Regex re, int start, int end) {
22+
re.sequence(start, end) and
23+
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
24+
} or
25+
/** An alternation term */
26+
TRegExpAlt(Regex re, int start, int end) {
27+
re.alternation(start, end) and
28+
exists(int part_end |
29+
re.alternationOption(start, end, start, part_end) and
30+
part_end < end
31+
) // if an alternation does not have more than one element, it should be treated as that element instead.
32+
} or
2033
/** A character class term */
2134
TRegExpCharacterClass(Regex re, int start, int end) { re.charSet(start, end) } or
2235
/** A character range term */
@@ -93,8 +106,7 @@ class RegExpTerm extends RegExpParent {
93106
or
94107
this = TRegExpQuantifier(re, start, end)
95108
or
96-
this = TRegExpSequence(re, start, end) and
97-
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
109+
this = TRegExpSequence(re, start, end)
98110
or
99111
this = TRegExpSpecialChar(re, start, end)
100112
}
@@ -341,10 +353,7 @@ class RegExpRange extends RegExpQuantifier {
341353
* This is a sequence with the elements `(ECMA|Java)` and `Script`.
342354
*/
343355
class RegExpSequence extends RegExpTerm, TRegExpSequence {
344-
RegExpSequence() {
345-
this = TRegExpSequence(re, start, end) and
346-
exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
347-
}
356+
RegExpSequence() { this = TRegExpSequence(re, start, end) }
348357

349358
override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) }
350359

python/ql/lib/semmle/python/regex.qll

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -369,12 +369,12 @@ abstract class RegexString extends Expr {
369369
// hex value \xhh
370370
this.getChar(start + 1) = "x" and end = start + 4
371371
or
372-
// octal value \ooo
372+
// octal value \o, \oo, or \ooo
373373
end in [start + 2 .. start + 4] and
374-
this.getText().substring(start + 1, end).toInt() >= 0 and
374+
forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and
375375
not (
376376
end < start + 4 and
377-
exists(this.getText().substring(start + 1, end + 1).toInt())
377+
this.isOctal(end)
378378
)
379379
or
380380
// 16-bit hex value \uhhhh
@@ -392,6 +392,9 @@ abstract class RegexString extends Expr {
392392
)
393393
}
394394

395+
pragma[inline]
396+
private predicate isOctal(int index) { this.getChar(index) = [0 .. 7].toString() }
397+
395398
/** Holds if `index` is inside a character set. */
396399
predicate inCharSet(int index) {
397400
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
@@ -690,6 +693,7 @@ abstract class RegexString extends Expr {
690693

691694
private predicate numbered_backreference(int start, int end, int value) {
692695
this.escapingChar(start) and
696+
// starting with 0 makes it an octal escape
693697
not this.getChar(start + 1) = "0" and
694698
exists(string text, string svalue, int len |
695699
end = start + len and
@@ -698,8 +702,16 @@ abstract class RegexString extends Expr {
698702
|
699703
svalue = text.substring(start + 1, start + len) and
700704
value = svalue.toInt() and
701-
not exists(text.substring(start + 1, start + len + 1).toInt()) and
702-
value > 0
705+
// value is composed of digits
706+
forall(int i | i in [start + 1 .. start + len - 1] | this.getChar(i) = [0 .. 9].toString()) and
707+
// a longer reference is not possible
708+
not (
709+
len = 2 and
710+
exists(text.substring(start + 1, start + len + 1).toInt())
711+
) and
712+
// 3 octal digits makes it an octal escape
713+
not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i))
714+
// TODO: Inside a character set, all numeric escapes are treated as characters.
703715
)
704716
}
705717

python/ql/test/library-tests/regex/Alternation.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@
1919
| x\| | 0 | 2 | x\| | 0 | 1 | x |
2020
| x\| | 0 | 2 | x\| | 2 | 2 | |
2121
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 0 | 1 | x |
22-
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |
22+
| x\|(?<!\\w)l | 0 | 10 | x\|(?<!\\w)l | 2 | 10 | (?<!\\w)l |

python/ql/test/library-tests/regex/Characters.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
| [^A-Z] | 2 | 3 |
5353
| [^A-Z] | 4 | 5 |
5454
| [^]] | 2 | 3 |
55+
| \\+0 | 0 | 2 |
56+
| \\+0 | 2 | 3 |
5557
| \\A[+-]?\\d+ | 0 | 2 |
5658
| \\A[+-]?\\d+ | 3 | 4 |
5759
| \\A[+-]?\\d+ | 4 | 5 |

python/ql/test/library-tests/regex/Consistency.expected

Whitespace-only changes.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/**
2+
* Flags regular expressions that are parsed ambigously
3+
*/
4+
5+
import python
6+
import semmle.python.regex
7+
8+
from string str, Location loc, int counter
9+
where
10+
counter = strictcount(Regex term | term.getLocation() = loc and term.getText() = str) and
11+
counter > 1
12+
select str, counter, loc

python/ql/test/library-tests/regex/FirstLast.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
| [^A-Z] | last | 0 | 6 |
4343
| [^]] | first | 0 | 4 |
4444
| [^]] | last | 0 | 4 |
45+
| \\+0 | first | 0 | 2 |
46+
| \\+0 | last | 2 | 3 |
4547
| \\A[+-]?\\d+ | first | 0 | 2 |
4648
| \\A[+-]?\\d+ | last | 7 | 9 |
4749
| \\A[+-]?\\d+ | last | 7 | 10 |

python/ql/test/library-tests/regex/Regex.expected

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@
113113
| [^]] | char | 2 | 3 |
114114
| [^]] | char-set | 0 | 4 |
115115
| [^]] | sequence | 0 | 4 |
116+
| \\+0 | char | 0 | 2 |
117+
| \\+0 | char | 2 | 3 |
118+
| \\+0 | sequence | 0 | 3 |
116119
| \\A[+-]?\\d+ | char | 0 | 2 |
117120
| \\A[+-]?\\d+ | char | 3 | 4 |
118121
| \\A[+-]?\\d+ | char | 4 | 5 |

python/ql/test/library-tests/regex/charRangeTest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424

2525
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
2626

27-
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
27+
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
28+
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
2829

2930
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8
3031

python/ql/test/library-tests/regex/escapedCharacterTest.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
re.compile(r'[--\-]') #$ escapedCharacter=3:5
1111
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
1212
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
13-
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
13+
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
14+
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
1415
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
16+
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
1517

1618
#ODASA-3985
1719
#Half Surrogate pairs
@@ -21,3 +23,9 @@
2123

2224
#Misparsed on LGTM
2325
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
26+
27+
#Non-raw string
28+
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
29+
30+
#Backreference confusion
31+
re.compile(r'\+0') #$ escapedCharacter=0:2

0 commit comments

Comments
 (0)