Skip to content

Commit c08f94e

Browse files
committed
Python: Fix parsing of octal escapes
1 parent 34b054f commit c08f94e

File tree

9 files changed

+42
-17
lines changed

9 files changed

+42
-17
lines changed

python/ql/src/semmle/python/RegexTreeView.qll

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,11 @@ class RegExpTerm extends RegExpParent {
7575
int end;
7676

7777
RegExpTerm() {
78-
this = TRegExpAlt(re, start, end)
78+
this = TRegExpAlt(re, start, end) and
79+
exists(int part_end |
80+
re.alternationOption(start, end, start, part_end) and
81+
part_end < end
82+
) // if an alternation does not have more than one element, it should be treated as that element instead.
7983
or
8084
this = TRegExpBackRef(re, start, end)
8185
or

python/ql/src/semmle/python/regex.qll

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -371,10 +371,13 @@ abstract class RegexString extends Expr {
371371
or
372372
// octal value \ooo
373373
end in [start + 2 .. start + 4] and
374-
this.getText().substring(start + 1, end).toInt() >= 0 and
374+
// this.isOctal([start + 1 .. end]) and
375+
forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and
376+
// this.getText().substring(start + 1, end).toInt() >= 0 and
375377
not (
376378
end < start + 4 and
377-
exists(this.getText().substring(start + 1, end + 1).toInt())
379+
this.isOctal(end) //and
380+
// exists(this.getText().substring(start + 1, end + 1).toInt())
378381
)
379382
or
380383
// 16-bit hex value \uhhhh
@@ -392,6 +395,11 @@ abstract class RegexString extends Expr {
392395
)
393396
}
394397

398+
pragma[inline]
399+
private predicate isOctal(int index) {
400+
this.getChar(index) in ["0", "1", "2", "3", "4", "5", "6", "7"]
401+
}
402+
395403
/** Holds if `index` is inside a character set. */
396404
predicate inCharSet(int index) {
397405
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
@@ -690,6 +698,7 @@ abstract class RegexString extends Expr {
690698

691699
private predicate numbered_backreference(int start, int end, int value) {
692700
this.escapingChar(start) and
701+
// starting with 0 makes it an octal escape
693702
not this.getChar(start + 1) = "0" and
694703
exists(string text, string svalue, int len |
695704
end = start + len and
@@ -698,8 +707,18 @@ abstract class RegexString extends Expr {
698707
|
699708
svalue = text.substring(start + 1, start + len) and
700709
value = svalue.toInt() and
701-
not exists(text.substring(start + 1, start + len + 1).toInt()) and
702-
value > 0
710+
// value is composed of digits
711+
forall(int i | i in [start + 1 .. start + len - 1] |
712+
this.getChar(i) in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
713+
) and
714+
// a longer reference is not possible
715+
not (
716+
len = 2 and
717+
exists(text.substring(start + 1, start + len + 1).toInt())
718+
) and
719+
// 3 octal digits makes it an octal escape
720+
not forall(int i | i in [start + 1 .. start + 4] | this.isOctal(i))
721+
// TODO: Inside a character set, all numeric escapes are treated as characters.
703722
)
704723
}
705724

python/ql/test/library-tests/regex/Alternation.expected

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
| (?P<name>[\\w]+)\| | 0 | 16 | (?P<name>[\\w]+)\| | 16 | 16 | |
99
| (\\033\|~{) | 1 | 8 | \\033\|~{ | 1 | 5 | \\033 |
1010
| (\\033\|~{) | 1 | 8 | \\033\|~{ | 6 | 8 | ~{ |
11-
| \\+0 | 0 | 3 | \\+0 | 0 | 2 | \\+ |
12-
| \\+0 | 0 | 3 | \\+0 | 0 | 3 | \\+0 |
1311
| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 0 | 11 | \\\|\\[\\][123] |
1412
| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 12 | 16 | \\{\\} |
1513
| \|x | 0 | 2 | \|x | 0 | 0 | |

python/ql/test/library-tests/regex/Characters.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
| [^A-Z] | 4 | 5 |
5454
| [^]] | 2 | 3 |
5555
| \\+0 | 0 | 2 |
56-
| \\+0 | 0 | 3 |
56+
| \\+0 | 2 | 3 |
5757
| \\A[+-]?\\d+ | 0 | 2 |
5858
| \\A[+-]?\\d+ | 3 | 4 |
5959
| \\A[+-]?\\d+ | 4 | 5 |

python/ql/test/library-tests/regex/FirstLast.expected

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,7 @@
4343
| [^]] | first | 0 | 4 |
4444
| [^]] | last | 0 | 4 |
4545
| \\+0 | first | 0 | 2 |
46-
| \\+0 | first | 0 | 3 |
47-
| \\+0 | last | 0 | 2 |
48-
| \\+0 | last | 0 | 3 |
46+
| \\+0 | last | 2 | 3 |
4947
| \\A[+-]?\\d+ | first | 0 | 2 |
5048
| \\A[+-]?\\d+ | last | 7 | 9 |
5149
| \\A[+-]?\\d+ | last | 7 | 10 |

python/ql/test/library-tests/regex/Regex.expected

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,7 @@
114114
| [^]] | char-set | 0 | 4 |
115115
| [^]] | sequence | 0 | 4 |
116116
| \\+0 | char | 0 | 2 |
117-
| \\+0 | char | 0 | 3 |
118-
| \\+0 | choice | 0 | 3 |
119-
| \\+0 | sequence | 0 | 2 |
117+
| \\+0 | char | 2 | 3 |
120118
| \\+0 | sequence | 0 | 3 |
121119
| \\A[+-]?\\d+ | char | 0 | 2 |
122120
| \\A[+-]?\\d+ | char | 3 | 4 |

python/ql/test/library-tests/regex/charRangeTest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424

2525
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
2626

27-
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
27+
re.compile(r'[\0-\09]') #$ charRange=1:3-4:6
28+
re.compile(r'[\0-\07]') #$ charRange=1:3-4:7
2829

2930
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8
3031

python/ql/test/library-tests/regex/escapedCharacterTest.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
re.compile(r'[--\-]') #$ escapedCharacter=3:5
1111
re.compile(r'[\--\-]') #$ escapedCharacter=1:3 escapedCharacter=4:6
1212
re.compile(r'[0\-9-A-Z]') #$ escapedCharacter=2:4
13-
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:7
13+
re.compile(r'[\0-\09]') #$ escapedCharacter=1:3 escapedCharacter=4:6
14+
re.compile(r'[\0-\07]') #$ escapedCharacter=1:3 escapedCharacter=4:7
1415
re.compile(r'[\0123-5]') #$ escapedCharacter=1:5
16+
re.compile(r'\1754\1854\17\18\07\08') #$ escapedCharacter=0:4 escapedCharacter=16:19 escapedCharacter=19:21
1517

1618
#ODASA-3985
1719
#Half Surrogate pairs
@@ -21,3 +23,9 @@
2123

2224
#Misparsed on LGTM
2325
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)") #$ escapedCharacter=0:2 escapedCharacter=16:18 escapedCharacter=18:20
26+
27+
#Non-raw string
28+
re_blank = re.compile('(\n|\r|\\s)*\n', re.M) #$ escapedCharacter=5:7
29+
30+
#Backreference confusion
31+
re.compile(r'\+0') #$ escapedCharacter=0:2
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
| \\+0 | 2 | test.py:2:18:2:23 | test.py:2 |

0 commit comments

Comments
 (0)