Skip to content

Commit 74ca1d0

Browse files
committed
Python: More precise regex parsing
1 parent e5f07cc commit 74ca1d0

File tree

5 files changed

+201
-66
lines changed

5 files changed

+201
-66
lines changed

python/ql/src/semmle/python/regex.qll

Lines changed: 160 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,57 @@ deprecated string mode_from_mode_object(Value obj) {
121121
abstract class RegexString extends Expr {
122122
RegexString() { (this instanceof Bytes or this instanceof Unicode) }
123123

124+
override string toString() {
125+
result = this.(Bytes).getText()
126+
or
127+
result = this.(Unicode).getText()
128+
}
129+
130+
/** Result is `true` for those `[` characters that actually mark the start of a char set, and `false` for those that do not. */
131+
boolean char_set_start(int pos) {
132+
exists(int index |
133+
char_set_delimiter(index, pos) = true and
134+
(
135+
index = 1 and result = true // if a '[' is first in the string (among brackets), it starts a char set
136+
or
137+
index > 1 and
138+
not char_set_delimiter(index - 1, _) = false and
139+
result = false
140+
or
141+
exists(int p1 |
142+
char_set_delimiter(index - 1, p1) = false and // if it is preceded by a closing bracket, it starts a char set
143+
if
144+
exists(int p2 |
145+
p1 = p2 + 1
146+
or
147+
this.getChar(p2 + 1) = "^" and
148+
p1 = p2 + 2
149+
|
150+
char_set_delimiter(index - 2, p2) = true // but the closing bracket only closes...
151+
)
152+
then
153+
exists(int p2 | char_set_delimiter(index - 2, p2) = true |
154+
result = char_set_start(p2).booleanNot() // ...if it is not the first in a char set
155+
)
156+
else result = true
157+
)
158+
)
159+
)
160+
}
161+
162+
/** Result denotes whether the `index`th unescaped bracket, found at position `pos`, is a left bracket (`true`) or a right bracket (`false`). */
163+
boolean char_set_delimiter(int index, int pos) {
164+
pos = rank[index](int p | this.nonEscapedCharAt(p) = "[" or this.nonEscapedCharAt(p) = "]") and
165+
(
166+
this.nonEscapedCharAt(pos) = "[" and result = true
167+
or
168+
this.nonEscapedCharAt(pos) = "]" and result = false
169+
)
170+
}
171+
172+
/** Holds if a character set starts between `start` and `end`. */
124173
predicate char_set_start(int start, int end) {
125-
this.nonEscapedCharAt(start) = "[" and
174+
this.char_set_start(start) = true and
126175
(
127176
this.getChar(start + 1) = "^" and end = start + 2
128177
or
@@ -143,23 +192,80 @@ abstract class RegexString extends Expr {
143192
)
144193
}
145194

195+
/** An indexed version of `char_set_token/3` */
196+
private predicate char_set_token(int charset_start, int index, int token_start, int token_end) {
197+
token_start =
198+
rank[index](int start, int end | this.char_set_token(charset_start, start, end) | start) and
199+
this.char_set_token(charset_start, token_start, token_end)
200+
}
201+
202+
/** Either a char or a - */
203+
private predicate char_set_token(int charset_start, int start, int end) {
204+
this.char_set_start(charset_start, start) and
205+
(
206+
this.escapedCharacter(start, end)
207+
or
208+
exists(this.nonEscapedCharAt(start)) and end = start + 1
209+
)
210+
or
211+
this.char_set_token(charset_start, _, start) and
212+
(
213+
this.escapedCharacter(start, end)
214+
or
215+
exists(this.nonEscapedCharAt(start)) and
216+
end = start + 1 and
217+
not this.getChar(start) = "]"
218+
)
219+
}
220+
221+
/**
222+
* Holds if the character set starting at `charset_start` contains either
223+
* a character or a range found between `start` and `end`.
224+
*/
225+
predicate char_set_child(int charset_start, int start, int end) {
226+
this.char_set_token(charset_start, start, end) and
227+
not exists(int range_start, int range_end |
228+
this.charRange(charset_start, range_start, _, _, range_end) and
229+
range_start <= start and
230+
range_end >= end
231+
)
232+
or
233+
this.charRange(charset_start, start, _, _, end)
234+
}
235+
146236
/**
147237
* Holds if the character set starting at `charset_start` contains a character range
148238
* with lower bound found between `start` and `lower_end`
149239
* and upper bound found between `upper_start` and `end`.
150240
*/
151241
predicate charRange(int charset_start, int start, int lower_end, int upper_start, int end) {
152-
// mirror logic from `simpleCharacter`
153-
exists(int x, int y |
154-
this.charSet(charset_start, y) and
155-
this.char_set_start(charset_start, x)
156-
|
157-
x <= start and
158-
this.simpleCharacter(start, lower_end) and
159-
this.nonEscapedCharAt(lower_end) = "-" and
160-
lower_end + 1 = upper_start and
161-
this.simpleCharacter(upper_start, end) and
162-
end < y
242+
exists(int index |
243+
this.charRangeEnd(charset_start, index) = true and
244+
this.char_set_token(charset_start, index - 2, start, lower_end) and
245+
this.char_set_token(charset_start, index, upper_start, end)
246+
)
247+
}
248+
249+
private boolean charRangeEnd(int charset_start, int index) {
250+
this.char_set_token(charset_start, index, _, _) and
251+
(
252+
index in [1, 2] and result = false
253+
or
254+
index > 2 and
255+
exists(int connector_start |
256+
this.char_set_token(charset_start, index - 1, connector_start, _) and
257+
this.nonEscapedCharAt(connector_start) = "-" and
258+
result =
259+
this.charRangeEnd(charset_start, index - 2)
260+
.booleanNot()
261+
.booleanAnd(this.charRangeEnd(charset_start, index - 1).booleanNot())
262+
)
263+
or
264+
not exists(int connector_start |
265+
this.char_set_token(charset_start, index - 1, connector_start, _) and
266+
this.nonEscapedCharAt(connector_start) = "-"
267+
) and
268+
result = false
163269
)
164270
}
165271

@@ -184,14 +290,14 @@ abstract class RegexString extends Expr {
184290

185291
string nonEscapedCharAt(int i) {
186292
result = this.getText().charAt(i) and
187-
not this.escapingChar(i - 1)
293+
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1])
188294
}
189295

190296
private predicate isOptionDivider(int i) { this.nonEscapedCharAt(i) = "|" }
191297

192-
private predicate isGroupEnd(int i) { this.nonEscapedCharAt(i) = ")" }
298+
private predicate isGroupEnd(int i) { this.nonEscapedCharAt(i) = ")" and not this.inCharSet(i) }
193299

194-
private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" }
300+
private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) }
195301

196302
predicate failedToParse(int i) {
197303
exists(this.getChar(i)) and
@@ -219,14 +325,18 @@ abstract class RegexString extends Expr {
219325
*/
220326
predicate escapedCharacter(int start, int end) {
221327
this.escapingChar(start) and
222-
not exists(this.getText().substring(start + 1, end + 1).toInt()) and
328+
not this.numbered_backreference(start, _, _) and
223329
(
224330
// hex value \xhh
225331
this.getChar(start + 1) = "x" and end = start + 4
226332
or
227333
// octal value \ooo
228334
end in [start + 2 .. start + 4] and
229-
exists(this.getText().substring(start + 1, end).toInt())
335+
this.getText().substring(start + 1, end).toInt() >= 0 and
336+
not (
337+
end < start + 4 and
338+
exists(this.getText().substring(start + 1, end + 1).toInt())
339+
)
230340
or
231341
// 16-bit hex value \uhhhh
232342
this.getChar(start + 1) = "u" and end = start + 6
@@ -238,11 +348,13 @@ abstract class RegexString extends Expr {
238348
or
239349
// escape not handled above, update when adding a new case
240350
not this.getChar(start + 1) in ["x", "u", "U", "N"] and
351+
not exists(this.getChar(start + 1).toInt()) and
241352
end = start + 2
242353
)
243354
}
244355

245-
private predicate inCharSet(int index) {
356+
/** Holds if `index` is inside a character set. */
357+
predicate inCharSet(int index) {
246358
exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2])
247359
}
248360

@@ -262,7 +374,7 @@ abstract class RegexString extends Expr {
262374
or
263375
start = z - 2
264376
or
265-
start > y and start < z - 2 and not c = "-"
377+
start > y and start < z - 2 and not this.charRange(_, _, start, end, _)
266378
)
267379
or
268380
not this.inCharSet(start) and
@@ -281,7 +393,8 @@ abstract class RegexString extends Expr {
281393
or
282394
this.escapedCharacter(start, end)
283395
) and
284-
not exists(int x, int y | this.group_start(x, y) and x <= start and y >= end)
396+
not exists(int x, int y | this.group_start(x, y) and x <= start and y >= end) and
397+
not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end)
285398
}
286399

287400
predicate normalCharacter(int start, int end) {
@@ -326,12 +439,13 @@ abstract class RegexString extends Expr {
326439
or
327440
this.negativeAssertionGroup(start, end)
328441
or
329-
positiveLookaheadAssertionGroup(start, end)
442+
this.positiveLookaheadAssertionGroup(start, end)
330443
or
331444
this.positiveLookbehindAssertionGroup(start, end)
332445
}
333446

334-
private predicate emptyGroup(int start, int end) {
447+
/** Holds if an empty group is found between `start` and `end`. */
448+
predicate emptyGroup(int start, int end) {
335449
exists(int endm1 | end = endm1 + 1 |
336450
this.group_start(start, endm1) and
337451
this.isGroupEnd(endm1)
@@ -364,13 +478,29 @@ abstract class RegexString extends Expr {
364478
)
365479
}
366480

367-
private predicate positiveLookaheadAssertionGroup(int start, int end) {
481+
/** Holds if a negative lookahead is found between `start` and `end` */
482+
predicate negativeLookaheadAssertionGroup(int start, int end) {
483+
exists(int in_start | this.negative_lookahead_assertion_start(start, in_start) |
484+
this.groupContents(start, end, in_start, _)
485+
)
486+
}
487+
488+
/** Holds if a negative lookbehind is found between `start` and `end` */
489+
predicate negativeLookbehindAssertionGroup(int start, int end) {
490+
exists(int in_start | this.negative_lookbehind_assertion_start(start, in_start) |
491+
this.groupContents(start, end, in_start, _)
492+
)
493+
}
494+
495+
/** Holds if a positive lookahead is found between `start` and `end` */
496+
predicate positiveLookaheadAssertionGroup(int start, int end) {
368497
exists(int in_start | this.lookahead_assertion_start(start, in_start) |
369498
this.groupContents(start, end, in_start, _)
370499
)
371500
}
372501

373-
private predicate positiveLookbehindAssertionGroup(int start, int end) {
502+
/** Holds if a positive lookbehind is found between `start` and `end` */
503+
predicate positiveLookbehindAssertionGroup(int start, int end) {
374504
exists(int in_start | this.lookbehind_assertion_start(start, in_start) |
375505
this.groupContents(start, end, in_start, _)
376506
)
@@ -429,6 +559,8 @@ abstract class RegexString extends Expr {
429559
this.getChar(start + 1) = "?" and
430560
this.getChar(start + 2) = "P" and
431561
this.getChar(start + 3) = "=" and
562+
// Should this be looking for unescaped ")"?
563+
// TODO: test this
432564
end = min(int i | i > start + 4 and this.getChar(i) = "?")
433565
}
434566

@@ -519,6 +651,7 @@ abstract class RegexString extends Expr {
519651

520652
private predicate numbered_backreference(int start, int end, int value) {
521653
this.escapingChar(start) and
654+
not this.getChar(start + 1) = "0" and
522655
exists(string text, string svalue, int len |
523656
end = start + len and
524657
text = this.getText() and
@@ -527,7 +660,7 @@ abstract class RegexString extends Expr {
527660
svalue = text.substring(start + 1, start + len) and
528661
value = svalue.toInt() and
529662
not exists(text.substring(start + 1, start + len + 1).toInt()) and
530-
value != 0
663+
value > 0
531664
)
532665
}
533666

@@ -551,6 +684,8 @@ abstract class RegexString extends Expr {
551684
this.group(start, end)
552685
or
553686
this.charSet(start, end)
687+
or
688+
this.backreference(start, end)
554689
}
555690

556691
private predicate qualifier(int start, int end, boolean maybe_empty) {
Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,32 @@
11
import re
22

3-
re.compile(r'[]-[]') #$ MISSING: charRange=1:2-3:4
4-
re.compile(r'[---]') #$ MISSING: charRange=1:2-3:4
5-
re.compile(r'[\---]') #$ MISSING: charRange=1:3-4:5
6-
re.compile(r'[--\-]') #$ MISSING: charRange=1:2-3:5
7-
re.compile(r'[\--\-]') #$ cMISSING: harRange=1:3-4:6
8-
re.compile(r'[0-9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=5:6-7:8
9-
re.compile(r'[0\-9-A-Z]') #$ MISSING: charRange=4:5-6:7
10-
re.compile(r'[0--9-A-Z]') #$ MISSING: charRange=1:2-3:4 charRange=4:5-6:7
3+
re.compile(r'[]-[]') #$ charRange=1:2-3:4
4+
re.compile(r'[---]') #$ charRange=1:2-3:4
5+
re.compile(r'[\---]') #$ charRange=1:3-4:5
6+
re.compile(r'[--\-]') #$ charRange=1:2-3:5
7+
re.compile(r'[\--\-]') #$ charRange=1:3-4:6
8+
re.compile(r'[0-9-A-Z]') #$ charRange=1:2-3:4 charRange=5:6-7:8
9+
re.compile(r'[0\-9-A-Z]') #$ charRange=4:5-6:7
10+
re.compile(r'[0--9-A-Z]') #$ charRange=1:2-3:4 charRange=4:5-6:7
1111

12-
re.compile(r'[^A-Z]') #$ MISSING: charRange=2:3-4:5
12+
re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
1313

14-
re.compile(r'[\0-\09]') #$ MISSING: charRange=1:3-4:7
14+
re.compile(r'[\0-\09]') #$ charRange=1:3-4:7
1515

16-
re.compile(r'[\0123-5]') #$ MISSING: charRange=5:6-7:8
16+
re.compile(r'[\0123-5]') #$ charRange=5:6-7:8
1717

1818

1919
#Negative lookahead
20-
re.compile(r'(?!not-this)^[A-Z_]+$') #$ MISSING: charRange=14:15-16:17
20+
re.compile(r'(?!not-this)^[A-Z_]+$') #$ charRange=14:15-16:17
2121
#Negative lookbehind
22-
re.compile(r'^[A-Z_]+$(?<!not-this)') #$ MISSING: charRange=2:3-4:5
22+
re.compile(r'^[A-Z_]+$(?<!not-this)') #$ charRange=2:3-4:5
2323

2424

2525
#OK -- ODASA-ODASA-3968
26-
re.compile('(?:[^%]|^)?%\((\w*)\)[a-z]') #$ MISSING: charRange=22:23-24:25
26+
re.compile('(?:[^%]|^)?%\((\w*)\)[a-z]') #$ charRange=22:23-24:25
2727

2828
#ODASA-3985
2929
#Half Surrogate pairs
30-
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ MISSING: charRange=1:2-3:4 charRange=6:7-8:9
30+
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') #$ charRange=1:2-3:4 charRange=6:7-8:9
3131
#Outside BMP
32-
re.compile(u'[\U00010000-\U0010ffff]') #$ MISSING: charRange=1:2-3:4
32+
re.compile(u'[\U00010000-\U0010ffff]') #$ charRange=1:2-3:4

0 commit comments

Comments
 (0)