Skip to content

Commit 5b0d92d

Browse files
authored
Merge pull request github#3464 from yoff/UnicodeEscape
Python: Handle more escapes in regexes
2 parents da6736d + 7125139 commit 5b0d92d

File tree

2 files changed

+32
-1
lines changed

2 files changed

+32
-1
lines changed

python/ql/src/semmle/python/regex.qll

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,16 +124,40 @@ abstract class RegexString extends Expr {
124124
)
125125
}
126126

127+
/** Named Unicode characters, e.g. \N{degree sign} */
128+
private predicate escapedName(int start, int end) {
129+
this.escapingChar(start) and
130+
this.getChar(start + 1) = "N" and
131+
this.getChar(start + 2) = "{" and
132+
this.getChar(end - 1) = "}" and
133+
end > start and
134+
not exists(int i | start + 2 < i and i < end - 1 |
135+
this.getChar(i) = "}"
136+
)
137+
}
138+
127139
private predicate escapedCharacter(int start, int end) {
128140
this.escapingChar(start) and
129141
not exists(this.getText().substring(start + 1, end + 1).toInt()) and
130142
(
143+
// hex value \xhh
131144
this.getChar(start + 1) = "x" and end = start + 4
132145
or
146+
// octal value \ooo
133147
end in [start + 2 .. start + 4] and
134148
exists(this.getText().substring(start + 1, end).toInt())
135149
or
136-
this.getChar(start + 1) != "x" and end = start + 2
150+
// 16-bit hex value \uhhhh
151+
this.getChar(start + 1) = "u" and end = start + 6
152+
or
153+
// 32-bit hex value \Uhhhhhhhh
154+
this.getChar(start + 1) = "U" and end = start + 10
155+
or
156+
escapedName(start, end)
157+
or
158+
// any escape not handled above; update the exclusion list below when adding a new case
159+
not this.getChar(start + 1) in ["x", "u", "U", "N"] and
160+
end = start + 2
137161
)
138162
}
139163

python/ql/test/query-tests/Expressions/Regex/test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,10 @@
139139

140140
#Potentially mis-parsed character set
141141
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)")
142+
143+
#Allow Unicode escapes (\uhhhh, \Uhhhhhhhh) in raw strings
144+
re.compile(r"[\U00010000-\U0010FFFF]")
145+
re.compile(r"[\u0000-\uFFFF]")
146+
147+
#Allow named Unicode characters (\N{...})
148+
re.compile(r"[\N{degree sign}\N{EM DASH}]")

0 commit comments

Comments
 (0)