Skip to content

Commit a834703

Browse files
authored
Merge pull request github#13779 from geoffw0/pythonparsemode
Python: Understand multiple parse mode flags specified in a regular expression string
2 parents 763216b + f07f97a commit a834703

File tree

10 files changed

+54
-23
lines changed

10 files changed

+54
-23
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Regular expressions containing multiple parse mode flags are now interpreted correctly. For example, `"(?is)abc.*"` is now understood to have both the `i` and `s` flags.

python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -617,7 +617,7 @@ class RegExp extends Expr instanceof StrConst {
617617
private predicate group_start(int start, int end) {
618618
this.non_capturing_group_start(start, end)
619619
or
620-
this.flag_group_start(start, end, _)
620+
this.flag_group_start(start, end)
621621
or
622622
this.named_group_start(start, end)
623623
or
@@ -679,20 +679,37 @@ class RegExp extends Expr instanceof StrConst {
679679
end = min(int i | i > start + 4 and this.getChar(i) = "?")
680680
}
681681

682-
private predicate flag_group_start(int start, int end, string c) {
682+
/**
683+
* Holds if a parse mode starts between `start` and `end`.
684+
*/
685+
private predicate flag_group_start(int start, int end) {
683686
this.isGroupStart(start) and
684687
this.getChar(start + 1) = "?" and
685-
end = start + 3 and
686-
c = this.getChar(start + 2) and
687-
c in ["i", "L", "m", "s", "u", "x"]
688+
this.getChar(start + 2) in ["i", "L", "m", "s", "u", "x"] and
689+
end = start + 2
690+
}
691+
692+
/**
693+
* Holds if a parse mode group is between `start` and `end`, and includes the
694+
* mode flag `c`. For example the following span, with mode flag `i`:
695+
* ```
696+
* (?i)
697+
* ```
698+
*/
699+
private predicate flag_group(int start, int end, string c) {
700+
exists(int inStart, int inEnd |
701+
this.flag_group_start(start, inStart) and
702+
this.groupContents(start, end, inStart, inEnd) and
703+
this.getChar([inStart .. inEnd - 1]) = c
704+
)
688705
}
689706

690707
/**
691708
* Gets the mode of this regular expression string if
692709
* it is defined by a prefix.
693710
*/
694711
string getModeFromPrefix() {
695-
exists(string c | this.flag_group_start(_, _, c) |
712+
exists(string c | this.flag_group(_, _, c) |
696713
c = "i" and result = "IGNORECASE"
697714
or
698715
c = "L" and result = "LOCALE"

python/ql/test/library-tests/regex/Characters.expected

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 22 | 23 |
3737
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 24 | 25 |
3838
| (?P<name>[\\w]+)\| | 10 | 12 |
39+
| (?m)^(?!$) | 2 | 3 |
3940
| (?m)^(?!$) | 4 | 5 |
4041
| (?m)^(?!$) | 8 | 9 |
4142
| (\\033\|~{) | 1 | 5 |

python/ql/test/library-tests/regex/FirstLast.expected

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@
2222
| (?P<name>[\\w]+)\| | first | 9 | 14 |
2323
| (?P<name>[\\w]+)\| | last | 9 | 13 |
2424
| (?P<name>[\\w]+)\| | last | 9 | 14 |
25-
| (?m)^(?!$) | first | 4 | 5 |
26-
| (?m)^(?!$) | first | 8 | 9 |
25+
| (?m)^(?!$) | first | 2 | 3 |
2726
| (?m)^(?!$) | last | 4 | 5 |
2827
| (?m)^(?!$) | last | 8 | 9 |
2928
| (\\033\|~{) | first | 1 | 5 |

python/ql/test/library-tests/regex/GroupContents.expected

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 10 | (?:[^%]\|^) | 3 | 9 | [^%]\|^ |
99
| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 14 | 19 | (\\w*) | 15 | 18 | \\w* |
1010
| (?P<name>[\\w]+)\| | 0 | 15 | (?P<name>[\\w]+) | 9 | 14 | [\\w]+ |
11+
| (?m)^(?!$) | 0 | 4 | (?m) | 2 | 3 | m |
1112
| (?m)^(?!$) | 5 | 10 | (?!$) | 8 | 9 | $ |
1213
| (\\033\|~{) | 0 | 9 | (\\033\|~{) | 1 | 8 | \\033\|~{ |
1314
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 2 | 16 | (?P<txt>[^[]*) | 10 | 15 | [^[]* |

python/ql/test/library-tests/regex/Regex.expected

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,11 @@
7777
| (?P<name>[\\w]+)\| | sequence | 0 | 15 |
7878
| (?m)^(?!$) | $ | 8 | 9 |
7979
| (?m)^(?!$) | ^ | 4 | 5 |
80-
| (?m)^(?!$) | empty group | 0 | 4 |
80+
| (?m)^(?!$) | char | 2 | 3 |
8181
| (?m)^(?!$) | empty group | 5 | 10 |
82+
| (?m)^(?!$) | non-empty group | 0 | 4 |
8283
| (?m)^(?!$) | sequence | 0 | 10 |
84+
| (?m)^(?!$) | sequence | 2 | 3 |
8385
| (?m)^(?!$) | sequence | 8 | 9 |
8486
| (\\033\|~{) | char | 1 | 5 |
8587
| (\\033\|~{) | char | 6 | 7 |
Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
| tst.py:4:20:4:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
22
| tst.py:5:20:5:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
33
| tst.py:9:20:9:30 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
4-
| tst.py:10:20:10:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
5-
| tst.py:11:20:11:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
6-
| tst.py:12:20:12:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
7-
| tst.py:13:20:13:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
8-
| tst.py:14:20:14:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
9-
| tst.py:15:20:15:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
10-
| tst.py:16:20:16:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
11-
| tst.py:17:20:17:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
12-
| tst.py:19:20:19:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
13-
| tst.py:20:20:20:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
14-
| tst.py:21:20:21:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
15-
| tst.py:22:17:22:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
16-
| tst.py:23:20:23:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as an HTML comment end tag. |
4+
| tst.py:12:20:12:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
5+
| tst.py:13:20:13:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
6+
| tst.py:14:20:14:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
7+
| tst.py:15:20:15:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
8+
| tst.py:16:20:16:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
9+
| tst.py:17:20:17:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
10+
| tst.py:18:20:18:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
11+
| tst.py:19:20:19:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
12+
| tst.py:21:20:21:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
13+
| tst.py:22:20:22:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
14+
| tst.py:23:20:23:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
15+
| tst.py:24:17:24:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
16+
| tst.py:25:20:25:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as an HTML comment end tag. |

python/ql/test/query-tests/Security/CWE-116-BadTagFilter/tst.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
re.compile(r"""<!--.*-->""", re.IGNORECASE | re.DOTALL), # OK - we don't care regexps that only match comments
88
re.compile(r"""<!--.*--!?>""", re.IGNORECASE | re.DOTALL), # OK
99
re.compile(r"""<!--.*--!?>""", re.IGNORECASE), # NOT OK, does not match newlines
10+
re.compile(r"""(?is)<!--.*--!?>"""), # OK
11+
re.compile(r"""(?i)<!--.*--!?>"""), # NOT OK, does not match newlines [NOT DETECTED]
1012
re.compile(r"""<script.*?>(.|\s)*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match inside the script tag
1113
re.compile(r"""<script[^>]*?>.*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match newlines inside the content
1214
re.compile(r"""<script(\s|\w|=|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match single quotes for attribute values
@@ -23,4 +25,4 @@
2325
re.compile(r"""<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))"""), # NOT OK - capture groups
2426
]
2527

26-
doFilters(filters)
28+
doFilters(filters)

python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,5 @@
105105
| redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
106106
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\\u00c6'. |
107107
| unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
108+
| unittests.py:11:20:11:28 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 's' and containing many repetitions of '\\n'. |
109+
| unittests.py:12:21:12:29 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings starting with 'is' and containing many repetitions of '\\n'. |

python/ql/test/query-tests/Security/CWE-730-ReDoS/unittests.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@
77
# Treatment of line breaks
88
re.compile(r'(?:.|\n)*b') # No ReDoS.
99
re.compile(r'(?:.|\n)*b', re.DOTALL) # Has ReDoS.
10+
re.compile(r'(?i)(?:.|\n)*b') # No ReDoS.
11+
re.compile(r'(?s)(?:.|\n)*b') # Has ReDoS.
12+
re.compile(r'(?is)(?:.|\n)*b') # Has ReDoS.

0 commit comments

Comments
 (0)