@@ -3,72 +3,6 @@ private import semmle.python.ApiGraphs
3
3
// Need to import since frameworks can extend the abstract `RegExpInterpretation::Range`
4
4
private import semmle.python.Frameworks
5
5
private import semmle.python.Concepts as Concepts
6
-
7
- /**
8
- * Gets the positional argument index containing the regular expression flags for the member of the
9
- * `re` module with the name `name`.
10
- */
11
- private int re_member_flags_arg ( string name ) {
12
- name = "compile" and result = 1
13
- or
14
- name = "search" and result = 2
15
- or
16
- name = "match" and result = 2
17
- or
18
- name = "split" and result = 3
19
- or
20
- name = "findall" and result = 2
21
- or
22
- name = "finditer" and result = 2
23
- or
24
- name = "sub" and result = 4
25
- or
26
- name = "subn" and result = 4
27
- }
28
-
29
- /**
30
- * Gets the names and corresponding API nodes of members of the `re` module that are likely to be
31
- * methods taking regular expressions as arguments.
32
- *
33
- * This is a helper predicate that fixes a bad join order, and should not be inlined without checking
34
- * that this is safe.
35
- */
36
- pragma [ nomagic]
37
- private API:: Node relevant_re_member ( string name ) {
38
- result = API:: moduleImport ( "re" ) .getMember ( name ) and
39
- name != "escape"
40
- }
41
-
42
- /**
43
- * Holds if the expression `e` is used as a regex with the `re` module, with the regex-mode `mode` (if known).
44
- * If regex mode is not known, `mode` will be `"None"`.
45
- *
46
- * This predicate has not done any data-flow tracking.
47
- */
48
- // TODO: This should only be used to get the `mode`, and nowhere else.
49
- predicate used_as_regex_internal ( Expr e , string mode ) {
50
- /* Call to re.xxx(regex, ... [mode]) */
51
- exists ( DataFlow:: CallCfgNode call |
52
- call instanceof Concepts:: RegexExecution and
53
- e = call .( Concepts:: RegexExecution ) .getRegex ( ) .asExpr ( )
54
- or
55
- call .getArg ( 0 ) .asExpr ( ) = e and
56
- call = relevant_re_member ( _) .getACall ( )
57
- |
58
- mode = "None"
59
- or
60
- exists ( DataFlow:: CallCfgNode callNode |
61
- call = callNode and
62
- mode =
63
- mode_from_node ( [
64
- callNode
65
- .getArg ( re_member_flags_arg ( callNode .( DataFlow:: MethodCallNode ) .getMethodName ( ) ) ) ,
66
- callNode .getArgByName ( "flags" )
67
- ] )
68
- )
69
- )
70
- }
71
-
72
6
private import regexp.internal.RegExpTracking as RegExpTracking
73
7
private import semmle.python.Concepts
74
8
private import semmle.python.regexp.RegexTreeView
@@ -81,49 +15,6 @@ RegExpTerm getTermForExecution(RegexExecution exec) {
81
15
)
82
16
}
83
17
84
- /**
85
- * Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
86
- * that have multiple names, we pick the long-form name as a canonical representative.
87
- */
88
- private string canonical_name ( API:: Node flag ) {
89
- result in [ "ASCII" , "IGNORECASE" , "LOCALE" , "UNICODE" , "MULTILINE" , "TEMPLATE" ] and
90
- flag = API:: moduleImport ( "re" ) .getMember ( [ result , result .prefix ( 1 ) ] )
91
- or
92
- flag = API:: moduleImport ( "re" ) .getMember ( [ "DOTALL" , "S" ] ) and result = "DOTALL"
93
- or
94
- flag = API:: moduleImport ( "re" ) .getMember ( [ "VERBOSE" , "X" ] ) and result = "VERBOSE"
95
- }
96
-
97
- /**
98
- * A type tracker for regular expression flag names. Holds if the result is a node that may refer
99
- * to the `re` flag with the canonical name `flag_name`
100
- */
101
- private DataFlow:: TypeTrackingNode re_flag_tracker ( string flag_name , DataFlow:: TypeTracker t ) {
102
- t .start ( ) and
103
- exists ( API:: Node flag | flag_name = canonical_name ( flag ) and result = flag .asSource ( ) )
104
- or
105
- exists ( BinaryExprNode binop , DataFlow:: Node operand |
106
- operand .getALocalSource ( ) = re_flag_tracker ( flag_name , t .continue ( ) ) and
107
- operand .asCfgNode ( ) = binop .getAnOperand ( ) and
108
- ( binop .getOp ( ) instanceof BitOr or binop .getOp ( ) instanceof Add ) and
109
- result .asCfgNode ( ) = binop
110
- )
111
- or
112
- exists ( DataFlow:: TypeTracker t2 | result = re_flag_tracker ( flag_name , t2 ) .track ( t2 , t ) )
113
- }
114
-
115
- /**
116
- * A type tracker for regular expression flag names. Holds if the result is a node that may refer
117
- * to the `re` flag with the canonical name `flag_name`
118
- */
119
- private DataFlow:: Node re_flag_tracker ( string flag_name ) {
120
- re_flag_tracker ( flag_name , DataFlow:: TypeTracker:: end ( ) ) .flowsTo ( result )
121
- }
122
-
123
- /** Gets a regular expression mode flag associated with the given data flow node. */
124
- // TODO: Move this into a RegexFlag module, along with related code?
125
- string mode_from_node ( DataFlow:: Node node ) { node = re_flag_tracker ( result ) }
126
-
127
18
/** Provides a class for modeling regular expression interpretations. */
128
19
module RegExpInterpretation {
129
20
/**
@@ -150,6 +41,102 @@ deprecated class RegexString extends Regex {
150
41
RegexString ( ) { this = RegExpTracking:: regExpSource ( _) .asExpr ( ) }
151
42
}
152
43
44
+ /** Utility predicates for finding the mode of a regex based on where it's used. */
45
+ private module FindRegexMode {
46
+ // TODO: Move this (and Regex) into a ParseRegExp file.
47
+ /**
48
+ * Gets the mode of the regex `regex` based on the context where it's used.
49
+ * Does not find the mode if it's in a prefix inside the regex itself (see `Regex::getAMode`).
50
+ */
51
+ string getAMode ( Regex regex ) {
52
+ exists ( DataFlow:: Node sink |
53
+ sink = regex .getAUse ( ) and
54
+ /* Call to re.xxx(regex, ... [mode]) */
55
+ exists ( DataFlow:: CallCfgNode call |
56
+ call instanceof Concepts:: RegexExecution and
57
+ sink = call .( Concepts:: RegexExecution ) .getRegex ( )
58
+ or
59
+ call .getArg ( _) = sink and
60
+ sink instanceof RegExpInterpretation:: Range
61
+ |
62
+ exists ( DataFlow:: CallCfgNode callNode |
63
+ call = callNode and
64
+ result =
65
+ mode_from_node ( [
66
+ callNode
67
+ .getArg ( re_member_flags_arg ( callNode .( DataFlow:: MethodCallNode ) .getMethodName ( ) ) ) ,
68
+ callNode .getArgByName ( "flags" )
69
+ ] )
70
+ )
71
+ )
72
+ )
73
+ }
74
+
75
+ /**
76
+ * Gets the positional argument index containing the regular expression flags for the member of the
77
+ * `re` module with the name `name`.
78
+ */
79
+ private int re_member_flags_arg ( string name ) {
80
+ name = "compile" and result = 1
81
+ or
82
+ name = "search" and result = 2
83
+ or
84
+ name = "match" and result = 2
85
+ or
86
+ name = "split" and result = 3
87
+ or
88
+ name = "findall" and result = 2
89
+ or
90
+ name = "finditer" and result = 2
91
+ or
92
+ name = "sub" and result = 4
93
+ or
94
+ name = "subn" and result = 4
95
+ }
96
+
97
+ /**
98
+ * Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
99
+ * that have multiple names, we pick the long-form name as a canonical representative.
100
+ */
101
+ private string canonical_name ( API:: Node flag ) {
102
+ result in [ "ASCII" , "IGNORECASE" , "LOCALE" , "UNICODE" , "MULTILINE" , "TEMPLATE" ] and
103
+ flag = API:: moduleImport ( "re" ) .getMember ( [ result , result .prefix ( 1 ) ] )
104
+ or
105
+ flag = API:: moduleImport ( "re" ) .getMember ( [ "DOTALL" , "S" ] ) and result = "DOTALL"
106
+ or
107
+ flag = API:: moduleImport ( "re" ) .getMember ( [ "VERBOSE" , "X" ] ) and result = "VERBOSE"
108
+ }
109
+
110
+ /**
111
+ * A type tracker for regular expression flag names. Holds if the result is a node that may refer
112
+ * to the `re` flag with the canonical name `flag_name`, also via `|` or `+` flag combinations.
113
+ */
114
+ private DataFlow:: TypeTrackingNode re_flag_tracker ( string flag_name , DataFlow:: TypeTracker t ) {
115
+ t .start ( ) and
116
+ exists ( API:: Node flag | flag_name = canonical_name ( flag ) and result = flag .asSource ( ) )
117
+ or
118
+ exists ( BinaryExprNode binop , DataFlow:: Node operand |
119
+ operand .getALocalSource ( ) = re_flag_tracker ( flag_name , t .continue ( ) ) and
120
+ operand .asCfgNode ( ) = binop .getAnOperand ( ) and
121
+ ( binop .getOp ( ) instanceof BitOr or binop .getOp ( ) instanceof Add ) and
122
+ result .asCfgNode ( ) = binop
123
+ )
124
+ or
125
+ exists ( DataFlow:: TypeTracker t2 | result = re_flag_tracker ( flag_name , t2 ) .track ( t2 , t ) )
126
+ }
127
+
128
+ /**
129
+ * Gets a data-flow node that may refer to the `re` flag with the canonical name `flag_name`,
130
+ * i.e. a node that the completed type tracker for `flag_name` flows to.
131
+ */
132
+ private DataFlow:: Node re_flag_tracker ( string flag_name ) {
133
+ re_flag_tracker ( flag_name , DataFlow:: TypeTracker:: end ( ) ) .flowsTo ( result )
134
+ }
135
+
136
+ /** Gets a regular expression mode flag associated with the given data flow node. */
137
+ private string mode_from_node ( DataFlow:: Node node ) { node = re_flag_tracker ( result ) }
138
+ }
139
+
153
140
/** A StrConst used as a regular expression */
154
141
class Regex extends Expr {
155
142
DataFlow:: Node sink ;
@@ -175,11 +162,7 @@ class Regex extends Expr {
175
162
* VERBOSE
176
163
*/
177
164
string getAMode ( ) {
178
- exists ( string mode |
179
- used_as_regex_internal ( sink .asExpr ( ) , mode ) and
180
- result != "None" and
181
- result = mode
182
- )
165
+ result = FindRegexMode:: getAMode ( this )
183
166
or
184
167
result = this .getModeFromPrefix ( )
185
168
}
0 commit comments