Skip to content

Commit 8f152a5

Browse files
committed
Python: Port regex concepts and adapt PolyRedos
1 parent 68ed325 commit 8f152a5

File tree

3 files changed

+199
-135
lines changed

3 files changed

+199
-135
lines changed

python/ql/lib/semmle/python/Concepts.qll

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,41 @@ module SqlExecution {
355355
}
356356
}
357357

358+
/**
359+
* A data-flow node that executes a regular expression.
360+
*
361+
* Extend this class to refine existing API models. If you want to model new APIs,
362+
* extend `RegexExecution::Range` instead.
363+
*/
364+
class RegexExecution extends DataFlow::Node {
365+
RegexExecution::Range range;
366+
367+
RegexExecution() { this = range }
368+
369+
/** Gets the data flow node for the regex being executed by this node. */
370+
DataFlow::Node getRegexNode() { result = range.getRegexNode() }
371+
372+
/** Gets a dataflow node for the string to be searched or matched against. */
373+
DataFlow::Node getString() { result = range.getString() }
374+
}
375+
376+
/** Provides classes for modeling new regular-expression execution APIs. */
377+
module RegexExecution {
378+
/**
379+
* A data-flow node that executes a regular expression.
380+
*
381+
* Extend this class to model new APIs. If you want to refine existing API models,
382+
* extend `RegexExecution` instead.
383+
*/
384+
abstract class Range extends DataFlow::Node {
385+
/** Gets the data flow node for the regex being executed by this node. */
386+
abstract DataFlow::Node getRegexNode();
387+
388+
/** Gets a dataflow node for the string to be searched or matched against. */
389+
abstract DataFlow::Node getString();
390+
}
391+
}
392+
358393
/**
359394
* A data-flow node that escapes meta-characters, which could be used to prevent
360395
* injection attacks.
@@ -411,6 +446,9 @@ module Escaping {
411446

412447
/** Gets the escape-kind for escaping a string so it can safely be included in HTML. */
413448
string getHtmlKind() { result = "html" }
449+
450+
/** Gets the escape-kind for escaping a string so it can safely be included in a regular expression. */
451+
string getRegexKind() { result = "regex" }
414452
// TODO: If adding an XML kind, update the modeling of the `MarkupSafe` PyPI package.
415453
//
416454
// Technically it claims to escape for both HTML and XML, but for now we don't have
@@ -427,6 +465,14 @@ class HtmlEscaping extends Escaping {
427465
HtmlEscaping() { range.getKind() = Escaping::getHtmlKind() }
428466
}
429467

468+
/**
469+
* An escape of a string so it can be safely included in
470+
* the body of a regex.
471+
*/
472+
class RegexEscaping extends Escaping {
473+
RegexEscaping() { range.getKind() = Escaping::getRegexKind() }
474+
}
475+
430476
/** Provides classes for modeling HTTP-related APIs. */
431477
module HTTP {
432478
import semmle.python.web.HttpConstants

python/ql/lib/semmle/python/frameworks/Stdlib.qll

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,6 +1497,158 @@ private module StdlibPrivate {
14971497
}
14981498
}
14991499

1500+
// ---------------------------------------------------------------------------
1501+
// re
1502+
// ---------------------------------------------------------------------------
1503+
/**
1504+
* List of methods in the `re` module immediately executing a regular expression.
1505+
*
1506+
* See https://docs.python.org/3/library/re.html#module-contents
1507+
*/
1508+
private class RegexExecutionMethod extends string {
1509+
RegexExecutionMethod() {
1510+
this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"]
1511+
}
1512+
}
1513+
1514+
/** Gets the index of the argument representing the string to be searched by a regex. */
1515+
int stringArg(RegexExecutionMethod method) {
1516+
method in ["match", "fullmatch", "search", "split", "findall", "finditer"] and
1517+
result = 1
1518+
or
1519+
method in ["sub", "subn"] and
1520+
result = 2
1521+
}
1522+
1523+
/**
1524+
* A call to a method from the `re` module immediately executing a regular expression.
1525+
*
1526+
* See `RegexExecutionMethod`
1527+
*/
1528+
private class DirectRegex extends DataFlow::CallCfgNode, RegexExecution::Range {
1529+
RegexExecutionMethod method;
1530+
1531+
DirectRegex() { this = API::moduleImport("re").getMember(method).getACall() }
1532+
1533+
override DataFlow::Node getRegexNode() {
1534+
result in [this.getArg(0), this.getArgByName("pattern")]
1535+
}
1536+
1537+
override DataFlow::Node getString() {
1538+
result in [this.getArg(stringArg(method)), this.getArgByName("string")]
1539+
}
1540+
}
1541+
1542+
/** Helper module for tracking compiled regexes. */
1543+
private module CompiledRegexes {
1544+
private import semmle.python.dataflow.new.DataFlow2
1545+
private import semmle.python.RegexTreeView
1546+
1547+
// TODO: This module should be refactored once API graphs are more expressive.
1548+
/** A configuration for finding uses of compiled regexes. */
1549+
class RegexDefinitionConfiguration extends DataFlow2::Configuration {
1550+
RegexDefinitionConfiguration() { this = "RegexDefinitionConfiguration" }
1551+
1552+
override predicate isSource(DataFlow::Node source) { source instanceof RegexDefinitonSource }
1553+
1554+
override predicate isSink(DataFlow::Node sink) { sink instanceof RegexDefinitionSink }
1555+
}
1556+
1557+
/** A regex compilation. */
1558+
class RegexDefinitonSource extends DataFlow::CallCfgNode {
1559+
DataFlow::Node regexNode;
1560+
1561+
RegexDefinitonSource() {
1562+
this = API::moduleImport("re").getMember("compile").getACall() and
1563+
regexNode in [this.getArg(0), this.getArgByName("pattern")]
1564+
}
1565+
1566+
/** Gets the data flow node for the regex being compiled by this node. */
1567+
DataFlow::Node getRegexNode() { result = regexNode }
1568+
}
1569+
1570+
/** A use of a compiled regex. */
1571+
class RegexDefinitionSink extends DataFlow::Node {
1572+
RegexExecutionMethod method;
1573+
DataFlow::CallCfgNode executingCall;
1574+
1575+
RegexDefinitionSink() {
1576+
executingCall =
1577+
API::moduleImport("re").getMember("compile").getReturn().getMember(method).getACall() and
1578+
this = executingCall.getFunction().(DataFlow::AttrRead).getObject()
1579+
}
1580+
1581+
/** Gets the method used to execute the regex. */
1582+
RegexExecutionMethod getMethod() { result = method }
1583+
1584+
/** Gets the data flow node for the executing call. */
1585+
DataFlow::CallCfgNode getExecutingCall() { result = executingCall }
1586+
}
1587+
}
1588+
1589+
private import CompiledRegexes
1590+
1591+
/**
 * A call on a compiled regular expression (obtained via `re.compile`) executing a
 * regular expression.
 *
 * Given the following example:
 *
 * ```py
 * pattern = re.compile(input)
 * pattern.match(s)
 * ```
 *
 * This class will identify that `re.compile` compiles `input` and afterwards
 * executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)`
 * and `this.getRegexNode()` will return the node for `input` (`re.compile`'s first argument).
 *
 * This class models a new API, so it extends `RegexExecution::Range`. Extending the
 * concrete `RegexExecution` class instead would restrict it to nodes that are already
 * covered by some other `Range`, and calls on compiled patterns would never be reported.
 *
 * See `RegexExecutionMethod`
 *
 * See https://docs.python.org/3/library/re.html#regular-expression-objects
 */
private class CompiledRegex extends DataFlow::CallCfgNode, RegexExecution::Range {
  DataFlow::Node regexNode;
  RegexExecutionMethod method;

  CompiledRegex() {
    exists(
      RegexDefinitionConfiguration conf, RegexDefinitonSource source, RegexDefinitionSink sink
    |
      // The result of a `re.compile` call flows to the object on which `method` is called.
      conf.hasFlow(source, sink) and
      regexNode = source.getRegexNode() and
      method = sink.getMethod() and
      this = sink.getExecutingCall()
    )
  }

  override DataFlow::Node getRegexNode() { result = regexNode }

  override DataFlow::Node getString() {
    // `stringArg` gives the index for the module-level functions, where the pattern
    // is argument 0. Methods on a compiled pattern take no pattern argument, so the
    // positional index of the string is one lower.
    result in [this.getArg(stringArg(method) - 1), this.getArgByName("string")]
  }
}
1632+
1633+
/**
1634+
* A call to 're.escape'.
1635+
* See https://docs.python.org/3/library/re.html#re.escape
1636+
*/
1637+
private class ReEscapeCall extends Escaping::Range, DataFlow::CallCfgNode {
1638+
DataFlow::Node regexNode;
1639+
1640+
ReEscapeCall() {
1641+
this = API::moduleImport("re").getMember("escape").getACall() and
1642+
regexNode in [this.getArg(0), this.getArgByName("pattern")]
1643+
}
1644+
1645+
override DataFlow::Node getAnInput() { result = regexNode }
1646+
1647+
override DataFlow::Node getOutput() { result = this }
1648+
1649+
override string getKind() { result = Escaping::getRegexKind() }
1650+
}
1651+
15001652
// ---------------------------------------------------------------------------
15011653
// OTHER
15021654
// ---------------------------------------------------------------------------

python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll

Lines changed: 1 addition & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ module PolynomialReDoS {
6060
RegExpTerm t;
6161

6262
RegexExecutionAsSink() {
63-
exists(CompiledRegexes::RegexExecution re |
63+
exists(RegexExecution re |
6464
re.getRegexNode().asExpr() = t.getRegex() and
6565
this = re.getString()
6666
) and
@@ -76,137 +76,3 @@ module PolynomialReDoS {
7676
*/
7777
class StringConstCompareAsSanitizerGuard extends SanitizerGuard, StringConstCompare { }
7878
}
79-
80-
/** Helper module for tracking compiled regexes. */
81-
private module CompiledRegexes {
82-
// TODO: This module should be refactored and merged with the experimental work done on detecting
83-
// regex injections, such that this can be expressed from just using a concept.
84-
/** A configuration for finding uses of compiled regexes. */
85-
class RegexDefinitionConfiguration extends DataFlow2::Configuration {
86-
RegexDefinitionConfiguration() { this = "RegexDefinitionConfiguration" }
87-
88-
override predicate isSource(DataFlow::Node source) { source instanceof RegexDefinitonSource }
89-
90-
override predicate isSink(DataFlow::Node sink) { sink instanceof RegexDefinitionSink }
91-
}
92-
93-
/** A regex compilation. */
94-
class RegexDefinitonSource extends DataFlow::CallCfgNode {
95-
DataFlow::Node regexNode;
96-
97-
RegexDefinitonSource() {
98-
this = API::moduleImport("re").getMember("compile").getACall() and
99-
regexNode in [this.getArg(0), this.getArgByName("pattern")]
100-
}
101-
102-
/** Gets the regex that is being compiled by this node. */
103-
RegExpTerm getRegExp() { result.getRegex() = regexNode.asExpr() and result.isRootTerm() }
104-
105-
/** Gets the data flow node for the regex being compiled by this node. */
106-
DataFlow::Node getRegexNode() { result = regexNode }
107-
}
108-
109-
/** A use of a compiled regex. */
110-
class RegexDefinitionSink extends DataFlow::Node {
111-
RegexExecutionMethod method;
112-
DataFlow::CallCfgNode executingCall;
113-
114-
RegexDefinitionSink() {
115-
exists(DataFlow::AttrRead reMethod |
116-
executingCall.getFunction() = reMethod and
117-
reMethod.getAttributeName() = method and
118-
this = reMethod.getObject()
119-
)
120-
}
121-
122-
/** Gets the method used to execute the regex. */
123-
RegexExecutionMethod getMethod() { result = method }
124-
125-
/** Gets the data flow node for the executing call. */
126-
DataFlow::CallCfgNode getExecutingCall() { result = executingCall }
127-
}
128-
129-
/** A data flow node executing a regex. */
130-
abstract class RegexExecution extends DataFlow::Node {
131-
/** Gets the data flow node for the regex being compiled by this node. */
132-
abstract DataFlow::Node getRegexNode();
133-
134-
/** Gets a dataflow node for the string to be searched or matched against. */
135-
abstract DataFlow::Node getString();
136-
}
137-
138-
private class RegexExecutionMethod extends string {
139-
RegexExecutionMethod() {
140-
this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"]
141-
}
142-
}
143-
144-
/** Gets the index of the argument representing the string to be searched by a regex. */
145-
int stringArg(RegexExecutionMethod method) {
146-
method in ["match", "fullmatch", "search", "split", "findall", "finditer"] and
147-
result = 1
148-
or
149-
method in ["sub", "subn"] and
150-
result = 2
151-
}
152-
153-
/**
154-
* A class to find `re` methods immediately executing an expression.
155-
*
156-
* See `RegexExecutionMethods`
157-
*/
158-
class DirectRegex extends DataFlow::CallCfgNode, RegexExecution {
159-
RegexExecutionMethod method;
160-
161-
DirectRegex() { this = API::moduleImport("re").getMember(method).getACall() }
162-
163-
override DataFlow::Node getRegexNode() {
164-
result in [this.getArg(0), this.getArgByName("pattern")]
165-
}
166-
167-
override DataFlow::Node getString() {
168-
result in [this.getArg(stringArg(method)), this.getArgByName("string")]
169-
}
170-
}
171-
172-
/**
173-
* A class to find `re` methods immediately executing a compiled expression by `re.compile`.
174-
*
175-
* Given the following example:
176-
*
177-
* ```py
178-
* pattern = re.compile(input)
179-
* pattern.match(s)
180-
* ```
181-
*
182-
* This class will identify that `re.compile` compiles `input` and afterwards
183-
* executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)`
184-
* and `this.getRegexNode()` will return the node for `input` (`re.compile`'s first argument)
185-
*
186-
*
187-
* See `RegexExecutionMethods`
188-
*
189-
* See https://docs.python.org/3/library/re.html#regular-expression-objects
190-
*/
191-
private class CompiledRegex extends DataFlow::CallCfgNode, RegexExecution {
192-
DataFlow::Node regexNode;
193-
RegexExecutionMethod method;
194-
195-
CompiledRegex() {
196-
exists(
197-
RegexDefinitionConfiguration conf, RegexDefinitonSource source, RegexDefinitionSink sink
198-
|
199-
conf.hasFlow(source, sink) and
200-
regexNode = source.getRegexNode() and
201-
method = sink.getMethod() and
202-
this = sink.getExecutingCall()
203-
)
204-
}
205-
206-
override DataFlow::Node getRegexNode() { result = regexNode }
207-
208-
override DataFlow::Node getString() {
209-
result in [this.getArg(stringArg(method) - 1), this.getArgByName("string")]
210-
}
211-
}
212-
}

0 commit comments

Comments
 (0)