Skip to content

Commit 7cd5e68

Browse files
authored
Merge pull request github#6693 from yoff/python/promote-regex-injection
Python: Promote `py/regex-injection`
2 parents 47a85bb + 83490e9 commit 7cd5e68

File tree

20 files changed

+318
-377
lines changed

20 files changed

+318
-377
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lgtm,codescanning
2+
* The query "Regular expression injection" (`py/regex-injection`) has been promoted from experimental to the main query pack. Its results will now appear by default. This query was originally [submitted as an experimental query by @jorgectf](https://github.com/github/codeql/pull/5442).

python/ql/lib/semmle/python/Concepts.qll

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,53 @@ module SqlExecution {
355355
}
356356
}
357357

358+
/**
359+
* A data-flow node that executes a regular expression.
360+
*
361+
* Extend this class to refine existing API models. If you want to model new APIs,
362+
* extend `RegexExecution::Range` instead.
363+
*/
364+
class RegexExecution extends DataFlow::Node {
365+
RegexExecution::Range range;
366+
367+
RegexExecution() { this = range }
368+
369+
/** Gets the data flow node for the regex being executed by this node. */
370+
DataFlow::Node getRegex() { result = range.getRegex() }
371+
372+
/** Gets a dataflow node for the string to be searched or matched against. */
373+
DataFlow::Node getString() { result = range.getString() }
374+
375+
/**
376+
* Gets the name of this regex execution, typically the name of an executing method.
377+
* This is used for nice alert messages and should include the module if possible.
378+
*/
379+
string getName() { result = range.getName() }
380+
}
381+
382+
/** Provides classes for modeling new regular-expression execution APIs. */
383+
module RegexExecution {
384+
/**
385+
* A data-flow node that executes a regular expression.
386+
*
387+
* Extend this class to model new APIs. If you want to refine existing API models,
388+
* extend `RegexExecution` instead.
389+
*/
390+
abstract class Range extends DataFlow::Node {
391+
/** Gets the data flow node for the regex being executed by this node. */
392+
abstract DataFlow::Node getRegex();
393+
394+
/** Gets a dataflow node for the string to be searched or matched against. */
395+
abstract DataFlow::Node getString();
396+
397+
/**
398+
* Gets the name of this regex execution, typically the name of an executing method.
399+
* This is used for nice alert messages and should include the module if possible.
400+
*/
401+
abstract string getName();
402+
}
403+
}
404+
358405
/**
359406
* A data-flow node that escapes meta-characters, which could be used to prevent
360407
* injection attacks.
@@ -411,6 +458,9 @@ module Escaping {
411458

412459
/** Gets the escape-kind for escaping a string so it can safely be included in HTML. */
413460
string getHtmlKind() { result = "html" }
461+
462+
/** Gets the escape-kind for escaping a string so it can safely be included in a regular expression. */
463+
string getRegexKind() { result = "regex" }
414464
// TODO: If adding an XML kind, update the modeling of the `MarkupSafe` PyPI package.
415465
//
416466
// Technically it claims to escape for both HTML and XML, but for now we don't have
@@ -427,6 +477,14 @@ class HtmlEscaping extends Escaping {
427477
HtmlEscaping() { range.getKind() = Escaping::getHtmlKind() }
428478
}
429479

480+
/**
481+
* An escape of a string so it can be safely included in
482+
* the body of a regex.
483+
*/
484+
class RegexEscaping extends Escaping {
485+
RegexEscaping() { range.getKind() = Escaping::getRegexKind() }
486+
}
487+
430488
/** Provides classes for modeling HTTP-related APIs. */
431489
module HTTP {
432490
import semmle.python.web.HttpConstants

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ newtype TRegExpParent =
4949
* or another regular expression term.
5050
*/
5151
class RegExpParent extends TRegExpParent {
52+
/** Gets a textual representation of this element. */
5253
string toString() { result = "RegExpParent" }
5354

5455
/** Gets the `i`th child term. */
@@ -72,14 +73,18 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
7273

7374
override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() }
7475

76+
/** Holds if dot, `.`, matches all characters, including newlines. */
7577
predicate isDotAll() { re.getAMode() = "DOTALL" }
7678

79+
/** Holds if matching with this regex is case-insensitive. */
7780
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
7881

82+
/** Gets a string representing all modes for this regex. */
7983
string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") }
8084

8185
override Regex getRegex() { result = re }
8286

87+
/** Gets the primary QL class for this regex. */
8388
string getPrimaryQLClass() { result = "RegExpLiteral" }
8489
}
8590

@@ -246,8 +251,10 @@ class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier {
246251
result.getEnd() = part_end
247252
}
248253

254+
/** Holds if this term may match an unlimited number of times. */
249255
predicate mayRepeatForever() { may_repeat_forever = true }
250256

257+
/** Gets the qualifier for this term. That is e.g "?" for "a?". */
251258
string getQualifier() { result = re.getText().substring(part_end, end) }
252259

253260
override string getPrimaryQLClass() { result = "RegExpQuantifier" }
@@ -322,8 +329,10 @@ class RegExpRange extends RegExpQuantifier {
322329

323330
RegExpRange() { re.multiples(part_end, end, lower, upper) }
324331

332+
/** Gets the string defining the upper bound of this range, if any. */
325333
string getUpper() { result = upper }
326334

335+
/** Gets the string defining the lower bound of this range, if any. */
327336
string getLower() { result = lower }
328337

329338
/**
@@ -465,11 +474,13 @@ class RegExpEscape extends RegExpNormalChar {
465474
result = this.getUnicode()
466475
}
467476

477+
/** Holds if this term's name is given by the part following the escape character. */
468478
predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] }
469479

470480
override string getPrimaryQLClass() { result = "RegExpEscape" }
471481

472-
string getUnescaped() { result = this.getText().suffix(1) }
482+
/** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */
483+
private string getUnescaped() { result = this.getText().suffix(1) }
473484

474485
/**
475486
* Gets the text for this escape. That is e.g. "\w".
@@ -536,15 +547,8 @@ private int toHex(string hex) {
536547
* ```
537548
*/
538549
class RegExpCharacterClassEscape extends RegExpEscape {
539-
// string value;
540-
RegExpCharacterClassEscape() {
541-
// value = re.getText().substring(start + 1, end) and
542-
// value in ["d", "D", "s", "S", "w", "W"]
543-
this.getValue() in ["d", "D", "s", "S", "w", "W"]
544-
}
550+
RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] }
545551

546-
/** Gets the name of the character class; for example, `w` for `\w`. */
547-
// override string getValue() { result = value }
548552
override RegExpTerm getChild(int i) { none() }
549553

550554
override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
@@ -563,10 +567,13 @@ class RegExpCharacterClassEscape extends RegExpEscape {
563567
class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass {
564568
RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) }
565569

570+
/** Holds if this character class is inverted, matching the opposite of its content. */
566571
predicate isInverted() { re.getChar(start + 1) = "^" }
567572

573+
/** Gets the `i`th char inside this character class. */
568574
string getCharThing(int i) { result = re.getChar(i + start) }
569575

576+
/** Holds if this character class can match anything. */
570577
predicate isUniversalClass() {
571578
// [^]
572579
this.isInverted() and not exists(this.getAChild())
@@ -620,6 +627,7 @@ class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
620627
re.charRange(_, start, lower_end, upper_start, end)
621628
}
622629

630+
/** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */
623631
predicate isRange(string lo, string hi) {
624632
lo = re.getText().substring(start, lower_end) and
625633
hi = re.getText().substring(upper_start, end)
@@ -653,8 +661,13 @@ class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
653661
class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar {
654662
RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) }
655663

664+
/**
665+
* Holds if this constant represents a valid Unicode character (as opposed
666+
* to a surrogate code point that does not correspond to a character by itself.)
667+
*/
656668
predicate isCharacter() { any() }
657669

670+
/** Gets the string representation of the char matched by this term. */
658671
string getValue() { result = re.getText().substring(start, end) }
659672

660673
override RegExpTerm getChild(int i) { none() }
@@ -684,15 +697,15 @@ class RegExpConstant extends RegExpTerm {
684697
qstart <= start and end <= qend
685698
) and
686699
value = this.(RegExpNormalChar).getValue()
687-
// This will never hold
688-
// or
689-
// this = TRegExpSpecialChar(re, start, end) and
690-
// re.inCharSet(start) and
691-
// value = this.(RegExpSpecialChar).getChar()
692700
}
693701

702+
/**
703+
* Holds if this constant represents a valid Unicode character (as opposed
704+
* to a surrogate code point that does not correspond to a character by itself.)
705+
*/
694706
predicate isCharacter() { any() }
695707

708+
/** Gets the string matched by this constant term. */
696709
string getValue() { result = value }
697710

698711
override RegExpTerm getChild(int i) { none() }
@@ -731,10 +744,6 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
731744
/** Gets the name of this capture group, if any. */
732745
string getName() { result = re.getGroupName(start, end) }
733746

734-
predicate isCharacter() { any() }
735-
736-
string getValue() { result = re.getText().substring(start, end) }
737-
738747
override RegExpTerm getChild(int i) {
739748
result.getRegex() = re and
740749
i = 0 and
@@ -762,8 +771,13 @@ class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar {
762771
re.specialCharacter(start, end, char)
763772
}
764773

774+
/**
775+
* Holds if this constant represents a valid Unicode character (as opposed
776+
* to a surrogate code point that does not correspond to a character by itself.)
777+
*/
765778
predicate isCharacter() { any() }
766779

780+
/** Gets the char for this term. */
767781
string getChar() { result = char }
768782

769783
override RegExpTerm getChild(int i) { none() }
@@ -828,8 +842,6 @@ class RegExpCaret extends RegExpSpecialChar {
828842
class RegExpZeroWidthMatch extends RegExpGroup {
829843
RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) }
830844

831-
override predicate isCharacter() { any() }
832-
833845
override RegExpTerm getChild(int i) { none() }
834846

835847
override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" }

python/ql/lib/semmle/python/frameworks/Stdlib.qll

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1636,6 +1636,119 @@ private module StdlibPrivate {
16361636
result = this.getArg(any(int i | i >= msgIndex))
16371637
}
16381638
}
1639+
1640+
// ---------------------------------------------------------------------------
1641+
// re
1642+
// ---------------------------------------------------------------------------
1643+
/**
1644+
* List of methods in the `re` module immediately executing a regular expression.
1645+
*
1646+
* See https://docs.python.org/3/library/re.html#module-contents
1647+
*/
1648+
private class RegexExecutionMethod extends string {
1649+
RegexExecutionMethod() {
1650+
this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"]
1651+
}
1652+
1653+
/** Gets the index of the argument representing the string to be searched by a regex. */
1654+
int getStringArgIndex() {
1655+
this in ["match", "fullmatch", "search", "split", "findall", "finditer"] and
1656+
result = 1
1657+
or
1658+
this in ["sub", "subn"] and
1659+
result = 2
1660+
}
1661+
}
1662+
1663+
/**
1664+
* A call to a method from the `re` module immediately executing a regular expression.
1665+
*
1666+
* See `RegexExecutionMethod`
1667+
*/
1668+
private class DirectRegexExecution extends DataFlow::CallCfgNode, RegexExecution::Range {
1669+
RegexExecutionMethod method;
1670+
1671+
DirectRegexExecution() { this = API::moduleImport("re").getMember(method).getACall() }
1672+
1673+
override DataFlow::Node getRegex() { result in [this.getArg(0), this.getArgByName("pattern")] }
1674+
1675+
override DataFlow::Node getString() {
1676+
result in [this.getArg(method.getStringArgIndex()), this.getArgByName("string")]
1677+
}
1678+
1679+
override string getName() { result = "re." + method }
1680+
}
1681+
1682+
/** Helper module for tracking compiled regexes. */
1683+
private module CompiledRegexes {
1684+
private DataFlow::TypeTrackingNode compiledRegex(DataFlow::TypeTracker t, DataFlow::Node regex) {
1685+
t.start() and
1686+
result = API::moduleImport("re").getMember("compile").getACall() and
1687+
regex in [
1688+
result.(DataFlow::CallCfgNode).getArg(0),
1689+
result.(DataFlow::CallCfgNode).getArgByName("pattern")
1690+
]
1691+
or
1692+
exists(DataFlow::TypeTracker t2 | result = compiledRegex(t2, regex).track(t2, t))
1693+
}
1694+
1695+
DataFlow::Node compiledRegex(DataFlow::Node regex) {
1696+
compiledRegex(DataFlow::TypeTracker::end(), regex).flowsTo(result)
1697+
}
1698+
}
1699+
1700+
private import CompiledRegexes
1701+
1702+
/**
1703+
* A call on a compiled regular expression (obtained via `re.compile`) executing a
1704+
* regular expression.
1705+
*
1706+
* Given the following example:
1707+
*
1708+
* ```py
1709+
* pattern = re.compile(input)
1710+
* pattern.match(s)
1711+
* ```
1712+
*
1713+
* This class will identify that `re.compile` compiles `input` and afterwards
1714+
* executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)`
1715+
* and `this.getRegex()` will return the node for `input` (`re.compile`'s first argument).
1716+
*
1717+
*
1718+
* See `RegexExecutionMethod`
1719+
*
1720+
* See https://docs.python.org/3/library/re.html#regular-expression-objects
1721+
*/
1722+
private class CompiledRegexExecution extends DataFlow::MethodCallNode, RegexExecution::Range {
1723+
DataFlow::Node regexNode;
1724+
RegexExecutionMethod method;
1725+
1726+
CompiledRegexExecution() { this.calls(compiledRegex(regexNode), method) }
1727+
1728+
override DataFlow::Node getRegex() { result = regexNode }
1729+
1730+
override DataFlow::Node getString() {
1731+
result in [this.getArg(method.getStringArgIndex() - 1), this.getArgByName("string")]
1732+
}
1733+
1734+
override string getName() { result = "re." + method }
1735+
}
1736+
1737+
/**
1738+
* A call to 're.escape'.
1739+
* See https://docs.python.org/3/library/re.html#re.escape
1740+
*/
1741+
private class ReEscapeCall extends Escaping::Range, DataFlow::CallCfgNode {
1742+
ReEscapeCall() { this = API::moduleImport("re").getMember("escape").getACall() }
1743+
1744+
override DataFlow::Node getAnInput() {
1745+
result in [this.getArg(0), this.getArgByName("pattern")]
1746+
}
1747+
1748+
override DataFlow::Node getOutput() { result = this }
1749+
1750+
override string getKind() { result = Escaping::getRegexKind() }
1751+
}
16391752
}
16401753

16411754
// ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)