cklin
diff --git a/‎python/change-notes/2021-09-14-promote-regex-injection.md
Lines changed: 2 additions & 0 deletions b/‎python/change-notes/2021-09-14-promote-regex-injection.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/ql/lib/semmle/python/Concepts.qll
Lines changed: 58 additions & 0 deletions b/‎python/ql/lib/semmle/python/Concepts.qll
Lines changed: 58 additions & 0 deletions
diff --git a/‎python/ql/lib/semmle/python/RegexTreeView.qll
Lines changed: 32 additions & 20 deletions b/‎python/ql/lib/semmle/python/RegexTreeView.qll
Lines changed: 32 additions & 20 deletions
diff --git a/‎python/ql/lib/semmle/python/frameworks/Stdlib.qll
Lines changed: 113 additions & 0 deletions b/‎python/ql/lib/semmle/python/frameworks/Stdlib.qll
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,2 @@
+lgtm,codescanning
+* The query "Regular expression injection" (`py/regex-injection`) has been promoted from experimental to the main query pack. Its results will now appear by default. This query was originally [submitted as an experimental query by @jorgectf](https://github.com/github/codeql/pull/5442).
@@ -355,6 +355,53 @@ module SqlExecution {
   }
 }
 
+/**
+ * A data-flow node that executes a regular expression.
+ *
+ * Extend this class to refine existing API models. If you want to model new APIs,
+ * extend `RegexExecution::Range` instead.
+ */
+class RegexExecution extends DataFlow::Node {
+  RegexExecution::Range range;
+
+  RegexExecution() { this = range }
+
+  /** Gets the data-flow node for the regex being executed by this node. */
+  DataFlow::Node getRegex() { result = range.getRegex() }
+
+  /** Gets a data-flow node for the string to be searched or matched against. */
+  DataFlow::Node getString() { result = range.getString() }
+
+  /**
+   * Gets the name of this regex execution, typically the name of an executing method.
+   * This is used for nice alert messages and should include the module if possible.
+   */
+  string getName() { result = range.getName() }
+}
+
+/** Provides classes for modeling new regular-expression execution APIs. */
+module RegexExecution {
+  /**
+   * A data-flow node that executes a regular expression.
+   *
+   * Extend this class to model new APIs. If you want to refine existing API models,
+   * extend `RegexExecution` instead.
+   */
+  abstract class Range extends DataFlow::Node {
+    /** Gets the data-flow node for the regex being executed by this node. */
+    abstract DataFlow::Node getRegex();
+
+    /** Gets a data-flow node for the string to be searched or matched against. */
+    abstract DataFlow::Node getString();
+
+    /**
+     * Gets the name of this regex execution, typically the name of an executing method.
+     * This is used for nice alert messages and should include the module if possible.
+     */
+    abstract string getName();
+  }
+}
+
 /**
  * A data-flow node that escapes meta-characters, which could be used to prevent
  * injection attacks.
@@ -411,6 +458,9 @@ module Escaping {
 
   /** Gets the escape-kind for escaping a string so it can safely be included in HTML. */
   string getHtmlKind() { result = "html" }
+
+  /** Gets the escape-kind for escaping a string so it can safely be included in a regular expression. */
+  string getRegexKind() { result = "regex" }
   // TODO: If adding an XML kind, update the modeling of the `MarkupSafe` PyPI package.
   //
   // Technically it claims to escape for both HTML and XML, but for now we don't have
@@ -427,6 +477,14 @@ class HtmlEscaping extends Escaping {
   HtmlEscaping() { range.getKind() = Escaping::getHtmlKind() }
 }
 
+/**
+ * An escape of a string so it can be safely included in
+ * the body of a regex, that is, an `Escaping` whose kind is `Escaping::getRegexKind()`.
+ */
+class RegexEscaping extends Escaping {
+  RegexEscaping() { range.getKind() = Escaping::getRegexKind() }
+}
+
 /** Provides classes for modeling HTTP-related APIs. */
 module HTTP {
   import semmle.python.web.HttpConstants
 
@@ -49,6 +49,7 @@ newtype TRegExpParent =
  * or another regular expression term.
  */
 class RegExpParent extends TRegExpParent {
+  /** Gets a textual representation of this element. */
   string toString() { result = "RegExpParent" }
 
   /** Gets the `i`th child term. */
@@ -72,14 +73,18 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
 
   override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() }
 
+  /** Holds if dot, `.`, matches all characters, including newlines. */
   predicate isDotAll() { re.getAMode() = "DOTALL" }
 
+  /** Holds if matching with this regex is case-insensitive. */
   predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
 
+  /** Gets a string representing all modes for this regex. */
   string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") }
 
   override Regex getRegex() { result = re }
 
+  /** Gets the primary QL class for this regex. */
   string getPrimaryQLClass() { result = "RegExpLiteral" }
 }
 
@@ -246,8 +251,10 @@ class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier {
     result.getEnd() = part_end
   }
 
+  /** Holds if this term may match an unlimited number of times. */
   predicate mayRepeatForever() { may_repeat_forever = true }
 
+  /** Gets the qualifier for this term, e.g. "?" for "a?". */
   string getQualifier() { result = re.getText().substring(part_end, end) }
 
   override string getPrimaryQLClass() { result = "RegExpQuantifier" }
@@ -322,8 +329,10 @@ class RegExpRange extends RegExpQuantifier {
 
   RegExpRange() { re.multiples(part_end, end, lower, upper) }
 
+  /** Gets the string defining the upper bound of this range, if any. */
   string getUpper() { result = upper }
 
+  /** Gets the string defining the lower bound of this range, if any. */
   string getLower() { result = lower }
 
   /**
@@ -465,11 +474,13 @@ class RegExpEscape extends RegExpNormalChar {
     result = this.getUnicode()
   }
 
+  /** Holds if this term's name is given by the part following the escape character. */
   predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] }
 
   override string getPrimaryQLClass() { result = "RegExpEscape" }
 
-  string getUnescaped() { result = this.getText().suffix(1) }
+  /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */
+  private string getUnescaped() { result = this.getText().suffix(1) }
 
   /**
    * Gets the text for this escape. That is e.g. "\w".
@@ -536,15 +547,8 @@ private int toHex(string hex) {
  * ```
  */
 class RegExpCharacterClassEscape extends RegExpEscape {
-  // string value;
-  RegExpCharacterClassEscape() {
-    // value = re.getText().substring(start + 1, end) and
-    // value in ["d", "D", "s", "S", "w", "W"]
-    this.getValue() in ["d", "D", "s", "S", "w", "W"]
-  }
+  RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] }
 
-  /** Gets the name of the character class; for example, `w` for `\w`. */
-  // override string getValue() { result = value }
   override RegExpTerm getChild(int i) { none() }
 
   override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
@@ -563,10 +567,13 @@ class RegExpCharacterClassEscape extends RegExpEscape {
 class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass {
   RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) }
 
+  /** Holds if this character class is inverted, matching the opposite of its content. */
   predicate isInverted() { re.getChar(start + 1) = "^" }
 
+  /** Gets the `i`th char inside this character class. */
   string getCharThing(int i) { result = re.getChar(i + start) }
 
+  /** Holds if this character class can match anything. */
   predicate isUniversalClass() {
     // [^]
     this.isInverted() and not exists(this.getAChild())
@@ -620,6 +627,7 @@ class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
     re.charRange(_, start, lower_end, upper_start, end)
   }
 
+  /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */
   predicate isRange(string lo, string hi) {
     lo = re.getText().substring(start, lower_end) and
     hi = re.getText().substring(upper_start, end)
@@ -653,8 +661,13 @@ class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
 class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar {
   RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) }
 
+  /**
+   * Holds if this constant represents a valid Unicode character (as opposed
+   * to a surrogate code point that does not correspond to a character by itself.)
+   */
   predicate isCharacter() { any() }
 
+  /** Gets the string representation of the char matched by this term. */
   string getValue() { result = re.getText().substring(start, end) }
 
   override RegExpTerm getChild(int i) { none() }
@@ -684,15 +697,15 @@ class RegExpConstant extends RegExpTerm {
       qstart <= start and end <= qend
     ) and
     value = this.(RegExpNormalChar).getValue()
-    // This will never hold
-    // or
-    // this = TRegExpSpecialChar(re, start, end) and
-    // re.inCharSet(start) and
-    // value = this.(RegExpSpecialChar).getChar()
   }
 
+  /**
+   * Holds if this constant represents a valid Unicode character (as opposed
+   * to a surrogate code point that does not correspond to a character by itself.)
+   */
   predicate isCharacter() { any() }
 
+  /** Gets the string matched by this constant term. */
   string getValue() { result = value }
 
   override RegExpTerm getChild(int i) { none() }
@@ -731,10 +744,6 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
   /** Gets the name of this capture group, if any. */
   string getName() { result = re.getGroupName(start, end) }
 
-  predicate isCharacter() { any() }
-
-  string getValue() { result = re.getText().substring(start, end) }
-
   override RegExpTerm getChild(int i) {
     result.getRegex() = re and
     i = 0 and
@@ -762,8 +771,13 @@ class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar {
     re.specialCharacter(start, end, char)
   }
 
+  /**
+   * Holds if this constant represents a valid Unicode character (as opposed
+   * to a surrogate code point that does not correspond to a character by itself.)
+   */
   predicate isCharacter() { any() }
 
+  /** Gets the char for this term. */
   string getChar() { result = char }
 
   override RegExpTerm getChild(int i) { none() }
@@ -828,8 +842,6 @@ class RegExpCaret extends RegExpSpecialChar {
 class RegExpZeroWidthMatch extends RegExpGroup {
   RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) }
 
-  override predicate isCharacter() { any() }
-
   override RegExpTerm getChild(int i) { none() }
 
   override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" }
 
@@ -1636,6 +1636,119 @@ private module StdlibPrivate {
       result = this.getArg(any(int i | i >= msgIndex))
     }
   }
+
+  // ---------------------------------------------------------------------------
+  // re
+  // ---------------------------------------------------------------------------
+  /**
+   * The name of a function in the `re` module that immediately executes a regular expression.
+   *
+   * See https://docs.python.org/3/library/re.html#module-contents
+   */
+  private class RegexExecutionMethod extends string {
+    RegexExecutionMethod() {
+      this in ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"]
+    }
+
+    /** Gets the positional index of the string to be searched, for a call of the form `re.<name>(pattern, ...)`. */
+    int getStringArgIndex() {
+      this in ["match", "fullmatch", "search", "split", "findall", "finditer"] and
+      result = 1
+      or
+      this in ["sub", "subn"] and
+      result = 2
+    }
+  }
+
+  /**
+   * A call to a function from the `re` module immediately executing a regular expression.
+   *
+   * See `RegexExecutionMethod`
+   */
+  private class DirectRegexExecution extends DataFlow::CallCfgNode, RegexExecution::Range {
+    RegexExecutionMethod method;
+
+    DirectRegexExecution() { this = API::moduleImport("re").getMember(method).getACall() }
+
+    override DataFlow::Node getRegex() { result in [this.getArg(0), this.getArgByName("pattern")] }
+
+    override DataFlow::Node getString() {
+      result in [this.getArg(method.getStringArgIndex()), this.getArgByName("string")]
+    }
+
+    override string getName() { result = "re." + method }
+  }
+
+  /** Helper module for tracking the results of `re.compile` calls, together with their pattern argument. */
+  private module CompiledRegexes {
+    private DataFlow::TypeTrackingNode compiledRegex(DataFlow::TypeTracker t, DataFlow::Node regex) {
+      t.start() and
+      result = API::moduleImport("re").getMember("compile").getACall() and
+      regex in [
+          result.(DataFlow::CallCfgNode).getArg(0),
+          result.(DataFlow::CallCfgNode).getArgByName("pattern")
+        ]
+      or
+      exists(DataFlow::TypeTracker t2 | result = compiledRegex(t2, regex).track(t2, t))
+    }
+
+    DataFlow::Node compiledRegex(DataFlow::Node regex) {
+      compiledRegex(DataFlow::TypeTracker::end(), regex).flowsTo(result)
+    }
+  }
+
+  private import CompiledRegexes
+
+  /**
+   * A call on a compiled regular expression (obtained via `re.compile`) executing a
+   * regular expression.
+   *
+   * Given the following example:
+   *
+   * ```py
+   * pattern = re.compile(input)
+   * pattern.match(s)
+   * ```
+   *
+   * This class will identify that `re.compile` compiles `input` and afterwards
+   * executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)`
+   * and `this.getRegex()` will return the node for `input` (`re.compile`'s first argument).
+   * The string argument is looked up one position earlier than for the `re.<name>` functions.
+   *
+   * See `RegexExecutionMethod`
+   *
+   * See https://docs.python.org/3/library/re.html#regular-expression-objects
+   */
+  private class CompiledRegexExecution extends DataFlow::MethodCallNode, RegexExecution::Range {
+    DataFlow::Node regexNode;
+    RegexExecutionMethod method;
+
+    CompiledRegexExecution() { this.calls(compiledRegex(regexNode), method) }
+
+    override DataFlow::Node getRegex() { result = regexNode }
+
+    override DataFlow::Node getString() {
+      result in [this.getArg(method.getStringArgIndex() - 1), this.getArgByName("string")]
+    }
+
+    override string getName() { result = "re." + method }
+  }
+
+  /**
+   * A call to `re.escape`.
+   * See https://docs.python.org/3/library/re.html#re.escape
+   */
+  private class ReEscapeCall extends Escaping::Range, DataFlow::CallCfgNode {
+    ReEscapeCall() { this = API::moduleImport("re").getMember("escape").getACall() }
+
+    override DataFlow::Node getAnInput() {
+      result in [this.getArg(0), this.getArgByName("pattern")]
+    }
+
+    override DataFlow::Node getOutput() { result = this }
+
+    override string getKind() { result = Escaping::getRegexKind() }
+  }
 }
 
 // ---------------------------------------------------------------------------
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+lgtm,codescanning`
	`2`	+* The query "Regular expression injection" (`py/regex-injection`) has been promoted from experimental to the main query pack. Its results will now appear by default. This query was originally [submitted as an experimental query by @jorgectf](https://github.com/github/codeql/pull/5442).