Merge branch 'python-port-ReDoS' of github.com:yoff/codeql into python-port-ReDoS

yoff · yoff · commit 52d91917aab9 · 2021-06-30T12:25:59.000+02:00
diff --git a/python/change-notes/2021-07-28-port-RoDoS-queries.md b/python/change-notes/2021-07-28-port-RoDoS-queries.md
@@ -1,3 +1,3 @@
 lgtm,codescanning
-* Ported _Inefficient regular expression_ (`py/redos`) query from javascript.
-* Ported _Polynomial regular expression used on uncontrolled data_ [`py/polynomial-redos`] query from javascript.
+* Added _Inefficient regular expression_ (`py/redos`) query, which is already available in JavaScript.
+* Added _Polynomial regular expression used on uncontrolled data_ (`py/polynomial-redos`) query, which is already available in JavaScript.
diff --git a/python/ql/src/semmle/python/regex.qll b/python/ql/src/semmle/python/regex.qll
@@ -130,36 +130,57 @@ abstract class RegexString extends Expr {
   /** result is true for those start chars that actually mark a start of a char set. */
   boolean char_set_start(int pos) {
     exists(int index |
-      char_set_delimiter(index, pos) = true and
+      // is opening bracket
+      this.char_set_delimiter(index, pos) = true and
       (
-        index = 1 and result = true // if a '[' is first in the string (among brackets), it starts a char set
+        // if this is the first bracket, `pos` starts a char set
+        index = 1 and result = true
         or
+        // if the previous char set delimiter was not a closing bracket, `pos` does
+        // not start a char set. This is needed to handle cases such as `[[]` (a
+        // char set that matches the `[` char)
         index > 1 and
-        not char_set_delimiter(index - 1, _) = false and
+        not this.char_set_delimiter(index - 1, _) = false and
         result = false
         or
-        exists(int p1 |
-          char_set_delimiter(index - 1, p1) = false and // if it is preceded by a closing bracket, it starts a char set
+        // special handling of cases such as `[][]` (the character-set of the characters `]` and `[`).
+        exists(int prev_closing_bracket_pos |
+          // previous bracket is a closing bracket
+          this.char_set_delimiter(index - 1, prev_closing_bracket_pos) = false and
           if
-            exists(int p2 |
-              p1 = p2 + 1
-              or
-              this.getChar(p2 + 1) = "^" and
-              p1 = p2 + 2
+            // check if the character that comes before the previous closing bracket
+            // is an opening bracket (taking `^` into account)
+            exists(int pos_before_prev_closing_bracket |
+              if this.getChar(prev_closing_bracket_pos - 1) = "^"
+              then pos_before_prev_closing_bracket = prev_closing_bracket_pos - 2
+              else pos_before_prev_closing_bracket = prev_closing_bracket_pos - 1
             |
-              char_set_delimiter(index - 2, p2) = true // but the closing bracket only closes...
+              this.char_set_delimiter(index - 2, pos_before_prev_closing_bracket) = true
             )
           then
-            exists(int p2 | char_set_delimiter(index - 2, p2) = true |
-              result = char_set_start(p2).booleanNot() // ...if it is not the first in a char set
+            // brackets without anything in between are not valid character ranges, so
+            // the first closing bracket in `[]]` and `[^]]` does not count,
+            //
+            // and we should _not_ mark the second opening bracket in `[][]` and `[^][]`
+            // as starting a new char set.                               ^           ^
+            exists(int pos_before_prev_closing_bracket |
+              this.char_set_delimiter(index - 2, pos_before_prev_closing_bracket) = true
+            |
+              result = this.char_set_start(pos_before_prev_closing_bracket).booleanNot()
             )
-          else result = true
+          else
+            // if not, `pos` does in fact mark a real start of a char set
+            result = true
         )
       )
     )
   }
 
-  /** result denotes if the index is a left bracket */
+  /** 
+   * Helper predicate for chars that could be character-set delimiters. 
+   * Holds if the (non-escaped) char at `pos` in the string is the (one-based) `index`th occurrence of a bracket (`[` or `]`) in the string.
+   * Result is `true` if the char is `[`, and `false` if the char is `]`.
+   */
   boolean char_set_delimiter(int index, int pos) {
     pos = rank[index](int p | this.nonEscapedCharAt(p) = "[" or this.nonEscapedCharAt(p) = "]") and
     (
diff --git a/python/ql/test/library-tests/regex/charRangeTest.py b/python/ql/test/library-tests/regex/charRangeTest.py
@@ -1,13 +1,26 @@
 import re
 
-re.compile(r'[]-[]') #$ charRange=1:2-3:4
+
+re.compile(r'[A-Z]') #$ charRange=1:2-3:4
+
+try:
+    re.compile(r'[]-[]') #$ SPURIOUS: charRange=1:2-3:4
+    raise Exception("this should not be reached")
+except re.error:
+    pass
+
 re.compile(r'[---]') #$ charRange=1:2-3:4
 re.compile(r'[\---]') #$ charRange=1:3-4:5
 re.compile(r'[--\-]') #$ charRange=1:2-3:5
 re.compile(r'[\--\-]') #$ charRange=1:3-4:6
 re.compile(r'[0-9-A-Z]') #$ charRange=1:2-3:4 charRange=5:6-7:8
 re.compile(r'[0\-9-A-Z]') #$ charRange=4:5-6:7
-re.compile(r'[0--9-A-Z]') #$ charRange=1:2-3:4 charRange=4:5-6:7
+
+try:
+    re.compile(r'[0--9-A-Z]') #$ SPURIOUS: charRange=1:2-3:4 charRange=4:5-6:7
+    raise Exception("this should not be reached")
+except re.error:
+    pass
 
 re.compile(r'[^A-Z]') #$ charRange=2:3-4:5
 
diff --git a/python/ql/test/library-tests/regex/charSetTest.py b/python/ql/test/library-tests/regex/charSetTest.py
@@ -10,8 +10,18 @@
 re.compile("[[]]") #$ charSet=0:3
 re.compile("[^]]") #$ charSet=0:4
 re.compile("[^-]") #$ charSet=0:4
-re.compile("[]-[]") #$ charSet=0:5
-re.compile("[^]-[]") #$ charSet=0:6
+
+try:
+    re.compile("[]-[]") #$ SPURIOUS: charSet=0:5
+    raise Exception("this should not be reached")
+except re.error:
+    pass
+
+try:
+    re.compile("[^]-[]") #$ SPURIOUS: charSet=0:6
+    raise Exception("this should not be reached")
+except re.error:
+    pass
 
 re.compile("]]][[[[]") #$ charSet=3:8