enable unicode support in the Python ReDoS query

erik-krogh · erik-krogh · commit 440e4b9a92e3 · 2021-07-11T21:28:40.000+02:00
diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll
@@ -473,46 +473,44 @@ class RegExpEscape extends RegExpNormalChar {
    * E.g. for `\u0061` this returns "a".
    */
   private string getUnicode() {
-    // TODO: Enable this once a supporting CLI is released.
-    // exists(int codepoint | codepoint = sum(getHexValueFromUnicode(_)) |
-    //   result = codepoint.toUnicode()
-    // )
-    none()
+    exists(int codepoint | codepoint = sum(getHexValueFromUnicode(_)) |
+      result = codepoint.toUnicode()
+    )
+  }
+
+  /**
+   * Gets the contribution of the `index`th hex digit of the unicode escape to its code point.
+   * E.g. for `\u0061` and `index = 2` this returns 96 (the digit `6` in the sixteens place: 6 * 16).
+   */
+  private int getHexValueFromUnicode(int index) {
+    isUnicode() and
+    exists(string hex, string char | hex = getText().suffix(2) |
+      char = hex.charAt(index) and
+      result = 16.pow(hex.length() - index - 1) * toHex(char)
+    )
   }
-  // TODO: Enable this once a supporting CLI is released.
-  // /**
-  //  * Gets int value for the `index`th char in the hex number of the unicode escape.
-  //  * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
-  //  */
-  // private int getHexValueFromUnicode(int index) {
-  //   isUnicode() and
-  //   exists(string hex, string char | hex = getText().suffix(2) |
-  //     char = hex.charAt(index) and
-  //     result = 16.pow(hex.length() - index - 1) * toHex(char)
-  //   )
-  // }
-}
-
-// TODO: Enable this once a supporting CLI is released.
-// /**
-//  * Gets the hex number for the `hex` char.
-//  */
-// private int toHex(string hex) {
-//   hex = [0 .. 9].toString() and
-//   result = hex.toInt()
-//   or
-//   result = 10 and hex = ["a", "A"]
-//   or
-//   result = 11 and hex = ["b", "B"]
-//   or
-//   result = 12 and hex = ["c", "C"]
-//   or
-//   result = 13 and hex = ["d", "D"]
-//   or
-//   result = 14 and hex = ["e", "E"]
-//   or
-//   result = 15 and hex = ["f", "F"]
-// }
+}
+
+/**
+ * Gets the int value (0 to 15) of the single hex digit `hex`, in either upper or lower case.
+ */
+private int toHex(string hex) {
+  hex = [0 .. 9].toString() and
+  result = hex.toInt()
+  or
+  result = 10 and hex = ["a", "A"]
+  or
+  result = 11 and hex = ["b", "B"]
+  or
+  result = 12 and hex = ["c", "C"]
+  or
+  result = 13 and hex = ["d", "D"]
+  or
+  result = 14 and hex = ["e", "E"]
+  or
+  result = 15 and hex = ["f", "F"]
+}
+
 /**
  * A character class escape in a regular expression.
  * That is, an escaped charachter that denotes multiple characters.
diff --git a/python/ql/test/query-tests/Security/CWE-730/ReDoS.expected b/python/ql/test/query-tests/Security/CWE-730/ReDoS.expected
@@ -92,5 +92,6 @@
 | redos.py:363:25:363:43 | ((?:a{0\|-)\|\\w\\{\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0'. |
 | redos.py:364:25:364:45 | ((?:a{0,\|-)\|\\w\\{\\d,)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,'. |
 | redos.py:365:25:365:48 | ((?:a{0,2\|-)\|\\w\\{\\d,\\d)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a{0,2'. |
+| redos.py:371:25:371:35 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
 | unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
 | unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
diff --git a/python/ql/test/query-tests/Security/CWE-730/redos.py b/python/ql/test/query-tests/Security/CWE-730/redos.py
@@ -366,3 +366,9 @@
 
 # GOOD:
 good42 = re.compile(r'''^((?:a{0,2}|-)|\w\{\d,\d\})+X$''')
+
+# NOT GOOD
+bad87 = re.compile(r'X(\u0061|a)*Y')
+
+# GOOD
+good43 = re.compile(r'X(\u0061|b)+Y')