Merge pull request github#14481 from erik-krogh/proper-codepoints

erik-krogh · web-flow · commit b1ad61e27d41 · 2023-10-13T09:35:55.000+02:00
ReDoS: use the new codePointAt and codePointCount methods instead of regex hacks
diff --git a/docs/codeql/ql-language-reference/ql-language-specification.rst b/docs/codeql/ql-language-reference/ql-language-specification.rst
@@ -1993,6 +1993,10 @@ The following built-in predicates are members of type ``int``:
 +-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
 | ``toUnicode``           | string      |                | The result is the unicode character for the receiver seen as a unicode code point.                             |
 +-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
+| ``codePointAt``         | int         | int            | The result is the unicode code point at the index given by the argument.                                       |
++-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
+| ``codePointCount``      | int         | int, int       | The result is the number of unicode code points in the receiver between the given indices.                     |
++-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
 
 The leftmost bit after ``bitShiftRightSigned`` depends on sign extension, whereas after ``bitShiftRight`` it is zero.
 
diff --git a/shared/regex/codeql/regex/nfa/NfaUtils.qll b/shared/regex/codeql/regex/nfa/NfaUtils.qll
@@ -164,17 +164,17 @@ module Make<RegexTreeViewSig TreeImpl> {
     /** An input symbol corresponding to character `c`. */
     Char(string c) {
       c =
-        getCodepointAt(any(RegexpCharacterConstant cc |
+        getACodepoint(any(RegexpCharacterConstant cc |
             cc instanceof RelevantRegExpTerm and
             not isIgnoreCase(cc.getRootTerm())
-          ).getValue(), _)
+          ).getValue())
       or
       // normalize everything to lower case if the regexp is case insensitive
       c =
         any(RegexpCharacterConstant cc, string char |
           cc instanceof RelevantRegExpTerm and
           isIgnoreCase(cc.getRootTerm()) and
-          char = getCodepointAt(cc.getValue(), _)
+          char = getACodepoint(cc.getValue())
         |
           char.toLowerCase()
         )
@@ -370,7 +370,7 @@ module Make<RegexTreeViewSig TreeImpl> {
     string getARelevantChar() {
       exists(asciiPrintable(result))
       or
-      exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
+      exists(RegexpCharacterConstant c | result = getACodepoint(c.getValue()))
       or
       classEscapeMatches(_, result)
     }
@@ -1258,7 +1258,7 @@ module Make<RegexTreeViewSig TreeImpl> {
        * Gets a `char` that occurs in a `pump` string.
        */
       private string getAProcessChar() {
-        result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
+        result = getACodepoint(any(string s | isReDoSCandidate(_, s)))
       }
     }
 
diff --git a/shared/util/codeql/util/Strings.qll b/shared/util/codeql/util/Strings.qll
@@ -18,7 +18,11 @@ string escape(string s) {
 bindingset[s]
 private string escapeUnicodeString(string s) {
   result =
-    concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
+    concat(int i, string char |
+      char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
+    |
+      char order by i
+    )
 }
 
 /**
@@ -44,15 +48,32 @@ private predicate isPrintable(string char) {
 
 /**
  * Gets the `i`th codepoint in `s`.
+ * Unpaired surrogates are skipped.
+ */
+bindingset[s]
+string getCodepointAt(string s, int i) {
+  // codePointAt returns the integer codePoint, so we need to convert to a string.
+  // codePointAt returns an integer at the index of both the high and the low surrogate of a pair. The invalid ones are filtered out by `toUnicode`, which leaves gaps in the indices, so `rank` is used to re-number the remaining characters.
+  // rank is 1-indexed, so we need to offset for that to make this predicate 0-indexed.
+  result =
+    rank[i + 1](string char, int charIndex |
+      char = s.codePointAt(charIndex).toUnicode()
+    |
+      char order by charIndex
+    )
+}
+
+/**
+ * Gets any unicode character that appears in `s`.
  */
 bindingset[s]
-string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
+string getACodepoint(string s) { result = s.codePointAt(_).toUnicode() }
 
 /**
- * Gets the length of `s` in codepoints.
+ * Gets the number of unicode codepoints in `s` not counting unpaired surrogates.
  */
 bindingset[str]
-int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
+int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }
 
 /**
  * Gets the ASCII code for `char`.