use the new codePointAt and codePointCount methods instead of regex hacks

erik-krogh · erik-krogh · commit 116025c569d4 · 2023-10-12T13:38:19.000+02:00
diff --git a/shared/util/codeql/util/Strings.qll b/shared/util/codeql/util/Strings.qll
@@ -18,7 +18,11 @@ string escape(string s) {
 bindingset[s]
 private string escapeUnicodeString(string s) {
   result =
-    concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
+    concat(int i, string char |
+      char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
+    |
+      char order by i
+    )
 }
 
 /**
@@ -44,15 +48,26 @@ private predicate isPrintable(string char) {
 
 /**
  * Gets the `i`th codepoint in `s`.
+ * Unpaired surrogates are skipped.
  */
 bindingset[s]
-string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
+string getCodepointAt(string s, int i) {
+  // `codePointAt` returns the code point as an integer, so we convert it to a string with `toUnicode`.
+  // `codePointAt` returns an integer for both the high and the low surrogate of a pair. The invalid results are filtered out by `toUnicode`, which leaves gaps in the indices, so we use `rank` to re-number them.
+  // `rank` is 1-indexed, so we offset by one to keep this predicate 0-indexed.
+  result =
+    rank[i + 1](string char, int charIndex |
+      char = s.codePointAt(charIndex).toUnicode()
+    |
+      char order by charIndex
+    )
+}
 
 /**
- * Gets the length of `s` in codepoints.
+ * Gets the number of Unicode codepoints in `str`, not counting unpaired surrogates.
  */
 bindingset[str]
-int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
+int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }
 
 /**
  * Gets the ASCII code for `char`.