Skip to content

Commit 116025c

Browse files
committed
use the new codePointAt and codePointCount methods instead of regex hacks
1 parent 59c43c7 commit 116025c

File tree

1 file changed

+19
-4
lines changed

1 file changed

+19
-4
lines changed

shared/util/codeql/util/Strings.qll

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ string escape(string s) {
1818
bindingset[s]
1919
private string escapeUnicodeString(string s) {
2020
result =
21-
concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
21+
concat(int i, string char |
22+
char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
23+
|
24+
char order by i
25+
)
2226
}
2327

2428
/**
@@ -44,15 +48,26 @@ private predicate isPrintable(string char) {
4448

4549
/**
4650
* Gets the `i`th codepoint in `s`.
51+
* Unpaired surrogates are skipped.
4752
*/
4853
bindingset[s]
49-
string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
54+
string getCodepointAt(string s, int i) {
55+
// codePointAt returns the integer codePoint, so we need to convert to a string.
56+
// `codePointAt` yields an integer at the index of both the high and the low half of a surrogate pair. The invalid ones are filtered out by `toUnicode`, which leaves gaps in the indices, so we use `rank` to renumber the remaining codepoints.
57+
// rank is 1-indexed, so we need to offset for that to make this predicate 0-indexed.
58+
result =
59+
rank[i + 1](string char, int charIndex |
60+
char = s.codePointAt(charIndex).toUnicode()
61+
|
62+
char order by charIndex
63+
)
64+
}
5065

5166
/**
52-
* Gets the length of `s` in codepoints.
67+
* Gets the number of Unicode codepoints in `str`, not counting unpaired surrogates.
5368
*/
5469
bindingset[str]
55-
int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
70+
int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }
5671

5772
/**
5873
* Gets the ASCII code for `char`.

0 commit comments

Comments
 (0)