Skip to content

Commit 6bffda3

Browse files
cketti and jhy authored
Change TokenQueue.consumeCssIdentifier() to support hex digit unescaping (#2297)
To meet the https://www.w3.org/TR/css-syntax-3/#consume-name spec.

---------

Co-authored-by: Jonathan Hedley <jonathan@hedley.net>
1 parent eb95db7 commit 6bffda3

File tree

3 files changed

+290
-13
lines changed

3 files changed

+290
-13
lines changed

src/main/java/org/jsoup/parser/TokenQueue.java

Lines changed: 153 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@
55

66
/**
77
* A character queue with parsing helpers.
8-
*
9-
* @author Jonathan Hedley
108
*/
119
public class TokenQueue {
1210
private String queue;
1311
private int pos = 0;
1412

15-
private static final char ESC = '\\'; // escape char for chomp balanced.
13+
private static final char Esc = '\\'; // escape char for chomp balanced.
14+
private static final char Hyphen_Minus = '-';
15+
private static final char Unicode_Null = '\u0000';
16+
private static final char Replacement = '\uFFFD';
1617

1718
/**
1819
Create a new TokenQueue.
@@ -116,6 +117,10 @@ public void advance() {
116117
if (!isEmpty()) pos++;
117118
}
118119

120+
private char current() {
121+
return queue.charAt(pos);
122+
}
123+
119124
/**
120125
* Consume one character off queue.
121126
* @return first character on queue.
@@ -238,7 +243,7 @@ public String chompBalanced(char open, char close) {
238243
do {
239244
if (isEmpty()) break;
240245
char c = consume();
241-
if (last != ESC) {
246+
if (last != Esc) {
242247
if (c == '\'' && c != open && !inDoubleQuote)
243248
inSingleQuote = !inSingleQuote;
244249
else if (c == '"' && c != open && !inSingleQuote)
@@ -281,8 +286,8 @@ public static String unescape(String in) {
281286
StringBuilder out = StringUtil.borrowBuilder();
282287
char last = 0;
283288
for (char c : in.toCharArray()) {
284-
if (c == ESC) {
285-
if (last == ESC) {
289+
if (c == Esc) {
290+
if (last == Esc) {
286291
out.append(c);
287292
c = 0;
288293
}
@@ -305,7 +310,7 @@ public static String escapeCssIdentifier(String in) {
305310
if (q.matchesCssIdentifier(CssIdentifierChars)) {
306311
out.append(q.consume());
307312
} else {
308-
out.append(ESC).append(q.consume());
313+
out.append(Esc).append(q.consume());
309314
}
310315
}
311316
return StringUtil.releaseBuilder(out);
@@ -335,7 +340,6 @@ public String consumeWord() {
335340
return queue.substring(start, pos);
336341
}
337342

338-
339343
/**
340344
* Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
341345
*
@@ -347,21 +351,157 @@ public String consumeElementSelector() {
347351
private static final String[] ElementSelectorChars = {"*", "|", "_", "-"};
348352

349353
/**
350-
Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
351-
http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
352-
@return identifier
354+
Consume a CSS identifier (ID or class) off the queue.
355+
<p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
356+
of {@code \31}.</p>
357+
358+
@return The unescaped identifier.
359+
@throws IllegalArgumentException if an invalid escape sequence was found. Afterward, the state of the TokenQueue
360+
is undefined.
361+
@see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a>
362+
@see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a>
353363
*/
354364
public String consumeCssIdentifier() {
355-
return consumeEscapedCssIdentifier(CssIdentifierChars);
365+
if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found");
366+
int start = pos;
367+
368+
// Fast path for CSS identifiers that don't contain escape sequences.
369+
while (!isEmpty()) {
370+
char c = current();
371+
if (isIdent(c)) {
372+
advance();
373+
} else if (c == Esc || c == Unicode_Null) {
374+
// Exit fast path when an escape sequence or U+0000 is found.
375+
break;
376+
} else {
377+
// End of identifier reached without encountering a sequence that requires special handling. The CSS
378+
// identifier is a substring of the input.
379+
return queue.substring(start, pos);
380+
}
381+
}
382+
383+
// An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier.
384+
StringBuilder out = StringUtil.borrowBuilder();
385+
386+
if (start < pos) {
387+
// Copy the CSS identifier up to the first escape sequence.
388+
out.append(queue, start, pos);
389+
}
390+
391+
while (!isEmpty()) {
392+
char c = current();
393+
if (isIdent(c)) {
394+
out.append(consume());
395+
} else if (c == Unicode_Null) {
396+
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
397+
advance();
398+
out.append(Replacement);
399+
} else if (c == Esc) {
400+
advance();
401+
if (!isEmpty() && isNewline(current())) {
402+
// Not a valid escape sequence. This is treated as the end of the CSS identifier.
403+
pos--;
404+
break;
405+
} else {
406+
consumeCssEscapeSequenceInto(out);
407+
}
408+
} else {
409+
break;
410+
}
411+
}
412+
return StringUtil.releaseBuilder(out);
356413
}
414+
415+
private void consumeCssEscapeSequenceInto(StringBuilder out) {
416+
if (isEmpty()) {
417+
out.append(Replacement);
418+
return;
419+
}
420+
int start = pos;
421+
char firstEscaped = consume();
422+
if (!isHexDigit(firstEscaped)) {
423+
out.append(firstEscaped);
424+
} else {
425+
for (int i = 0; i < 5 && !isEmpty(); i++) {
426+
char escapedChar = current();
427+
if (isHexDigit(escapedChar)) advance();
428+
else break;
429+
}
430+
String hexString = queue.substring(start, pos);
431+
int codePoint;
432+
try {
433+
codePoint = Integer.parseInt(hexString, 16);
434+
} catch (NumberFormatException e) { // Won't happen as we confirmed hex above; just mollifying scanners
435+
throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e);
436+
}
437+
if (isValidCodePoint(codePoint)) {
438+
out.appendCodePoint(codePoint);
439+
} else {
440+
out.append(Replacement);
441+
}
442+
443+
if (!isEmpty()) {
444+
char c = current();
445+
if (c == '\r') {
446+
// Since there's currently no input preprocessing, check for CRLF here.
447+
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
448+
advance();
449+
if (!isEmpty() && current() == '\n') advance();
450+
} else if (c == ' ' || c == '\t' || isNewline(c)) {
451+
advance();
452+
}
453+
}
454+
}
455+
}
456+
457+
// statics below specifically for CSS identifiers:
458+
459+
private static boolean isLetter(char c) {
460+
return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
461+
}
462+
463+
private static boolean isDigit(char c) {
464+
return c >= '0' && c <= '9';
465+
}
466+
467+
private static boolean isHexDigit(char c) {
468+
return isDigit(c) || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F';
469+
}
470+
471+
// https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
472+
private static boolean isNonAscii(char c) {
473+
return c >= '\u0080';
474+
}
475+
476+
// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
477+
private static boolean isIdentStart(char c) {
478+
return c == '_' || isLetter(c) || isNonAscii(c);
479+
}
480+
481+
// https://www.w3.org/TR/css-syntax-3/#ident-code-point
482+
private static boolean isIdent(char c) {
483+
return c == Hyphen_Minus || isDigit(c) || isIdentStart(c);
484+
}
485+
486+
// https://www.w3.org/TR/css-syntax-3/#newline
487+
// Note: currently there's no preprocessing happening.
488+
private static boolean isNewline(char c) {
489+
return c == '\n' || c == '\r' || c == '\f';
490+
}
491+
492+
// https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
493+
private static boolean isValidCodePoint(int codePoint) {
494+
return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint);
495+
}
496+
357497
private static final String[] CssIdentifierChars = {"-", "_"};
358498

359499

360500
private String consumeEscapedCssIdentifier(String... matches) {
361501
int start = pos;
362502
boolean escaped = false;
363503
while (!isEmpty()) {
364-
if (queue.charAt(pos) == ESC && remainingLength() >1 ) {
504+
if (queue.charAt(pos) == Esc && remainingLength() >1 ) {
365505
escaped = true;
366506
pos+=2; // skip the escape and the escaped
367507
} else if (matchesCssIdentifier(matches)) {

src/test/java/org/jsoup/parser/TokenQueueTest.java

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@
33
import org.jsoup.Jsoup;
44
import org.jsoup.nodes.Document;
55
import org.junit.jupiter.api.Test;
6+
import org.junit.jupiter.params.ParameterizedTest;
7+
import org.junit.jupiter.params.provider.Arguments;
8+
import org.junit.jupiter.params.provider.MethodSource;
69

710
import java.util.regex.Pattern;
11+
import java.util.stream.Stream;
812

913
import static org.junit.jupiter.api.Assertions.*;
1014

@@ -139,4 +143,126 @@ public void testQuotedPattern() {
139143
assertEquals("i\\d", q.consumeCssIdentifier());
140144
assertTrue(q.isEmpty());
141145
}
146+
147+
@ParameterizedTest
148+
@MethodSource("cssIdentifiers")
149+
@MethodSource("cssAdditionalIdentifiers")
150+
void consumeCssIdentifier_WebPlatformTests(String expected, String cssIdentifier) {
151+
assertParsedCssIdentifierEquals(expected, cssIdentifier);
152+
}
153+
154+
// Argument provider for consumeCssIdentifier_WebPlatformTests. Each entry is (expected decoded identifier,
// raw CSS input handed to the TokenQueue).
private static Stream<Arguments> cssIdentifiers() {
    return Stream.of(
        // https://github.com/web-platform-tests/wpt/blob/36036fb5212a3fc15fc5750cecb1923ba4071668/dom/nodes/ParentNode-querySelector-escapes.html
        // - escape hex digit
        Arguments.of("0nextIsWhiteSpace", "\\30 nextIsWhiteSpace"),
        Arguments.of("0nextIsNotHexLetters", "\\30nextIsNotHexLetters"),
        Arguments.of("0connectHexMoreThan6Hex", "\\000030connectHexMoreThan6Hex"),
        Arguments.of("0spaceMoreThan6Hex", "\\000030 spaceMoreThan6Hex"),

        // - hex digit special replacement (each of these decodes to U+FFFD)
        // 1. zero points
        Arguments.of("zero\uFFFD", "zero\\0"),
        Arguments.of("zero\uFFFD", "zero\\000000"),
        // 2. surrogate points
        Arguments.of("\uFFFDsurrogateFirst", "\\d83d surrogateFirst"),
        Arguments.of("surrogateSecond\uFFFd", "surrogateSecond\\dd11"),
        Arguments.of("surrogatePair\uFFFD\uFFFD", "surrogatePair\\d83d\\dd11"),
        // 3. out of range points
        Arguments.of("outOfRange\uFFFD", "outOfRange\\110000"),
        Arguments.of("outOfRange\uFFFD", "outOfRange\\110030"),
        Arguments.of("outOfRange\uFFFD", "outOfRange\\555555"),
        Arguments.of("outOfRange\uFFFD", "outOfRange\\ffffff"),

        // - escape anything else (a non-hex character escapes to itself)
        Arguments.of(".comma", "\\.comma"),
        Arguments.of("-minus", "\\-minus"),
        Arguments.of("g", "\\g"),

        // non edge cases
        Arguments.of("aBMPRegular", "\\61 BMPRegular"),
        Arguments.of("\uD83D\uDD11nonBMP", "\\1f511 nonBMP"),
        Arguments.of("00continueEscapes", "\\30\\30 continueEscapes"),
        Arguments.of("00continueEscapes", "\\30 \\30 continueEscapes"),
        Arguments.of("continueEscapes00", "continueEscapes\\30 \\30 "),
        Arguments.of("continueEscapes00", "continueEscapes\\30 \\30"),
        Arguments.of("continueEscapes00", "continueEscapes\\30\\30 "),
        Arguments.of("continueEscapes00", "continueEscapes\\30\\30"),

        // ident test cases taken from the CSS tests in the Chromium source: https://goo.gl/3Cxdov
        Arguments.of("hello", "hel\\6Co"),
        Arguments.of("&B", "\\26 B"),
        Arguments.of("hello", "hel\\6C o"),
        Arguments.of("spaces", "spac\\65\r\ns"),
        Arguments.of("spaces", "sp\\61\tc\\65\fs"),
        Arguments.of("test\uD799", "test\\D799"),
        Arguments.of("\uE000", "\\E000"),
        Arguments.of("test", "te\\s\\t"),
        Arguments.of("spaces in\tident", "spaces\\ in\\\tident"),
        Arguments.of(".,:!", "\\.\\,\\:\\!"),
        Arguments.of("null\uFFFD", "null\\0"),
        Arguments.of("null\uFFFD", "null\\0000"),
        Arguments.of("large\uFFFD", "large\\110000"),
        Arguments.of("large\uFFFD", "large\\23456a"),
        Arguments.of("surrogate\uFFFD", "surrogate\\D800"),
        Arguments.of("surrogate\uFFFD", "surrogate\\0DBAC"),
        Arguments.of("\uFFFDsurrogate", "\\00DFFFsurrogate"),
        Arguments.of("\uDBFF\uDFFF", "\\10fFfF"),
        Arguments.of("\uDBFF\uDFFF0", "\\10fFfF0"),
        Arguments.of("\uDBC0\uDC0000", "\\10000000"),
        Arguments.of("eof\uFFFD", "eof\\"),

        // identifiers without escapes, plus raw non-ASCII and raw U+0000 input
        Arguments.of("simple-ident", "simple-ident"),
        Arguments.of("testing123", "testing123"),
        Arguments.of("_underscore", "_underscore"),
        Arguments.of("-text", "-text"),
        Arguments.of("-m", "-\\6d"),
        Arguments.of("--abc", "--abc"),
        Arguments.of("--", "--"),
        Arguments.of("--11", "--11"),
        Arguments.of("---", "---"),
        Arguments.of("\u2003", "\u2003"),
        Arguments.of("\u00A0", "\u00A0"),
        Arguments.of("\u1234", "\u1234"),
        Arguments.of("\uD808\uDF45", "\uD808\uDF45"),
        Arguments.of("\uFFFD", "\u0000"),
        Arguments.of("ab\uFFFDc", "ab\u0000c")
    );
}
232+
233+
private static Stream<Arguments> cssAdditionalIdentifiers() {
234+
return Stream.of(
235+
Arguments.of("1st", "\\31\r\nst"),
236+
Arguments.of("1", "\\31\r"),
237+
Arguments.of("1a", "\\31\ra"),
238+
Arguments.of("1", "\\031"),
239+
Arguments.of("1", "\\0031"),
240+
Arguments.of("1", "\\00031"),
241+
Arguments.of("1", "\\000031"),
242+
Arguments.of("1", "\\000031"),
243+
Arguments.of("a", "a\\\nb")
244+
);
245+
}
246+
247+
@Test void consumeCssIdentifierWithEmptyInput() {
248+
TokenQueue emptyQueue = new TokenQueue("");
249+
Exception exception = assertThrows(IllegalArgumentException.class, emptyQueue::consumeCssIdentifier);
250+
assertEquals("CSS identifier expected, but end of input found", exception.getMessage());
251+
}
252+
253+
// Some of jsoup's tests depend on this behavior
254+
@Test public void consumeCssIdentifier_invalidButSupportedForBackwardsCompatibility() {
255+
assertParsedCssIdentifierEquals("1", "1");
256+
assertParsedCssIdentifierEquals("-", "-");
257+
assertParsedCssIdentifierEquals("-1", "-1");
258+
}
259+
260+
private static String parseCssIdentifier(String text) {
261+
TokenQueue q = new TokenQueue(text);
262+
return q.consumeCssIdentifier();
263+
}
264+
265+
private void assertParsedCssIdentifierEquals(String expected, String cssIdentifier) {
266+
assertEquals(expected, parseCssIdentifier(cssIdentifier));
267+
}
142268
}

src/test/java/org/jsoup/select/SelectorTest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,4 +1450,15 @@ public void testAncestorChain() {
14501450
assertTrue(b_needs_a.threadMemo.get().isEmpty());
14511451
assertTrue(c_needs_b.threadMemo.get().isEmpty());
14521452
}
1453+
1454+
@Test void hexDigitUnescape() {
1455+
// tests the select component of https://github.com/jhy/jsoup/pull/2297, with per-spec escapes
1456+
// literal is: #\30 \%\ Platform\ Image
1457+
String html = "<img id='0% Platform Image'>";
1458+
String q = "#\\30 \\%\\ Platform\\ Image";
1459+
1460+
Document doc = Jsoup.parse(html);
1461+
Element img = doc.expectFirst(q);
1462+
assertEquals("img", img.tagName());
1463+
}
14531464
}

0 commit comments

Comments
 (0)