Skip to content

Commit 7693192

Browse files
committed
Tokenize closing tags of Data elements as tagName, not LetterSequence
Fixes #2332
1 parent 4533854 commit 7693192

File tree

4 files changed

+34
-6
lines changed

4 files changed

+34
-6
lines changed

CHANGES.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
* Added `Connection#readFully()` as a replacement for `Connection#bufferUp()` with an explicit IOException. Similarly, added `Connection#readBody()` over `Connection#body()`. Deprecated `Connection#bufferUp()`. [#2327](https://github.com/jhy/jsoup/pull/2327)
1717

1818
### Bug Fixes
19-
* The contents of a `script` in a `svg` foreign context should be parsed as script data, not text. [#2320](https://github.com/jhy/jsoup/issues/2320)
19+
* The contents of a `script` in a `svg` foreign context should be parsed as script data, not text. [#2320](https://github.com/jhy/jsoup/issues/2320)
2020
* `Tag#isFormSubmittable()` was updating the Tag's options. [#2323](https://github.com/jhy/jsoup/issues/2323)
2121
* The HTML pretty-printer would incorrectly trim whitespace when text followed an inline element in a block element. [#2325](https://github.com/jhy/jsoup/issues/2325)
22+
* Custom tags with hyphens or other non-letter characters in their names now work correctly as Data or RcData tags. Their closing tags are now tokenized properly. [#2332](https://github.com/jhy/jsoup/issues/2332)
2223

2324
## 1.20.1 (2025-04-29)
2425

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -624,9 +624,9 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
624624
default:
625625
Tag tag = tb.tagFor(startTag);
626626
TokeniserState textState = tag.textState();
627-
// custom rcdata or rawtext (if we were in head, will have auto-transitioned here)
628-
if (textState != null) HandleTextState(startTag, tb, textState);
629-
else if (!tag.isKnownTag()) { // no other special rules for custom tags
627+
if (textState != null) { // custom rcdata or rawtext (if we were in head, will have auto-transitioned here)
628+
HandleTextState(startTag, tb, textState);
629+
} else if (!tag.isKnownTag()) { // no other special rules for custom tags
630630
tb.insertElementFor(startTag);
631631
} else if (inSorted(name, Constants.InBodyStartPClosers)) {
632632
if (tb.inButtonScope("p")) tb.processEndTag("p");

src/main/java/org/jsoup/parser/TokeniserState.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ enum TokeniserState {
216216
RCDATAEndTagName {
217217
@Override void read(Tokeniser t, CharacterReader r) {
218218
if (r.matchesAsciiAlpha()) {
219-
String name = r.consumeLetterSequence();
219+
String name = r.consumeTagName();
220220
t.tagPending.appendTagName(name);
221221
t.dataBuffer.append(name);
222222
return;
@@ -1673,7 +1673,7 @@ else if (r.matches('>')) {
16731673
*/
16741674
private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
16751675
if (r.matchesAsciiAlpha()) {
1676-
String name = r.consumeLetterSequence();
1676+
String name = r.consumeTagName();
16771677
t.tagPending.appendTagName(name);
16781678
t.dataBuffer.append(name);
16791679
return;

src/test/java/org/jsoup/parser/TokeniserStateTest.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,4 +270,31 @@ public void attributeValUnquoted() {
270270
doc = Jsoup.parse("<p foo=");
271271
assertEquals("<p foo></p>", doc.body().html());
272272
}
273+
274+
@Test void customDataTagWithHyphen() {
275+
// https://github.com/jhy/jsoup/issues/2332
276+
277+
TagSet tagSet = TagSet.Html();
278+
tagSet.valueOf("custom-data", Parser.NamespaceHtml).set(Tag.Data);
279+
tagSet.valueOf("custom-rcdata", Parser.NamespaceHtml).set(Tag.RcData);
280+
281+
String html = "<body><custom-data>a < > b</custom-data><p>One</p><custom-rcdata>a < > b</custom-rcdata><p>Two</p>";
282+
Document doc = Jsoup.parse(html, Parser.htmlParser().tagSet(tagSet));
283+
assertEquals(
284+
"<custom-data>a < > b</custom-data><p>One</p><custom-rcdata>a &lt; &gt; b</custom-rcdata><p>Two</p>",
285+
TextUtil.normalizeSpaces(doc.body().html()));
286+
}
287+
288+
@Test void customDataTagWithHyphenXml() {
289+
String xml = "<custom-data>a < > b</custom-data><p>One</p><custom-rcdata>a < > b</custom-rcdata><p>Two</p>";
290+
Parser parser = Parser.xmlParser();
291+
TagSet tagSet = parser.tagSet();
292+
tagSet.valueOf("custom-data", Parser.NamespaceXml).set(Tag.Data);
293+
tagSet.valueOf("custom-rcdata", Parser.NamespaceXml).set(Tag.RcData);
294+
295+
Document doc = Jsoup.parse(xml, parser);
296+
assertEquals(
297+
"<custom-data><![CDATA[a < > b]]></custom-data><p>One</p><custom-rcdata>a &lt; &gt; b</custom-rcdata><p>Two</p>",
298+
TextUtil.normalizeSpaces(doc.html()));
299+
}
273300
}

0 commit comments

Comments
 (0)