Skip to content

Commit 61923e0

Browse files
committed
Normalize XML tag names
Fixes #1496
1 parent 062d190 commit 61923e0

File tree

4 files changed

+27
-5
lines changed

4 files changed

+27
-5
lines changed

CHANGES.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@
55
### Changes
66

77
### Improvements
8-
* `Element.cssSelector()` will prefer to return shorter selectors by using ancestor IDs when available and unique. E.g. `#id > div > p` instead of `html > body > div > div > p` [#2283](https://github.com/jhy/jsoup/pull/2283).
8+
9+
* `Element.cssSelector()` will prefer to return shorter selectors by using ancestor IDs when available and unique. E.g.
10+
`#id > div > p` instead of `html > body > div > div > p` [#2283](https://github.com/jhy/jsoup/pull/2283).
11+
12+
### Bug Fixes
13+
14+
* When serializing a Document to XML, element names with characters that are invalid in XML are now
15+
normalized. [#1496](https://github.com/jhy/jsoup/issues/1496).
916

1017
## 1.19.1 (2025-03-04)
1118

src/main/java/org/jsoup/internal/Normalizer.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
package org.jsoup.internal;
22

3+
import org.jsoup.nodes.Attribute;
4+
import org.jsoup.nodes.Document;
5+
import org.jspecify.annotations.Nullable;
6+
37
import java.util.Locale;
48

59
/**
@@ -23,8 +27,7 @@ public static String normalize(final String input, boolean isStringLiteral) {
2327
}
2428

2529
/** Minimal helper to get an otherwise OK HTML name like "foo<bar" to "foo_bar". */
26-
public static String xmlSafeTagName(final String tagname) {
27-
// todo - if required we could make a fuller version of this as in Attribute.getValidKey(syntax) in Element. for now, just minimal based on what HtmlTreeBuilder produces
28-
return tagname.replace('<', '_');
30+
@Nullable public static String xmlSafeTagName(final String tagname) {
31+
return Attribute.getValidKey(tagname, Document.OutputSettings.Syntax.xml); // Reuses the Attribute key normalization, which is the same for XML tag names
2932
}
3033
}

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1838,7 +1838,7 @@ void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) thr
18381838
}
18391839

18401840
/* If XML syntax, normalizes characters that are invalid in XML tag names (such as <) to _. */
1841-
private String safeTagName(Document.OutputSettings.Syntax syntax) {
1841+
@Nullable private String safeTagName(Document.OutputSettings.Syntax syntax) {
18421842
return syntax == xml ? Normalizer.xmlSafeTagName(tagName()) : tagName();
18431843
}
18441844

src/test/java/org/jsoup/nodes/ElementTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3035,4 +3035,16 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
30353035
Iterator<Element> iterator = div.iterator();
30363036
assertIterates(iterator, expect);
30373037
}
3038+
3039+
@Test void htmlToXmlNormalizes() {
3040+
// https://github.com/jhy/jsoup/issues/1496
3041+
String in = "<p\u226F\u0322>One</p\u226F\u0322>";
3042+
Document doc = Jsoup.parse(in);
3043+
doc.outputSettings().prettyPrint(false);
3044+
String html = doc.body().html();
3045+
doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
3046+
String xml = doc.body().html();
3047+
assertEquals("<p≯̢>One</p≯̢>", html);
3048+
assertEquals("<p_>One</p_>", xml);
3049+
}
30383050
}

0 commit comments

Comments
 (0)