Skip to content

Commit 1eb3104

Browse files
HTML 5 namespace prefix normalization for HTML and XHTML serializers (#2588)
* fix 'canonical' handling for DBNode element namespace
* add HTML 5 namespace normalization to HTML and XHTML serializers
1 parent 22701a2 commit 1eb3104

File tree

7 files changed: 118 additions and 15 deletions

basex-core/src/main/java/org/basex/io/serial/HTMLSerializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* @author BaseX Team, BSD License
1919
* @author Christian Gruen
2020
*/
21-
final class HTMLSerializer extends MarkupSerializer {
21+
final class HTMLSerializer extends XhtmlHtmlSerializer {
2222
/** (X)HTML: elements with an empty content model. */
2323
static final TokenSet EMPTIES = new TokenSet("area", "base", "basefont", "br", "col", "embed",
2424
"frame", "hr", "img", "input", "isindex", "link", "meta", "param");

basex-core/src/main/java/org/basex/io/serial/Serializer.java

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ public abstract class Serializer implements Closeable {
6262
* @param uri URI (can be {@code null})
6363
* @param value attribute value (can be {@code null})
6464
*/
65-
private record Att(byte[] name, byte[] value, byte[] uri) { }
66-
/** Attribute/namespace collector. */
67-
private final ArrayList<Att> attributes = new ArrayList<>();
65+
protected record Att(byte[] name, byte[] value, byte[] uri) { }
66+
/** Attribute collector. */
67+
protected final ArrayList<Att> attributes = new ArrayList<>();
68+
/** Namespace collector. */
69+
protected final ArrayList<Att> namespaces = new ArrayList<>();
6870

6971
/** Static context. */
7072
protected StaticContext sc;
@@ -270,6 +272,15 @@ protected boolean skipElement(final XNode node) {
270272
protected void attribute(final byte[] name, final byte[] value, final boolean standalone)
271273
throws IOException { }
272274

275+
/**
276+
* Returns the element name (may be overridden to modify names).
277+
* @param name original name
278+
* @return modified name
279+
*/
280+
protected QNm elementName(final QNm name) {
281+
return name;
282+
}
283+
273284
/**
274285
* Starts an element.
275286
* @param name element name
@@ -278,6 +289,13 @@ protected void attribute(final byte[] name, final byte[] value, final boolean st
278289
@SuppressWarnings("unused")
279290
protected void startOpen(final QNm name) throws IOException { }
280291

292+
/**
293+
* Adjusts namespaces before serializing namespaces and attributes.
294+
* @param name original element name
295+
*/
296+
@SuppressWarnings("unused")
297+
protected void adjustNamespaces(final QNm name) { }
298+
281299
/**
282300
* Finishes an opening element node.
283301
* @throws IOException I/O exception
@@ -358,8 +376,8 @@ private void addAttribute(final byte[] name, final byte[] value, final byte[] ur
358376
* @param prefix prefix
359377
* @param uri URI
360378
*/
361-
private void addNamespace(final byte[] prefix, final byte[] uri) {
362-
attributes.add(new Att(prefix, null, uri));
379+
protected void addNamespace(final byte[] prefix, final byte[] uri) {
380+
namespaces.add(new Att(prefix, null, uri));
363381
}
364382

365383
/**
@@ -384,10 +402,10 @@ private void emitAttributes() throws IOException {
384402
*/
385403
private void emitNamespaces() throws IOException {
386404
if(canonical) {
387-
attributes.sort((a, b) -> compare(a.name, b.name));
405+
namespaces.sort((a, b) -> compare(a.name, b.name));
388406
}
389-
for(final Att att : attributes) namespace(att.name, att.uri, false);
390-
attributes.clear();
407+
for(final Att att : namespaces) namespace(att.name, att.uri, false);
408+
namespaces.clear();
391409
}
392410

393411
/**
@@ -454,9 +472,10 @@ private void node(final DBNode node) throws IOException {
454472
nsUri = data.nspaces.uri(data.uriId(pre, kind));
455473
}
456474
// open element, serialize namespace declaration if it's new
457-
openElement(new QNm(name, nsUri));
475+
final QNm originalName = new QNm(name, nsUri);
476+
openElement(elementName(originalName));
458477
if(nsUri == null) nsUri = EMPTY;
459-
namespace(nsPrefix, nsUri, false);
478+
addNamespace(nsPrefix, nsUri);
460479

461480
// database contains namespaces: add declarations
462481
if(nsExist) {
@@ -476,7 +495,6 @@ private void node(final DBNode node) throws IOException {
476495
} while(p >= 0 && data.kind(p) == Data.ELEM);
477496

478497
// reset namespace cache
479-
emitNamespaces();
480498
nsSet.clear();
481499
}
482500

@@ -488,6 +506,8 @@ private void node(final DBNode node) throws IOException {
488506
addAttribute(n, v, canonical ? data.nspaces.uri(data.uriId(pre, Data.ATTR)) : null);
489507
if(eq(n, XML_SPACE) && indent) indent = !eq(v, PRESERVE);
490508
}
509+
adjustNamespaces(originalName);
510+
emitNamespaces();
491511
emitAttributes();
492512
parentStack.push(par);
493513
}
@@ -531,7 +551,8 @@ private void node(final FNode node) throws IOException {
531551
closeDoc();
532552
} else if(skip == 0 || !skipElement(node)) {
533553
// serialize elements (code will never be called for attributes)
534-
final QNm name = node.qname();
554+
final QNm originalName = node.qname();
555+
final QNm name = elementName(originalName);
535556
openElement(name);
536557

537558
// serialize declared namespaces
@@ -540,7 +561,6 @@ private void node(final FNode node) throws IOException {
540561
for(int p = 0; p < ps; p++) addNamespace(nsp.name(p), nsp.value(p));
541562
// add new or updated namespace
542563
addNamespace(name.prefix(), name.uri());
543-
emitNamespaces();
544564

545565
// serialize attributes
546566
final boolean i = indent;
@@ -550,6 +570,8 @@ private void node(final FNode node) throws IOException {
550570
addAttribute(n, v, canonical ? nsUri(prefix(n)) : null);
551571
if(eq(n, XML_SPACE) && indent) indent = !eq(v, PRESERVE);
552572
}
573+
adjustNamespaces(originalName);
574+
emitNamespaces();
553575
emitAttributes();
554576

555577
// serialize children

basex-core/src/main/java/org/basex/io/serial/XHTMLSerializer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* @author BaseX Team, BSD License
1616
* @author Christian Gruen
1717
*/
18-
final class XHTMLSerializer extends MarkupSerializer {
18+
final class XHTMLSerializer extends XhtmlHtmlSerializer {
1919
/**
2020
* Constructor, specifying serialization options.
2121
* @param os output stream
basex-core/src/main/java/org/basex/io/serial/XhtmlHtmlSerializer.java

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package org.basex.io.serial;
2+
3+
import static org.basex.util.Token.*;
4+
import static org.basex.util.XMLToken.*;
5+
6+
import java.io.*;
7+
8+
import org.basex.query.value.item.*;
9+
10+
/**
11+
* This class contains the common behavior of XHTML and HTML serializers.
12+
*
13+
* @author BaseX Team, BSD License
14+
* @author Gunther Rademacher
15+
*/
16+
abstract class XhtmlHtmlSerializer extends MarkupSerializer {
17+
/**
18+
* Constructor.
19+
* @param os output stream
20+
* @param sopts serialization parameters
21+
* @param versions supported versions
22+
* @throws IOException I/O exception
23+
*/
24+
protected XhtmlHtmlSerializer(final OutputStream os, final SerializerOptions sopts,
25+
final String... versions) throws IOException {
26+
super(os, sopts, versions);
27+
}
28+
29+
@Override
30+
protected final QNm elementName(final QNm name) {
31+
return mustRewrite(name) ? new QNm(name.local(), name.uri()) : name;
32+
}
33+
34+
@Override
35+
protected final void adjustNamespaces(final QNm name) {
36+
if(!mustRewrite(name)) return;
37+
final byte[] prefix = name.prefix();
38+
boolean unused = true;
39+
for(final Att att : attributes) {
40+
final byte[] an = att.name();
41+
if(startsWith(an, prefix) && indexOf(an, ':') == prefix.length) {
42+
unused = false;
43+
break;
44+
}
45+
}
46+
for(int i = namespaces.size() - 1; i >= 0; --i) {
47+
final byte[] nsName = namespaces.get(i).name();
48+
if(nsName.length == 0 || unused && eq(nsName, prefix)) namespaces.remove(i);
49+
}
50+
addNamespace(EMPTY, name.uri());
51+
}
52+
53+
/**
54+
* Checks if the namespace prefix of an element name must be rewritten.
55+
* @param name name to be checked
56+
* @return result of check
57+
*/
58+
private boolean mustRewrite(final QNm name) {
59+
return html5 && name.hasPrefix() && eq(name.uri(), XHTML_URI, MATHML_URI, SVG_URI);
60+
}
61+
}

basex-core/src/main/java/org/basex/util/XMLToken.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ public final class XMLToken {
113113
public static final byte[] XHTML_URI = token("http://www.w3.org/1999/xhtml");
114114
/** XML namespace. */
115115
public static final byte[] XML_URI = token("http://www.w3.org/XML/1998/namespace");
116+
/** MathML namespace. */
117+
public static final byte[] MATHML_URI = token("http://www.w3.org/1998/Math/MathML");
118+
/** SVG namespace. */
119+
public static final byte[] SVG_URI = token("http://www.w3.org/2000/svg");
116120

117121
/** Index for all HTML entities (lazy initialization). */
118122
private static TokenObjectMap<byte[]> entities;

basex-core/src/test/java/org/basex/query/SerializerTest.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,19 @@ public final class SerializerTest extends SandboxTest {
175175
}
176176
query(option + "<a>&#x90;</a>", "<a>&#x90;</a>");
177177
query(option + "<html/>", "<!DOCTYPE HTML><html></html>");
178+
query("declare namespace xhtml = 'http://www.w3.org/1999/xhtml';\n"
179+
+ option + INDENT_ATTRIBUTES.arg("yes") + INDENT.arg("yes")
180+
+ "<html>\n"
181+
+ " <xhtml:body xhtml:test='x' xmlns='http://www.w3.org/2000/svg'>\n"
182+
+ " <svg/>\n"
183+
+ " </xhtml:body>\n"
184+
+ "</html>",
185+
"<!DOCTYPE HTML>\n"
186+
+ "<html>\n"
187+
+ " <body xmlns:xhtml=\"http://www.w3.org/1999/xhtml\"\n"
188+
+ " xmlns=\"http://www.w3.org/1999/xhtml\"\n"
189+
+ " xhtml:test=\"x\"><svg xmlns=\"http://www.w3.org/2000/svg\"/></body>\n"
190+
+ "</html>");
178191
}
179192

180193
/** Test: method=text. */

basex-core/src/test/java/org/basex/query/func/FnModuleTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3022,6 +3022,9 @@ public final class FnModuleTest extends SandboxTest {
30223022
final String canonicalXml = " { 'method': 'xml', 'canonical': true() }";
30233023
query(func.args(" <a xmlns:p='urn:test:x' p:z='1' a='2'/>", canonicalXml),
30243024
"<a xmlns:p=\"urn:test:x\" a=\"2\" p:z=\"1\"></a>");
3025+
query(func.args(" parse-xml(`<q:a xmlns:p='urn:test:x' xmlns:q='urn:test:y' p:z='1' a='2'/>`)",
3026+
canonicalXml),
3027+
"<q:a xmlns:p=\"urn:test:x\" xmlns:q=\"urn:test:y\" a=\"2\" p:z=\"1\"></q:a>");
30253028

30263029
final String canonicalJson = " {'method': 'json', 'canonical': true()}";
30273030
query(func.args(" [0x7fff_ffff_ffff_ffff]", canonicalJson), "[9223372036854776000]");

0 commit comments

Comments
 (0)