Skip to content

Commit c2deef3

Browse files
basiljoehni
authored and committed
Allow PrettyPrintWriter to replace invalid XML characters when not running in quirks mode. Closes #335.
1 parent dbe845d commit c2deef3

File tree

3 files changed

+148
-18
lines changed

3 files changed

+148
-18
lines changed

xstream-distribution/src/content/changes.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ <h1 id="upcoming-1.4.x">Upcoming 1.4.x maintenance release</h1>
111111
<h2>Minor changes</h2>
112112

113113
<ul>
114+
<li>GHPR:#335: Allow PrettyPrintWriter to replace invalid XML characters when not running in quirks mode (by Basil Crow).</li>
114115
<li>GHPR:#331, GHI:#326: Fix handling of empty java.util.concurrent.atomic.AtomicReference (by Alex Blekhman of Atlassian).</li>
115116
<li>GHPR:#334: Fix remaining buffer size calculation in QuickWriter (by Higuchi Yuta).</li>
116117
<li>GHI:#342: Optimize internal handling of children in DomReader avoiding O(n²) access times for siblings (by Shiang-Yun Yang).</li>

xstream/src/java/com/thoughtworks/xstream/io/xml/PrettyPrintWriter.java

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) 2004, 2005, 2006 Joe Walnes.
3-
* Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015 XStream Committers.
3+
* Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015, 2023 XStream Committers.
44
* All rights reserved.
55
*
66
* The software in this package is published under the terms of the BSD
@@ -42,6 +42,8 @@
4242
* href="http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets">1.1</a>. If a character is not supported, a
4343
* {@link StreamException} is thrown. Select a proper parser implementation that respects the version in the XML header
4444
* (the Xpp3 parser will also read character entities of normally invalid characters).
45+
* You may also switch to XML_1_0_REPLACEMENT or XML_1_1_REPLACEMENT mode, which will replace the invalid characters
46+
* with a U+FFFD replacement character.
4547
* </p>
4648
*
4749
* @author Joe Walnes
@@ -52,6 +54,8 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
5254
public static int XML_QUIRKS = -1;
5355
public static int XML_1_0 = 0;
5456
public static int XML_1_1 = 1;
57+
public static int XML_1_0_REPLACEMENT = 2;
58+
public static int XML_1_1_REPLACEMENT = 3;
5559

5660
private final QuickWriter writer;
5761
private final FastStack<String> elementStack = new FastStack<>(16);
@@ -71,6 +75,7 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
7175
private static final char[] QUOT = "&quot;".toCharArray();
7276
private static final char[] APOS = "&apos;".toCharArray();
7377
private static final char[] CLOSE = "</".toCharArray();
78+
private static final char[] REPLACEMENT = "&#xfffd;".toCharArray();
7479

7580
/**
7681
* @since 1.4
@@ -80,8 +85,8 @@ public PrettyPrintWriter(final Writer writer, final int mode, final char[] lineI
8085
this.writer = new QuickWriter(writer);
8186
this.lineIndenter = lineIndenter;
8287
this.mode = mode;
83-
if (mode < XML_QUIRKS || mode > XML_1_1) {
84-
throw new IllegalArgumentException("Not a valid XML mode");
88+
if (mode < XML_QUIRKS || mode > XML_1_1_REPLACEMENT) {
89+
throw new IllegalArgumentException("Not a valid XML mode: " + mode);
8590
}
8691
}
8792

@@ -213,6 +218,8 @@ private void writeText(final String text, final boolean isAttribute) {
213218
case '\0':
214219
if (mode == XML_QUIRKS) {
215220
writer.write(NULL);
221+
} else if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
222+
writer.write(REPLACEMENT);
216223
} else {
217224
throw new StreamException("Invalid character 0x0 in XML stream");
218225
}
@@ -244,32 +251,53 @@ private void writeText(final String text, final boolean isAttribute) {
244251
//$FALL-THROUGH$
245252
default:
246253
if (Character.isDefined(c) && !Character.isISOControl(c)) {
254+
boolean replaced = false;
247255
if (mode != XML_QUIRKS) {
248256
if (c > '\ud7ff' && c < '\ue000') {
249-
throw new StreamException("Invalid character 0x"
250-
+ Integer.toHexString(c)
251-
+ " in XML stream");
257+
if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
258+
writer.write(REPLACEMENT);
259+
replaced = true;
260+
} else {
261+
throw new StreamException("Invalid character 0x"
262+
+ Integer.toHexString(c)
263+
+ " in XML stream");
264+
}
252265
}
253266
}
254-
writer.write(c);
267+
if (!replaced) {
268+
writer.write(c);
269+
}
255270
} else {
256-
if (mode == XML_1_0) {
271+
boolean replaced = false;
272+
if (mode == XML_1_0 || mode == XML_1_0_REPLACEMENT) {
257273
if (c < 9 || c == '\u000b' || c == '\u000c' || c == '\u000e' || c >= '\u000f' && c <= '\u001f') {
258-
throw new StreamException("Invalid character 0x"
259-
+ Integer.toHexString(c)
260-
+ " in XML 1.0 stream");
274+
if (mode == XML_1_0_REPLACEMENT) {
275+
writer.write(REPLACEMENT);
276+
replaced = true;
277+
} else {
278+
throw new StreamException("Invalid character 0x"
279+
+ Integer.toHexString(c)
280+
+ " in XML 1.0 stream");
281+
}
261282
}
262283
}
263284
if (mode != XML_QUIRKS) {
264285
if (c == '\ufffe' || c == '\uffff') {
265-
throw new StreamException("Invalid character 0x"
266-
+ Integer.toHexString(c)
267-
+ " in XML stream");
286+
if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
287+
writer.write(REPLACEMENT);
288+
replaced = true;
289+
} else {
290+
throw new StreamException("Invalid character 0x"
291+
+ Integer.toHexString(c)
292+
+ " in XML stream");
293+
}
268294
}
269295
}
270-
writer.write("&#x");
271-
writer.write(Integer.toHexString(c));
272-
writer.write(';');
296+
if (!replaced) {
297+
writer.write("&#x");
298+
writer.write(Integer.toHexString(c));
299+
writer.write(';');
300+
}
273301
}
274302
}
275303
}

xstream/src/test/com/thoughtworks/xstream/io/xml/PrettyPrintWriterTest.java

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) 2004, 2005 Joe Walnes.
3-
* Copyright (C) 2006, 2007, 2008, 2013, 2018 XStream Committers.
3+
* Copyright (C) 2006, 2007, 2008, 2013, 2018, 2023 XStream Committers.
44
* All rights reserved.
55
*
66
* The software in this package is published under the terms of the BSD
@@ -168,6 +168,24 @@ public void testThrowsForNullInXml1_1Mode() {
168168
}
169169
}
170170

171+
public void testReplacesNullInXml1_0ReplacementMode() {
172+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
173+
writer.startNode("tag");
174+
writer.setValue("\u0000");
175+
writer.endNode();
176+
177+
assertXmlProducedIs("<tag>&#xfffd;</tag>");
178+
}
179+
180+
public void testReplacesNullInXml1_1ReplacementMode() {
181+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
182+
writer.startNode("tag");
183+
writer.setValue("\u0000");
184+
writer.endNode();
185+
186+
assertXmlProducedIs("<tag>&#xfffd;</tag>");
187+
}
188+
171189
public void testSupportsOnlyValidControlCharactersInXml1_0Mode() {
172190
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0);
173191
writer.startNode("tag");
@@ -237,6 +255,65 @@ public void testSupportsOnlyValidControlCharactersInXml1_1Mode() {
237255
+ "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
238256
}
239257

258+
public void testReplacesInvalidControlCharactersInXml1_0ReplacementMode() {
259+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
260+
writer.startNode("tag");
261+
final String ctrl = ""
262+
+ "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
263+
+ "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
264+
+ "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
265+
+ "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
266+
+ "\u007f"
267+
+ "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
268+
+ "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
269+
+ "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
270+
+ "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
271+
+ "";
272+
for (int i = 0; i < ctrl.length(); i++) {
273+
final char c = ctrl.charAt(i);
274+
writer.setValue(new Character(c).toString());
275+
}
276+
writer.endNode();
277+
278+
assertXmlProducedIs("<tag>&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
279+
+ "&#xfffd;\t\n&#xfffd;&#xfffd;&#xd;&#xfffd;&#xfffd;"
280+
+ "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
281+
+ "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
282+
+ "&#x7f;"
283+
+ "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
284+
+ "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
285+
+ "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
286+
+ "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>"); }
287+
288+
public void testReplacesInvalidControlCharactersInXml1_1ReplacementMode() {
289+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
290+
writer.startNode("tag");
291+
final String ctrl = ""
292+
+ "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
293+
+ "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
294+
+ "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
295+
+ "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
296+
+ "\u007f"
297+
+ "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
298+
+ "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
299+
+ "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
300+
+ "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
301+
+ "";
302+
for (int i = 0; i < ctrl.length(); i++) {
303+
final char c = ctrl.charAt(i);
304+
writer.setValue(new Character(c).toString());
305+
}
306+
writer.endNode();
307+
assertXmlProducedIs("<tag>&#xfffd;&#x1;&#x2;&#x3;&#x4;&#x5;&#x6;&#x7;"
308+
+ "&#x8;\t\n&#xb;&#xc;&#xd;&#xe;&#xf;"
309+
+ "&#x10;&#x11;&#x12;&#x13;&#x14;&#x15;&#x16;&#x17;"
310+
+ "&#x18;&#x19;&#x1a;&#x1b;&#x1c;&#x1d;&#x1e;&#x1f;&#x7f;"
311+
+ "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
312+
+ "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
313+
+ "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
314+
+ "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
315+
}
316+
240317
public void testSupportsInvalidUnicodeCharacterslInQuirksMode() {
241318
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_QUIRKS);
242319
writer.startNode("tag");
@@ -295,6 +372,30 @@ public void testThrowsForInvalidUnicodeCharacterslInXml1_1Mode() {
295372
assertXmlProducedIs("<tag>&#xd7ff;\ue000\ufffd</tag>");
296373
}
297374

375+
public void testReplacesInvalidUnicodeCharactersInXml1_0ReplacementMode() {
376+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
377+
writer.startNode("tag");
378+
final String ctrl = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
379+
for (int i = 0; i < ctrl.length(); i++) {
380+
final char c = ctrl.charAt(i);
381+
writer.setValue(new Character(c).toString());
382+
}
383+
writer.endNode();
384+
assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
385+
}
386+
387+
public void testReplacesInvalidUnicodeCharactersInXml1_1ReplacementMode() {
388+
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
389+
writer.startNode("tag");
390+
final String ctrl = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
391+
for (int i = 0; i < ctrl.length(); i++) {
392+
final char c = ctrl.charAt(i);
393+
writer.setValue(new Character(c).toString());
394+
}
395+
writer.endNode();
396+
assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
397+
}
398+
298399
private String replace(final String in, final char what, final String with) {
299400
final int pos = in.indexOf(what);
300401
if (pos == -1) {

0 commit comments

Comments
 (0)