Skip to content

Commit 9cdf2ce

Browse files
committed
Make whitespace handling conform to the XML standard. No longer recognize LFCR as a single newline. Support #x85 and #x2028 as line ends.
1 parent 5677178 commit 9cdf2ce

File tree

8 files changed

+146
-98
lines changed

8 files changed

+146
-98
lines changed

Changelog.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ Changes:
2020
only applies if there are 2 or more characters and this would collapse to an
2121
empty string). The old behaviour only allowed a single character without
2222
surrounding whitespace.
23+
- Standards-compliant line ending handling. \n\r is not collapsed into a single
24+
line end anymore. Characters #x85 and #x2028 are now handled as line ends (unless
25+
preceded by \r)
26+
2327
- Update kotlinx.io support to 0.9.0, atomicfu to 0.31.0, kotlinx.serialization
2428
to 1.10.0, kotlinx.benchmark to 0.4.16, kotlin to 2.3.10, junit to 5.14.3.
2529
- Always expand entities in attribute values (causing an exception if the entity

benchmark/src/jvmTest/kotlin/nl/adaptivity/xmlutil/benchmark/test/SerializationTest.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ open class SerializationTest : Serialization() {
5454
@Test
5555
fun testAttributePositionRegression() {
5656
val schemaName = "/xsts/ibmData/valid/S3_12/s3_12v03.xsd"
57-
val xml = XML.v1()
57+
val xml = XML.v1 { defaultToGenericParser = true }
5858
val schemaText = String(javaClass.getResourceAsStream(schemaName)!!.readAllBytes())
5959
val schema = xml.decodeFromString<XSSchema>(schemaText)
6060

core/base/src/commonMain/kotlin/nl/adaptivity/xmlutil/core/InOutBuffer.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ public interface InOutBuffer {
206206
val firstChar = readChar()
207207
require (isXmlWhitespace(firstChar)) { "Expected whitespace, but found non-whitespace: '$firstChar'" }
208208

209-
while (peek().let { it == '\t'.code || it == '\r'.code || it == ' '.code || it == '\r'.code }) {
209+
while (peek().let { it == '\t'.code || it == '\r'.code || it == ' '.code || it == '\n'.code || it == 0x85 || it == 0x2028 }) {
210210
markPeekedAsRead() //needs line ending handling
211211
}
212212
}
@@ -220,7 +220,7 @@ public interface InOutBuffer {
220220
while (c >= 0) {
221221
when (c.toChar()) {
222222
'\t', ' ' -> cnt += 1
223-
'\n', '\r' -> {
223+
'\n', '\r', '\u0085', '\u2028' -> {
224224
if (cnt > 0) skip(cnt)
225225
cnt = 0
226226
markPeekedAsRead() // does newlines for us

core/base/src/commonMain/kotlin/nl/adaptivity/xmlutil/core/KtXmlReader.kt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -578,8 +578,6 @@ public class KtXmlReader(
578578
inOutBuffer.resumeCopySequence()
579579
}
580580

581-
'\r' -> throw AssertionError("Carriage returns should have been normalized out here")
582-
583581
delimiter -> return
584582
else -> inOutBuffer.markPeekedAsRead()
585583
}

core/base/src/commonMain/kotlin/nl/adaptivity/xmlutil/core/KtXmlWriter.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ public class KtXmlWriter(
545545

546546
for (c in text) {
547547
when (c) {
548-
' ', '\t', '\r', '\n' -> {}
548+
' ', '\t', '\r', '\n', '\u0085', '\u2028' -> {}
549549
else -> throw IllegalArgumentException("\"$text\" is not ignorable whitespace")
550550
}
551551
}

core/base/src/commonMain/kotlin/nl/adaptivity/xmlutil/core/internal/StringInOutBuffer.kt

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,10 @@ public class StringInOutBuffer(public val input: CharSequence): InOutBuffer {
162162
private fun peekCommon(bufPos: Int): Int {
163163
return if (bufPos >= input.length) -1
164164
else {
165-
val c = input[bufPos]
166-
if (c == '\r') '\n'.code else c.code
165+
when (val c = input[bufPos]) {
166+
'\r', '\u0085', '\u2028' -> '\n'.code
167+
else -> c.code
168+
}
167169
}
168170
}
169171

@@ -201,66 +203,59 @@ public class StringInOutBuffer(public val input: CharSequence): InOutBuffer {
201203
}
202204

203205
override fun markPeekedAsRead() {
204-
fun handleLineEnd(complement: Int, oldPos: Int) {
205-
if (peek(1) == complement) {
206-
pauseCopySequence()
207-
addToCopySequence('\n')
208-
offset = oldPos + 2
209-
resumeCopySequence()
210-
} else {
211-
if (copySequenceState == State.ACTIVE && complement == '\n'.code) {
212-
pauseCopySequence()
213-
addToCopySequence('\n')
214-
offset = oldPos + 1
215-
resumeCopySequence()
216-
} else {
217-
offset = oldPos + 1
218-
}
219-
}
220-
221-
line += 1
222-
lastColumnStart = offset
223-
}
224206

225207
val oldPos = offset
226208
val peeked = input[oldPos]
227209
when (peeked) {
228-
'\r' -> handleLineEnd('\n'.code, oldPos)
229-
'\n' -> handleLineEnd('\r'.code, oldPos)
210+
'\r' -> handle2CharLineEnd(oldPos)
211+
'\n', '\u0085', '\u2028' -> handleLineEnd(oldPos + 1)
230212
else -> offset = oldPos + 1
231213
}
232214
}
233215

234-
/** Does never read more than needed */
235-
override fun read(): Int {
236-
fun handleLineEnd(complement: Int, oldPos: Int) {
237-
val inc = if(peek(2) == complement) 2 else 1
238-
val newPos = oldPos + inc
239-
if (copySequenceState == State.ACTIVE && (inc == 2 || complement == '\n'.code)) {
240-
pauseCopySequence()
241-
addToCopySequence('\n')
242-
offset = newPos
243-
resumeCopySequence()
244-
} else {
245-
offset = newPos
246-
}
247-
lastColumnStart = offsetBase + newPos
248-
line += 1
216+
private fun handleLineEnd(newPos: Int) {
217+
// this cannot use peek, as peek normalizes line ends (so the test would not work)
218+
if (copySequenceState == State.ACTIVE && input[offset] != '\n') {
219+
pauseCopySequence()
220+
addToCopySequence('\n')
221+
offset = newPos
222+
resumeCopySequence()
223+
} else {
224+
offset = newPos
249225
}
226+
lastColumnStart = newPos
227+
line += 1
228+
}
250229

230+
private fun handle2CharLineEnd(oldPos: Int) {
231+
val nextChar = peek(1)
232+
val inc = if(nextChar == 0xA || nextChar == 0x85) 2 else 1
233+
handleLineEnd(oldPos + inc)
234+
}
235+
236+
237+
/** Does never read more than needed */
238+
override fun read(): Int {
251239
// In this case we *may* need the right buffer, otherwise not
252240
// optimize this implementation for the "happy" path
253241
val oldPos = offset
254242
if (oldPos >= input.length) return -1
255243

256244
when (val char = input[oldPos]) {
257245
'\r' -> { // as \r is always transformed to \n, this requires a stringBuilder.
258-
handleLineEnd('\n'.code, oldPos)
246+
handle2CharLineEnd(oldPos)
247+
return '\n'.code
248+
}
249+
250+
'\u0085',
251+
'\u2028' -> {
252+
handleLineEnd(oldPos + 1)
259253
return '\n'.code
260254
}
261255

262256
'\n' -> {
263-
handleLineEnd('\r'.code, oldPos)
257+
offset = oldPos + 1
258+
lastColumnStart = oldPos + 1
264259
return '\n'.code
265260
}
266261

core/base/src/commonMain/kotlin/nl/adaptivity/xmlutil/core/internal/SwappedInOutBuffer.kt

Lines changed: 52 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -240,17 +240,20 @@ public class SwappedInOutBuffer(public val reader: Reader): InOutBuffer {
240240
return peekCommon(srcBufPos + offset)
241241
}
242242

243+
private fun rawPeek(bufPos: Int):Int = when {
244+
bufPos >= srcBufCount -> -1
245+
bufPos >= BUF_SIZE -> bufRight[bufPos - BUF_SIZE].code
246+
else -> bufLeft[bufPos].code
247+
}
248+
243249
private fun peekCommon(bufPos: Int): Int {
244250
// end of buffer. This implies bufPos < 2 * BUF_SIZE
245251
if (bufPos >= srcBufCount) return -1
246-
val c = when {
247-
bufPos >= BUF_SIZE -> bufRight[bufPos - BUF_SIZE]
248-
else -> bufLeft[bufPos]
249-
}
252+
val c = rawPeek(bufPos)
250253

251254
return when (c) {
252-
'\r' -> '\n'.code
253-
else -> c.code
255+
'\r'.code, 0x85, 0x2028 -> '\n'.code
256+
else -> c
254257
}
255258
}
256259

@@ -301,58 +304,51 @@ public class SwappedInOutBuffer(public val reader: Reader): InOutBuffer {
301304
}
302305

303306
override fun markPeekedAsRead() {
304-
fun handleLineEnd(complement: Int, oldPos: Int) {
305-
if (peek(1) == complement) {
306-
if (copySequenceState == State.ACTIVE) {
307-
pauseCopySequence()
308-
srcBufPos = oldPos + 2
309-
addToCopySequence('\n')
310-
resumeCopySequence()
311-
} else {
312-
srcBufPos = oldPos + 2
313-
}
314-
} else {
315-
if (copySequenceState == State.ACTIVE && complement == '\n'.code) {
316-
pauseCopySequence()
317-
addToCopySequence('\n')
318-
srcBufPos = oldPos + 1
319-
resumeCopySequence()
320-
} else {
321-
srcBufPos = oldPos + 1
322-
}
323-
}
324-
325-
line += 1
326-
lastColumnStart = offset
327-
}
328307

329308
val oldPos = srcBufPos
330-
val peeked = if (oldPos < BUF_SIZE) bufLeft[oldPos] else bufRight[oldPos - BUF_SIZE]
309+
val peeked = rawPeek (oldPos).toChar()
331310
when (peeked) {
332-
'\r' -> handleLineEnd('\n'.code, oldPos)
333-
'\n' -> handleLineEnd('\r'.code, oldPos)
311+
'\r' -> handle2CharLineEnd(oldPos)
312+
'\u0085', '\u2028' -> {
313+
bufLeft[oldPos] = '\n'
314+
handleLineEnd(oldPos + 1)
315+
}
316+
317+
'\n' -> handleLineEnd(oldPos + 1)
318+
334319
else -> srcBufPos = oldPos + 1
335320
}
336321
if (srcBufPos >= BUF_SIZE) swapInputBuffer()
337322
}
338323

339-
/** Does never read more than needed */
340-
override fun read(): Int {
341-
fun handleLineEnd(complement: Int, oldPos: Int) {
342-
val inc = if(peek(2) == complement) 2 else 1
343-
val newPos = oldPos + inc
344-
if (copySequenceState == State.ACTIVE && (inc == 2 || complement == '\n'.code)) {
345-
pauseCopySequence()
346-
addToCopySequence('\n')
347-
srcBufPos = newPos
348-
resumeCopySequence()
349-
} else {
350-
srcBufPos = newPos
324+
private fun handleLineEnd(newPos: Int) {
325+
if (copySequenceState == State.ACTIVE && rawPeek(srcBufPos) != '\n'.code) {
326+
pauseCopySequence()
327+
addToCopySequence('\n')
328+
srcBufPos = newPos
329+
resumeCopySequence()
330+
} else {
331+
srcBufPos = newPos
332+
}
333+
lastColumnStart = offsetBase + newPos
334+
line += 1
335+
}
336+
337+
private fun handle2CharLineEnd(oldPos: Int) {
338+
val nextChar = peek(1)
339+
val inc = when (nextChar) {
340+
0xA, 0x85 -> 2
341+
342+
else -> {
343+
bufLeft[oldPos] = '\n'
344+
1
351345
}
352-
lastColumnStart = offsetBase + newPos
353-
line += 1
354346
}
347+
handleLineEnd(oldPos + inc)
348+
}
355349

350+
/** Does never read more than needed */
351+
override fun read(): Int {
356352
// In this case we *may* need the right buffer, otherwise not
357353
// optimize this implementation for the "happy" path
358354
var oldPos = srcBufPos
@@ -367,12 +363,19 @@ public class SwappedInOutBuffer(public val reader: Reader): InOutBuffer {
367363

368364
when (val char = bufLeft[oldPos]) {
369365
'\r' -> { // as \r is always transformed to \n, this requires a stringBuilder.
370-
handleLineEnd('\n'.code, oldPos)
366+
handle2CharLineEnd(oldPos)
367+
return '\n'.code
368+
}
369+
370+
'\u0085',
371+
'\u2028' -> {
372+
bufLeft[srcBufPos] = '\n'
373+
handleLineEnd(oldPos + 1)
371374
return '\n'.code
372375
}
373376

374377
'\n' -> {
375-
handleLineEnd('\r'.code, oldPos)
378+
handleLineEnd(oldPos + 1)
376379
return '\n'.code
377380
}
378381

@@ -383,7 +386,6 @@ public class SwappedInOutBuffer(public val reader: Reader): InOutBuffer {
383386
}
384387
}
385388

386-
387389
private fun readUntilFullOrEOF(buffer: CharArray): Int {
388390
val bufSize = buffer.size
389391
var totalRead: Int = reader.read(buffer, 0, bufSize)

core/base/src/commonTest/kotlin/nl/adaptivity/xmlutil/test/TestKtXmlReader.kt

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import nl.adaptivity.xmlutil.core.KtXmlWriter
2727
import nl.adaptivity.xmlutil.core.impl.multiplatform.StringReader
2828
import nl.adaptivity.xmlutil.core.impl.multiplatform.StringWriter
2929
import nl.adaptivity.xmlutil.core.impl.multiplatform.use
30+
import nl.adaptivity.xmlutil.core.internal.StringInOutBuffer
3031
import nl.adaptivity.xmlutil.test.multiplatform.Target
3132
import nl.adaptivity.xmlutil.test.multiplatform.testTarget
3233
import kotlin.test.Test
@@ -81,7 +82,7 @@ class TestKtXmlReader : TestCommonReader() {
8182
}
8283

8384
@Test
84-
fun testXmlDecl() {
85+
fun testXmlDeclReader() {
8586
val reader = KtXmlReader(StringReader("<?xml version=\"1.1\" standalone=\"yes\"?>\r<foo>bar</foo>"))
8687
assertEquals(EventType.START_DOCUMENT, reader.next())
8788
assertEquals("1.1", reader.version)
@@ -100,6 +101,54 @@ class TestKtXmlReader : TestCommonReader() {
100101
assertEquals(2, reader.getLineNumber())
101102
}
102103

104+
@Test
105+
fun testXmlDeclString() {
106+
val reader = KtXmlReader(StringInOutBuffer("<?xml version=\"1.1\" standalone=\"yes\"?>\r<foo>bar</foo>"))
107+
assertEquals(EventType.START_DOCUMENT, reader.next())
108+
assertEquals("1.1", reader.version)
109+
assertEquals(true, reader.standalone)
110+
assertEquals(39, reader.getColumnNumber())
111+
assertEquals(1, reader.getLineNumber())
112+
113+
assertEquals(EventType.IGNORABLE_WHITESPACE, reader.next())
114+
assertEquals("\n", reader.text)
115+
assertEquals(1, reader.getColumnNumber())
116+
assertEquals(2, reader.getLineNumber())
117+
118+
assertEquals(EventType.START_ELEMENT, reader.next())
119+
assertEquals("foo", reader.localName)
120+
assertEquals(6, reader.getColumnNumber())
121+
assertEquals(2, reader.getLineNumber())
122+
}
123+
124+
fun testParseNewline(newLine: String, count: Int = 1) {
125+
val xml = "<tag>$newLine</tag>"
126+
val reader1 = KtXmlReader(StringReader(xml))
127+
assertEquals(EventType.START_ELEMENT, reader1.nextTag())
128+
assertEquals(EventType.IGNORABLE_WHITESPACE, reader1.next())
129+
assertEquals("\n".repeat(count), reader1.text)
130+
assertEquals(EventType.END_ELEMENT, reader1.next())
131+
assertEquals(1+count, reader1.getLineNumber())
132+
assertEquals(7, reader1.getColumnNumber())
133+
134+
val reader2 = KtXmlReader(StringInOutBuffer(xml))
135+
assertEquals(EventType.START_ELEMENT, reader2.nextTag())
136+
assertEquals(EventType.IGNORABLE_WHITESPACE, reader2.next())
137+
assertEquals("\n".repeat(count), reader2.text)
138+
assertEquals(EventType.END_ELEMENT, reader2.next())
139+
assertEquals(1+count, reader2.getLineNumber())
140+
assertEquals(7, reader2.getColumnNumber())
141+
}
142+
143+
@Test fun testParseNewlineLF() = testParseNewline("\n")
144+
@Test fun testParseNewlineCR() = testParseNewline("\r")
145+
@Test fun testParseNewlineCRLF() = testParseNewline("\r\n")
146+
@Test fun testParseNewlineCR85() = testParseNewline("\r\u0085")
147+
@Test fun testParseNewlineCR2028() = testParseNewline("\r\u2028")
148+
@Test fun testParseNewline85() = testParseNewline("\u0085")
149+
@Test fun testParseNewline2028() = testParseNewline("\u2028")
150+
@Test fun testParseNewlineLFCR() = testParseNewline("\n\r", 2)
151+
103152
@Test
104153
fun testUnquotedAttributeValues() {
105154
val xml = "<tag attr='foo' attr2=b&lt;ar attr3=baz/>"

0 commit comments

Comments
 (0)