Skip to content

Commit 90113a9

Browse files
fred01 and sandwwraith authored
Added ability to read buffered huge strings in custom KSerializers (#2012)
Added stream-friendly version of decodeString for new ChunkedDecoder interface. Fixes #1987 Co-authored-by: Leonid Startsev <[email protected]>
1 parent 623dcad commit 90113a9

File tree

7 files changed

+275
-2
lines changed

7 files changed

+275
-2
lines changed

core/api/kotlinx-serialization-core.api

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,10 @@ public abstract class kotlinx/serialization/encoding/AbstractEncoder : kotlinx/s
414414
public fun shouldEncodeElementDefault (Lkotlinx/serialization/descriptors/SerialDescriptor;I)Z
415415
}
416416

417+
public abstract interface class kotlinx/serialization/encoding/ChunkedDecoder {
418+
public abstract fun decodeStringChunked (Lkotlin/jvm/functions/Function1;)V
419+
}
420+
417421
public abstract interface class kotlinx/serialization/encoding/CompositeDecoder {
418422
public static final field Companion Lkotlinx/serialization/encoding/CompositeDecoder$Companion;
419423
public static final field DECODE_DONE I
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package kotlinx.serialization.encoding
2+
3+
import kotlinx.serialization.ExperimentalSerializationApi
4+
5+
/**
 * This interface indicates that the decoder supports consuming large strings by chunks via the
 * [decodeStringChunked] method.
 * Currently, only the streaming json decoder implements this interface.
 * Please note that this interface is only applicable to streaming decoders. That means that it is not possible to use
 * some JsonTreeDecoder features like polymorphism with this interface.
 */
@ExperimentalSerializationApi
public interface ChunkedDecoder {
    /**
     * Method allows decoding a string value by fixed-size chunks.
     * Usable for handling very large strings that may not fit in memory.
     * Chunk size is guaranteed to not exceed 16384 chars (but it may be smaller than that).
     * Feeds string chunks to the provided consumer.
     *
     * @param consumeChunk - lambda function to handle string chunks
     *
     * Example usage:
     * ```
     * @Serializable(with = LargeStringSerializer::class)
     * data class LargeStringData(val largeString: String)
     *
     * @Serializable
     * data class ClassWithLargeStringDataField(val largeStringField: LargeStringData)
     *
     * object LargeStringSerializer : KSerializer<LargeStringData> {
     *     override val descriptor: SerialDescriptor = PrimitiveSerialDescriptor("LargeStringContent", PrimitiveKind.STRING)
     *
     *     override fun deserialize(decoder: Decoder): LargeStringData {
     *         require(decoder is ChunkedDecoder) { "Only chunked decoder supported" }
     *
     *         val tmpFile = createTempFile()
     *         FileWriter(tmpFile.toFile()).use { writer ->
     *             decoder.decodeStringChunked { chunk ->
     *                 writer.append(chunk)
     *             }
     *         }
     *         return LargeStringData("file://${tmpFile.absolutePathString()}")
     *     }
     * }
     * ```
     *
     * In this sample, we need to be able to handle a huge string coming from json. Instead of storing it in memory,
     * we offload it into a file and return the file name instead.
     */
    @ExperimentalSerializationApi
    public fun decodeStringChunked(consumeChunk: (chunk: String) -> Unit)
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package kotlinx.serialization.json
2+
3+
import kotlinx.serialization.*
4+
import kotlinx.serialization.Serializable
5+
import kotlinx.serialization.descriptors.*
6+
import kotlinx.serialization.encoding.*
7+
import kotlinx.serialization.test.assertFailsWithMessage
8+
import kotlin.test.*
9+
10+
11+
// Wrapper for a string value that is (de)serialized via the chunk-aware LargeStringSerializer.
@Serializable(with = LargeStringSerializer::class)
data class LargeStringData(val largeString: String)

// Holder class used to exercise chunked decoding of a nested string field.
@Serializable
data class ClassWithLargeStringDataField(val largeStringField: LargeStringData)
16+
17+
18+
object LargeStringSerializer : KSerializer<LargeStringData> {
    override val descriptor: SerialDescriptor = PrimitiveSerialDescriptor("LargeStringContent", PrimitiveKind.STRING)

    // Reassembles the string from decoder-provided chunks instead of decoding it in one piece.
    override fun deserialize(decoder: Decoder): LargeStringData {
        require(decoder is ChunkedDecoder) { "Only chunked decoder supported" }

        val assembled = buildString {
            decoder.decodeStringChunked { chunk -> append(chunk) }
        }
        return LargeStringData(assembled)
    }

    override fun serialize(encoder: Encoder, value: LargeStringData) {
        encoder.encodeString(value.largeString)
    }
}
36+
37+
open class JsonChunkedStringDecoderTest : JsonTestBase() {

    @Test
    fun decodePlainLenientString() {
        // More than 16k chars so the lexer has to deliver multiple chunks.
        val longString = "abcd".repeat(8192)
        val sourceObject = ClassWithLargeStringDataField(LargeStringData(longString))
        // An unquoted string value is only legal in lenient mode.
        val serializedObject = "{\"largeStringField\": $longString }"
        val jsonWithLenientMode = Json { isLenient = true }
        testDecodeInAllModes(jsonWithLenientMode, serializedObject, sourceObject)
    }

    @Test
    fun decodePlainString() {
        // An escape in the middle exercises the escape-handling path across chunk boundaries.
        val longStringWithEscape = "${"abcd".repeat(4096)}\"${"abcd".repeat(4096)}" // Make string more than 16k
        val sourceObject = ClassWithLargeStringDataField(LargeStringData(longStringWithEscape))
        val serializedObject = Json.encodeToString(sourceObject)
        testDecodeInAllModes(Json, serializedObject, sourceObject)
    }

    // Runs the round-trip in every testing mode except JAVA_STREAMS; TREE mode must reject
    // the chunk-only serializer because it is not a streaming decoder.
    private fun testDecodeInAllModes(
        serializer: Json, serializedObject: String, sourceObject: ClassWithLargeStringDataField
    ) {
        /* Filter out Java Streams mode in common tests. Java streams tested separately in java tests */
        JsonTestingMode.values().filterNot { it == JsonTestingMode.JAVA_STREAMS }.forEach { mode ->
            if (mode == JsonTestingMode.TREE) {
                assertFailsWithMessage<IllegalArgumentException>(
                    "Only chunked decoder supported", "Shouldn't decode JSON in TREE mode"
                ) {
                    serializer.decodeFromString<ClassWithLargeStringDataField>(serializedObject, mode)
                }
            } else {
                val deserializedObject =
                    serializer.decodeFromString<ClassWithLargeStringDataField>(serializedObject, mode)
                assertEquals(sourceObject.largeStringField, deserializedObject.largeStringField)
            }
        }
    }
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
package kotlinx.serialization.json
2+
3+
import kotlinx.serialization.*
4+
import kotlinx.serialization.Serializable
5+
import kotlinx.serialization.descriptors.*
6+
import kotlinx.serialization.encoding.*
7+
import kotlinx.serialization.test.assertFailsWithMessage
8+
import org.junit.Test
9+
import java.io.*
10+
import java.util.*
11+
import kotlin.random.Random
12+
import kotlin.test.*
13+
14+
15+
// Wrapper for binary data that is (de)serialized as a Base64 string via LargeBase64StringSerializer.
@Serializable(with = LargeBase64StringSerializer::class)
data class LargeBinaryData(val binaryData: ByteArray) {
    // ByteArray compares by reference by default, so compare contents explicitly.
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        val that = other as? LargeBinaryData ?: return false
        return binaryData.contentEquals(that.binaryData)
    }

    override fun hashCode(): Int = binaryData.contentHashCode()
}
32+
33+
// Holder class used to exercise chunked Base64 decoding of a nested field.
@Serializable
data class ClassWithBinaryDataField(val binaryField: LargeBinaryData)
35+
36+
object LargeBase64StringSerializer : KSerializer<LargeBinaryData> {
    private val b64Decoder: Base64.Decoder = Base64.getDecoder()
    override val descriptor: SerialDescriptor = PrimitiveSerialDescriptor("LargeStringContent", PrimitiveKind.STRING)

    /**
     * Decodes a (potentially huge) Base64 string chunk by chunk.
     *
     * Chunk boundaries rarely align with Base64's 4-char quantum, so the unaligned tail
     * of each chunk is carried over and prepended to the next one before decoding.
     */
    override fun deserialize(decoder: Decoder): LargeBinaryData {
        require(decoder is ChunkedDecoder) { "Only chunked decoder supported" }

        var remainder = ""
        val decodedBytes = ByteArrayOutputStream().use { bos ->
            decoder.decodeStringChunked { chunk ->
                val actualChunk = remainder + chunk
                val remainderLength = actualChunk.length % 4
                val alignedLength = actualChunk.length - remainderLength
                val alignedChunk = actualChunk.take(alignedLength)
                remainder = actualChunk.takeLast(remainderLength)
                bos.write(b64Decoder.decode(alignedChunk))
            }
            bos.toByteArray()
        }
        // Previously a dangling tail (total length not a multiple of 4) was silently dropped;
        // such input is malformed Base64, so fail loudly instead.
        require(remainder.isEmpty()) { "Malformed Base64: dangling characters \"$remainder\"" }

        return LargeBinaryData(decodedBytes)
    }

    override fun serialize(encoder: Encoder, value: LargeBinaryData) {
        encoder.encodeString(Base64.getEncoder().encodeToString(value.binaryData))
    }
}
63+
64+
class JsonChunkedBase64DecoderTest : JsonTestBase() {

    @Test
    fun decodeBase64String() {
        val sourceObject =
            ClassWithBinaryDataField(LargeBinaryData(Random.nextBytes(16 * 1024))) // After encoding to Base64 will be larger than 16k (JsonLexer#BATCH_SIZE)
        val serializedObject = Json.encodeToString(sourceObject)

        // JVM-only test, so JAVA_STREAMS mode is exercised here (it is filtered out in common tests).
        JsonTestingMode.values().forEach { mode ->
            if (mode == JsonTestingMode.TREE) {
                // TREE mode is not a streaming decoder, so the serializer's require() must fire.
                assertFailsWithMessage<IllegalArgumentException>(
                    "Only chunked decoder supported", "Shouldn't decode JSON in TREE mode"
                ) {
                    Json.decodeFromString<ClassWithBinaryDataField>(serializedObject, mode)
                }
            } else {
                val deserializedObject = Json.decodeFromString<ClassWithBinaryDataField>(serializedObject, mode)
                assertEquals(sourceObject.binaryField, deserializedObject.binaryField)
            }
        }
    }
}

formats/json/commonMain/src/kotlinx/serialization/json/internal/StreamingJsonDecoder.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ internal open class StreamingJsonDecoder(
2424
@JvmField internal val lexer: AbstractJsonLexer,
2525
descriptor: SerialDescriptor,
2626
discriminatorHolder: DiscriminatorHolder?
27-
) : JsonDecoder, AbstractDecoder() {
27+
) : JsonDecoder, ChunkedDecoder, AbstractDecoder() {
2828

2929
// A mutable reference to the discriminator that have to be skipped when in optimistic phase
3030
// of polymorphic serialization, see `decodeSerializableValue`
@@ -343,6 +343,10 @@ internal open class StreamingJsonDecoder(
343343
}
344344
}
345345

346+
// Streams the string value to the consumer chunk by chunk straight from the lexer
// instead of materializing the whole string; quoting rules depend on isLenient.
override fun decodeStringChunked(consumeChunk: (chunk: String) -> Unit) {
    lexer.consumeStringChunked(configuration.isLenient, consumeChunk)
}
349+
346350
override fun decodeInline(descriptor: SerialDescriptor): Decoder =
347351
if (descriptor.isUnsignedNumber) JsonDecoderForUnsignedTypes(lexer, json)
348352
else super.decodeInline(descriptor)

formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/AbstractJsonLexer.kt

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
package kotlinx.serialization.json.internal
66

7-
import kotlinx.serialization.json.internal.*
87
import kotlinx.serialization.json.internal.CharMappings.CHAR_TO_TOKEN
98
import kotlinx.serialization.json.internal.CharMappings.ESCAPE_2_CHAR
109
import kotlin.js.*
@@ -310,6 +309,58 @@ internal abstract class AbstractJsonLexer {
310309
*/
311310
abstract fun consumeKeyString(): String
312311

312+
// True while `char` still belongs to the string body: in lenient mode any "other"
// token char counts; otherwise everything up to the closing quote does.
private fun insideString(isLenient: Boolean, char: Char): Boolean = when {
    isLenient -> charToTokenClass(char) == TC_OTHER
    else -> char != STRING
}
317+
318+
/**
 * Consumes a string value chunk by chunk, feeding each chunk to [consumeChunk]
 * instead of building one large string. A chunk boundary occurs whenever the
 * lexer's input buffer runs out, so a chunk never exceeds the buffer size.
 */
open fun consumeStringChunked(isLenient: Boolean, consumeChunk: (stringChunk: String) -> Unit) { // open to allow simpler implementations (i.e. StringJsonLexer)
    val nextToken = peekNextToken()
    if (isLenient && nextToken != TC_OTHER) return // nothing to consume

    if (!isLenient) {
        consumeNextToken(STRING)
    }
    var currentPosition = this.currentPosition
    var lastPosition = currentPosition
    var char = source[currentPosition] // Avoid two range checks visible in the profiler
    var usedAppend = false
    while (insideString(isLenient, char)) {
        if (!isLenient && char == STRING_ESC) { // handle escaping only in non-lenient mode
            usedAppend = true
            currentPosition = prefetchOrEof(appendEscape(lastPosition, currentPosition))
            lastPosition = currentPosition
        } else {
            currentPosition++
        }
        if (currentPosition >= source.length) {
            // Buffer exhausted: flush what we have as a chunk, then refill the buffer.
            writeRange(lastPosition, currentPosition, usedAppend, consumeChunk)
            usedAppend = false
            currentPosition = prefetchOrEof(currentPosition)
            if (currentPosition == -1)
                // EOF in mid-string means the string was never terminated.
                fail("EOF", currentPosition)
            lastPosition = currentPosition
        }
        char = source[currentPosition]
    }
    // Flush the final chunk and restore the lexer position past the consumed value.
    writeRange(lastPosition, currentPosition, usedAppend, consumeChunk)
    this.currentPosition = currentPosition
    if (!isLenient) {
        consumeNextToken(STRING)
    }
}
354+
355+
// Delivers source[fromIndex, toIndex) to the consumer. If the chunk contained an escape
// sequence, the decoded form must be produced via decodedString; otherwise a plain
// substring of the buffer suffices.
private fun writeRange(fromIndex: Int, toIndex: Int, currentChunkHasEscape: Boolean, consumeChunk: (stringChunk: String) -> Unit) {
    val chunk = if (currentChunkHasEscape) decodedString(fromIndex, toIndex) else substring(fromIndex, toIndex)
    consumeChunk(chunk)
}
362+
363+
313364
fun consumeString(): String {
314365
if (peekedString != null) {
315366
return takePeeked()

formats/json/commonMain/src/kotlinx/serialization/json/internal/lexer/StringJsonLexer.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ internal class StringJsonLexer(override val source: String) : AbstractJsonLexer(
9797
return source.substring(current, closingQuote)
9898
}
9999

100+
// The whole input is already in memory here, so decode the full string at once and
// slice it into BATCH_SIZE pieces to honor the chunked contract.
override fun consumeStringChunked(isLenient: Boolean, consumeChunk: (stringChunk: String) -> Unit) {
    val fullString = if (isLenient) consumeStringLenient() else consumeString()
    fullString.chunked(BATCH_SIZE).forEach(consumeChunk)
}
103+
100104
override fun consumeLeadingMatchingValue(keyToMatch: String, isLenient: Boolean): String? {
101105
val positionSnapshot = currentPosition
102106
try {

0 commit comments

Comments
 (0)