Skip to content

Commit 5d75ab3

Browse files
committed
Fix json parsing for fields named 'value' and 'array'
1 parent da440c1 commit 5d75ab3

File tree

2 files changed

+129
-13
lines changed
  • core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io
  • tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io

2 files changed

+129
-13
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ import org.jetbrains.kotlinx.dataframe.DataColumn
1313
import org.jetbrains.kotlinx.dataframe.DataFrame
1414
import org.jetbrains.kotlinx.dataframe.DataRow
1515
import org.jetbrains.kotlinx.dataframe.api.cast
16+
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
1617
import org.jetbrains.kotlinx.dataframe.api.getColumn
1718
import org.jetbrains.kotlinx.dataframe.api.indices
18-
import org.jetbrains.kotlinx.dataframe.api.map
1919
import org.jetbrains.kotlinx.dataframe.api.mapIndexed
2020
import org.jetbrains.kotlinx.dataframe.api.name
2121
import org.jetbrains.kotlinx.dataframe.api.rows
@@ -71,17 +71,25 @@ public fun DataRow.Companion.readJson(stream: InputStream, header: List<String>
7171
/**
 * Parses [text] as JSON with the default [Parser] and converts the parsed tree into a data frame.
 *
 * @param text JSON document to parse.
 * @param header forwarded unchanged to `readJson`; empty by default.
 * @return the frame produced by `readJson` for the parsed document.
 */
public fun DataFrame.Companion.readJsonStr(text: String, header: List<String> = emptyList()): AnyFrame {
    val parsed = Parser.default().parse(StringBuilder(text))
    return readJson(parsed, header)
}
7272
/**
 * Parses [text] as JSON via [DataFrame.readJsonStr] and returns the single row of the resulting frame.
 *
 * @param text JSON document to parse.
 * @param header forwarded unchanged to [DataFrame.readJsonStr]; empty by default.
 * @return the result of calling `single()` on the parsed frame.
 */
// NOTE(review): presumably `single()` fails unless the frame has exactly one row — confirm.
public fun DataRow.Companion.readJsonStr(text: String, header: List<String> = emptyList()): AnyRow {
    val frame = DataFrame.readJsonStr(text, header)
    return frame.single()
}
7373

74-
private fun readJson(parsed: Any?, header: List<String>) = when (parsed) {
75-
is JsonArray<*> -> fromJsonList(parsed.value, header)
76-
else -> fromJsonList(listOf(parsed))
74+
private fun readJson(parsed: Any?, header: List<String>): DataFrame<*> {
75+
val df = when (parsed) {
76+
is JsonArray<*> -> fromJsonList(parsed.value, header)
77+
else -> fromJsonList(listOf(parsed))
78+
}
79+
return df.unwrapUnnamedColumns()
7780
}
7881

82+
// Rebuilds the frame from its columns, replacing every UnnamedColumn wrapper with the column it wraps.
// Columns that are not wrapped are passed through as they are.
private fun DataFrame<Any?>.unwrapUnnamedColumns() =
83+
dataFrameOf(columns().map { it.unwrapUnnamedColumn() })
84+
85+
/**
 * Returns the column wrapped by an [UnnamedColumn], or the receiver itself when it is not such a wrapper.
 */
private fun AnyCol.unwrapUnnamedColumn() = when (this) {
    is UnnamedColumn -> col
    else -> this
}
86+
7987
private val arrayColumnName = "array"
8088

8189
internal val valueColumnName = "value"
8290

8391
internal fun fromJsonList(records: List<*>, header: List<String> = emptyList()): AnyFrame {
84-
fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0).name.let { it == org.jetbrains.kotlinx.dataframe.io.valueColumnName || it == org.jetbrains.kotlinx.dataframe.io.arrayColumnName }
92+
fun AnyFrame.isSingleUnnamedColumn() = ncol == 1 && getColumn(0) is UnnamedColumn
8593

8694
var hasPrimitive = false
8795
var hasArray = false
@@ -108,7 +116,7 @@ internal fun fromJsonList(records: List<*>, header: List<String> = emptyList()):
108116

109117
val columns: List<AnyCol> = nameGenerator.names.map { colName ->
110118
when {
111-
colName == valueColumn -> {
119+
colName == valueColumn && hasPrimitive -> {
112120
val collector = createDataCollector(records.size)
113121
val nanIndices = mutableListOf<Int>()
114122
records.forEachIndexed { i, v ->
@@ -120,7 +128,7 @@ internal fun fromJsonList(records: List<*>, header: List<String> = emptyList()):
120128
}
121129
}
122130
val column = collector.toColumn(colName)
123-
if (nanIndices.isNotEmpty()) {
131+
val res = if (nanIndices.isNotEmpty()) {
124132
fun <C> DataColumn<C>.updateNaNs(nanValue: C): DataColumn<C> {
125133
var j = 0
126134
var nextNanIndex = nanIndices[j]
@@ -139,24 +147,27 @@ internal fun fromJsonList(records: List<*>, header: List<String> = emptyList()):
139147
else -> column
140148
}
141149
} else column
150+
UnnamedColumn(res)
142151
}
143-
colName == arrayColumn -> {
152+
colName == arrayColumn && hasArray -> {
144153
val values = mutableListOf<Any?>()
145154
val startIndices = ArrayList<Int>()
146155
records.forEach {
147156
startIndices.add(values.size)
148157
if (it is JsonArray<*>) values.addAll(it.value)
149158
}
150159
val parsed = fromJsonList(values)
151-
when {
160+
161+
val res = when {
152162
parsed.isSingleUnnamedColumn() -> {
153-
val col = parsed.getColumn(0)
163+
val col = (parsed.getColumn(0) as UnnamedColumn).col
154164
val elementType = col.type
155165
val values = col.values.asList().splitByIndices(startIndices.asSequence()).toList()
156166
DataColumn.createValueColumn(colName, values, List::class.createType(listOf(KTypeProjection.invariant(elementType))))
157167
}
158-
else -> DataColumn.createFrameColumn(colName, parsed, startIndices)
168+
else -> DataColumn.createFrameColumn(colName, parsed.unwrapUnnamedColumns(), startIndices)
159169
}
170+
UnnamedColumn(res)
160171
}
161172
else -> {
162173
val values = ArrayList<Any?>(records.size)
@@ -171,19 +182,26 @@ internal fun fromJsonList(records: List<*>, header: List<String> = emptyList()):
171182
val parsed = fromJsonList(values)
172183
when {
173184
parsed.ncol == 0 -> DataColumn.createValueColumn(colName, arrayOfNulls<Any?>(values.size).toList(), typeOf<Any?>())
174-
parsed.isSingleUnnamedColumn() -> parsed.getColumn(0).rename(colName)
175-
else -> DataColumn.createColumnGroup(colName, parsed) as AnyCol
185+
parsed.isSingleUnnamedColumn() -> (parsed.getColumn(0) as UnnamedColumn).col.rename(colName)
186+
else -> DataColumn.createColumnGroup(colName, parsed.unwrapUnnamedColumns()) as AnyCol
176187
}
177188
}
178189
}
179190
}
191+
180192
return when {
181193
columns.isEmpty() -> DataFrame.empty(records.size)
182194
columns.size == 1 && hasArray && header.isNotEmpty() && columns[0].typeClass == List::class -> columns[0].cast<List<*>>().splitInto(*header.toTypedArray())
183195
else -> columns.toDataFrame()
184196
}
185197
}
186198

199+
// Wrapper that lets callers check whether the AnyFrame produced by a recursive call consists of a single unnamed column.
200+
// An unnamed column is one that was not created from a field of a record, e.g. [{"value": 1}, {"value": 2}],
201+
// but holds the primitive values filtered out of mixed records: [1, { ... }, []] -> [1, null, null],
202+
// or the arrays filtered out of them: [1, { ... }, []] -> [null, null, []].
203+
private class UnnamedColumn(val col: DataColumn<Any?>) : DataColumn<Any?> by col
204+
187205
private val valueTypes = setOf(Boolean::class, Double::class, Int::class, Float::class, Long::class, Short::class, Byte::class)
188206

189207
internal fun KlaxonJson.encodeRow(frame: ColumnsContainer<*>, index: Int): JsonObject? {

tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import io.kotest.matchers.shouldBe
44
import org.jetbrains.kotlinx.dataframe.DataFrame
55
import org.jetbrains.kotlinx.dataframe.api.convert
66
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
7+
import org.jetbrains.kotlinx.dataframe.api.getColumnGroup
8+
import org.jetbrains.kotlinx.dataframe.api.getFrameColumn
79
import org.jetbrains.kotlinx.dataframe.api.toDouble
810
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
911
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
@@ -76,6 +78,28 @@ class JsonTests {
7678
val group = df["a"] as FrameColumn<*>
7779
}
7880

81+
@Test
82+
fun `parse json with nested json array with mixed values`() {
83+
val json = """[
84+
{"a":"text"},
85+
{"a":{"b":2}},
86+
{"a":[6, {"a": "b"}, [1, {"a" : "b"}],8]}
87+
]
88+
""".trimIndent()
89+
val df = DataFrame.readJsonStr(json)
90+
df.columnsCount() shouldBe 1
91+
df.rowsCount() shouldBe 3
92+
val group = df["a"] as ColumnGroup<*>
93+
group.columnsCount() shouldBe 3
94+
group["b"].type() shouldBe typeOf<Int?>()
95+
group["value"].type() shouldBe typeOf<String?>()
96+
group["array"].type() shouldBe typeOf<DataFrame<*>>()
97+
val nestedDf = group.getFrameColumn("array")[2]
98+
nestedDf["a"].type() shouldBe typeOf<String?>()
99+
nestedDf["value"].type() shouldBe typeOf<Int?>()
100+
nestedDf["array"].type() shouldBe typeOf<DataFrame<*>>()
101+
}
102+
79103
@Test
80104
fun `write df with primitive types`() {
81105
val df = dataFrameOf("colInt", "colDouble?", "colBoolean?")(
@@ -108,4 +132,78 @@ class JsonTests {
108132
df["v"].type() shouldBe typeOf<String>()
109133
DataFrame.readJsonStr(df.toJson()) shouldBe df
110134
}
135+
136+
@Test
137+
fun `literal json field named 'value'`() {
138+
val json = """
139+
{
140+
"data": {
141+
"source": {
142+
"value": "123"
143+
}
144+
}
145+
}
146+
""".trimIndent()
147+
val df = DataFrame.readJsonStr(json)
148+
df[0].getColumnGroup("data").getColumnGroup("source")["value"] shouldBe "123"
149+
}
150+
151+
@Test
152+
fun `array json field named 'value'`() {
153+
val json = """{ "value": ["123"] }"""
154+
155+
val df = DataFrame.readJsonStr(json)
156+
df[0]["value"] shouldBe listOf("123")
157+
}
158+
159+
@Test
160+
fun `record json field named 'value'`() {
161+
val json = """{ "value": { "test" : "123" } }"""
162+
163+
val df = DataFrame.readJsonStr(json)
164+
df[0].getColumnGroup("value")["test"] shouldBe "123"
165+
}
166+
167+
@Test
168+
fun `json field named 'array'`() {
169+
val json = """
170+
{
171+
"data": {
172+
"source": {
173+
"array": "123"
174+
}
175+
}
176+
}
177+
""".trimIndent()
178+
179+
val df = DataFrame.readJsonStr(json)
180+
df[0].getColumnGroup("data").getColumnGroup("source")["array"] shouldBe "123"
181+
}
182+
183+
@Test
184+
fun `array json field named 'array'`() {
185+
val json = """
186+
[{
187+
"a": {
188+
"value": "text",
189+
"array": []
190+
}
191+
}, {
192+
"a": {
193+
"b": 2,
194+
"array": []
195+
}
196+
}, {
197+
"a": {
198+
"array": [6, 7, 8]
199+
}
200+
}]
201+
""".trimIndent()
202+
203+
val df = DataFrame.readJsonStr(json)
204+
val group = df.getColumnGroup("a")
205+
group["array"].type() shouldBe typeOf<List<Int>>()
206+
group["value"].type() shouldBe typeOf<String?>()
207+
group["b"].type() shouldBe typeOf<Int?>()
208+
}
111209
}

0 commit comments

Comments
 (0)