Skip to content

Commit a6c51d8

Browse files
committed
testWritingBySchema
1 parent ec8ee5c commit a6c51d8

File tree

3 files changed

+161
-64
lines changed

3 files changed

+161
-64
lines changed

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriter.kt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,14 @@ import java.time.LocalTime
6868
import kotlin.reflect.full.isSubtypeOf
6969
import kotlin.reflect.typeOf
7070

71+
private val ignoreWarningMessage: (String) -> Unit = { message: String -> }
7172
private val writeWarningMessage: (String) -> Unit = {message: String -> System.err.println(message)}
7273

7374
/**
7475
* Create Arrow [Schema] matching [this] actual data.
7576
* Columns with unsupported types will be interpreted as String.
7677
*/
77-
public fun List<AnyCol>.toArrowSchema(warningSubscriber: (String) -> Unit = writeWarningMessage): Schema {
78+
public fun List<AnyCol>.toArrowSchema(warningSubscriber: (String) -> Unit = ignoreWarningMessage): Schema {
7879
val fields = this.map { column ->
7980
val columnType = column.type()
8081
val nullable = columnType.isMarkedNullable
@@ -121,7 +122,7 @@ public fun DataFrame<*>.arrowWriter(): ArrowWriter = this.arrowWriter(this.colum
121122
public fun DataFrame<*>.arrowWriter(
122123
targetSchema: Schema,
123124
mode: ArrowWriter.Companion.Mode = ArrowWriter.Companion.Mode.STRICT,
124-
warningSubscriber: (String) -> Unit = writeWarningMessage
125+
warningSubscriber: (String) -> Unit = ignoreWarningMessage
125126
): ArrowWriter = ArrowWriter(this, targetSchema, mode, warningSubscriber)
126127

127128
/**
@@ -132,7 +133,7 @@ public class ArrowWriter(
132133
private val dataFrame: DataFrame<*>,
133134
private val targetSchema: Schema,
134135
private val mode: Mode,
135-
private val warningSubscriber: (String) -> Unit = writeWarningMessage
136+
private val warningSubscriber: (String) -> Unit = ignoreWarningMessage
136137
): AutoCloseable {
137138

138139
public companion object {

dataframe-arrow/src/test/kotlin/ArrowKtTest.kt

Lines changed: 24 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
import io.kotest.assertions.throwables.shouldThrow
22
import io.kotest.matchers.shouldBe
3+
import org.apache.arrow.vector.types.pojo.Schema
34
import org.apache.arrow.vector.util.Text
45
import org.jetbrains.kotlinx.dataframe.DataColumn
56
import org.jetbrains.kotlinx.dataframe.DataFrame
67
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
78
import org.jetbrains.kotlinx.dataframe.api.columnOf
89
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
910
import org.jetbrains.kotlinx.dataframe.api.toColumn
10-
import org.jetbrains.kotlinx.dataframe.io.arrowWriter
11-
import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
12-
import org.jetbrains.kotlinx.dataframe.io.readArrowIPC
11+
import org.jetbrains.kotlinx.dataframe.io.*
1312
import org.junit.Test
1413
import java.io.File
1514
import java.net.URL
1615
import java.time.LocalDate
16+
import java.time.LocalDateTime
1717
import kotlin.reflect.typeOf
1818

1919
internal class ArrowKtTest {
@@ -93,75 +93,38 @@ internal class ArrowKtTest {
9393
assertEstimations(DataFrame.readArrowIPC(testArrowIPC("test-illegal.arrow"), NullabilityOptions.Widening), true, true)
9494
}
9595

96-
val cities = dataFrameOf(
97-
DataColumn.createValueColumn("name", listOf(
98-
"Berlin",
99-
"Hamburg",
100-
"New York",
101-
"Washington",
102-
"Saint Petersburg",
103-
"Vatican"
104-
)),
105-
DataColumn.createValueColumn("affiliation", listOf(
106-
"Germany",
107-
"Germany",
108-
"The USA",
109-
"The USA",
110-
"Russia",
111-
null
112-
)),
113-
DataColumn.createValueColumn("is_capital", listOf(
114-
true,
115-
false,
116-
false,
117-
true,
118-
false,
119-
null
120-
)),
121-
DataColumn.createValueColumn("population", listOf(
122-
3_769_495,
123-
1_845_229,
124-
8_467_513,
125-
689_545,
126-
5_377_503,
127-
825
128-
)),
129-
DataColumn.createValueColumn("area", listOf(
130-
891.7,
131-
755.22,
132-
1223.59,
133-
177.0,
134-
1439.0,
135-
0.44
136-
)),
137-
DataColumn.createValueColumn("settled", listOf(
138-
LocalDate.of(1237, 1, 1),
139-
LocalDate.of(1189, 5, 7),
140-
LocalDate.of(1624, 1, 1),
141-
LocalDate.of(1790, 7, 16),
142-
LocalDate.of(1703, 5, 27),
143-
LocalDate.of(1929, 2, 11)
144-
))
145-
)
14696

14797
@Test
14898
fun testWritingGeneral() {
14999
fun assertEstimation(citiesDeserialized: DataFrame<*>) {
150-
citiesDeserialized["name"] shouldBe cities["name"]
151-
citiesDeserialized["affiliation"] shouldBe cities["affiliation"]
152-
citiesDeserialized["is_capital"] shouldBe cities["is_capital"]
153-
citiesDeserialized["population"] shouldBe cities["population"]
154-
citiesDeserialized["area"] shouldBe cities["area"]
100+
citiesDeserialized["name"] shouldBe citiesExampleFrame["name"]
101+
citiesDeserialized["affiliation"] shouldBe citiesExampleFrame["affiliation"]
102+
citiesDeserialized["is_capital"] shouldBe citiesExampleFrame["is_capital"]
103+
citiesDeserialized["population"] shouldBe citiesExampleFrame["population"]
104+
citiesDeserialized["area"] shouldBe citiesExampleFrame["area"]
155105
citiesDeserialized["settled"].type() shouldBe typeOf<LocalDate>() // citiesExampleFrame["settled"].type() refers to FlexibleTypeImpl(LocalDate..LocalDate?) and does not match typeOf<LocalDate>()
156-
citiesDeserialized["settled"].values() shouldBe cities["settled"].values()
106+
citiesDeserialized["settled"].values() shouldBe citiesExampleFrame["settled"].values()
107+
citiesDeserialized["page_in_wiki"].type() shouldBe typeOf<String>() // citiesExampleFrame["page_in_wiki"].type() is URL, which is not supported by Arrow directly
108+
citiesDeserialized["page_in_wiki"].values() shouldBe citiesExampleFrame["page_in_wiki"].values().map { it.toString() }
157109
}
158110

159111
val testFile = File.createTempFile("cities", "arrow")
160-
cities.arrowWriter().writeArrowFeather(testFile)
112+
citiesExampleFrame.writeArrowFeather(testFile)
161113
assertEstimation(DataFrame.readArrowFeather(testFile))
162114

163-
val testByteArray = cities.arrowWriter().saveArrowIPCToByteArray()
115+
val testByteArray = citiesExampleFrame.arrowWriter().saveArrowIPCToByteArray()
164116
assertEstimation(DataFrame.readArrowIPC(testByteArray))
165117
}
166118

119+
@Test
120+
fun testWritingBySchema() {
121+
val testFile = File.createTempFile("cities", "arrow")
122+
citiesExampleFrame.arrowWriter(Schema.fromJSON(citiesExampleSchema)).writeArrowFeather(testFile)
123+
val citiesDeserialized = DataFrame.readArrowFeather(testFile, NullabilityOptions.Checking)
124+
citiesDeserialized["population"].type() shouldBe typeOf<Long?>()
125+
citiesDeserialized["area"].type() shouldBe typeOf<Float>()
126+
citiesDeserialized["settled"].type() shouldBe typeOf<LocalDateTime>()
127+
shouldThrow<IllegalArgumentException> { citiesDeserialized["page_in_wiki"] shouldBe null }
128+
citiesDeserialized["film_in_youtube"] shouldBe DataColumn.createValueColumn("film_in_youtube", arrayOfNulls<String>(citiesExampleFrame.rowsCount()).asList())
129+
}
167130
}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import org.jetbrains.kotlinx.dataframe.DataColumn
2+
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
3+
import java.net.URL
4+
import java.time.LocalDate
5+
6+
/**
7+
* Example data frame to be saved in Apache Arrow format.
8+
*/
9+
val citiesExampleFrame = dataFrameOf(
10+
DataColumn.createValueColumn("name", listOf(
11+
"Berlin",
12+
"Hamburg",
13+
"New York",
14+
"Washington",
15+
"Saint Petersburg",
16+
"Vatican"
17+
)),
18+
DataColumn.createValueColumn("affiliation", listOf(
19+
"Germany",
20+
"Germany",
21+
"The USA",
22+
"The USA",
23+
"Russia",
24+
null
25+
)),
26+
DataColumn.createValueColumn("is_capital", listOf(
27+
true,
28+
false,
29+
false,
30+
true,
31+
false,
32+
null
33+
)),
34+
DataColumn.createValueColumn("population", listOf(
35+
3_769_495,
36+
1_845_229,
37+
8_467_513,
38+
689_545,
39+
5_377_503,
40+
825
41+
)),
42+
DataColumn.createValueColumn("area", listOf(
43+
891.7,
44+
755.22,
45+
1223.59,
46+
177.0,
47+
1439.0,
48+
0.44
49+
)),
50+
DataColumn.createValueColumn("settled", listOf(
51+
LocalDate.of(1237, 1, 1),
52+
LocalDate.of(1189, 5, 7),
53+
LocalDate.of(1624, 1, 1),
54+
LocalDate.of(1790, 7, 16),
55+
LocalDate.of(1703, 5, 27),
56+
LocalDate.of(1929, 2, 11)
57+
)),
58+
DataColumn.createValueColumn("page_in_wiki", listOf(
59+
URL("https://en.wikipedia.org/wiki/Berlin"),
60+
URL("https://en.wikipedia.org/wiki/Hamburg"),
61+
URL("https://en.wikipedia.org/wiki/New_York_City"),
62+
URL("https://en.wikipedia.org/wiki/Washington,_D.C."),
63+
URL("https://en.wikipedia.org/wiki/Saint_Petersburg"),
64+
URL("https://en.wikipedia.org/wiki/Vatican_City")
65+
))
66+
)
67+
68+
/**
69+
* Apache Arrow schema of [citiesExampleFrame], with some changes.
70+
* Originally generated by `citiesExampleFrame.columns().toArrowSchema().toJson()`
71+
* Changes made to test type conversion and schema matching:
72+
* field "population" changed to nullable Long;
73+
* field "area" changed to single-precision Float;
74+
* field "settled" changed to datetime (date with millisecond precision);
75+
* field "page_in_wiki" removed, nullable field "film_in_youtube" added.
76+
*/
77+
val citiesExampleSchema = """{
78+
"fields" : [ {
79+
"name" : "name",
80+
"nullable" : false,
81+
"type" : {
82+
"name" : "utf8"
83+
},
84+
"children" : [ ]
85+
}, {
86+
"name" : "affiliation",
87+
"nullable" : true,
88+
"type" : {
89+
"name" : "utf8"
90+
},
91+
"children" : [ ]
92+
}, {
93+
"name" : "is_capital",
94+
"nullable" : true,
95+
"type" : {
96+
"name" : "bool"
97+
},
98+
"children" : [ ]
99+
}, {
100+
"name" : "population",
101+
"nullable" : true,
102+
"type" : {
103+
"name" : "int",
104+
"bitWidth" : 64,
105+
"isSigned" : true
106+
},
107+
"children" : [ ]
108+
}, {
109+
"name" : "area",
110+
"nullable" : false,
111+
"type" : {
112+
"name" : "floatingpoint",
113+
"precision" : "SINGLE"
114+
},
115+
"children" : [ ]
116+
}, {
117+
"name" : "settled",
118+
"nullable" : false,
119+
"type" : {
120+
"name" : "date",
121+
"unit" : "MILLISECOND"
122+
},
123+
"children" : [ ]
124+
}, {
125+
"name" : "film_in_youtube",
126+
"nullable" : true,
127+
"type" : {
128+
"name" : "utf8"
129+
},
130+
"children" : [ ]
131+
} ]
132+
}
133+
""".trimIndent()

0 commit comments

Comments
 (0)