Skip to content

Commit 96e209f

Browse files
committed
korro docs
1 parent 0895584 commit 96e209f

File tree

5 files changed

+195
-14
lines changed
  • core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api
  • dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io
  • docs/StardustDocs/topics
  • tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api

5 files changed

+195
-14
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ public fun <T : Any> DataColumn<T?>.convertToLocalDate(): DataColumn<LocalDate?>
119119
public fun <T : Any> DataColumn<T>.convertToLocalTime(): DataColumn<LocalTime> = convertTo()
120120
public fun <T : Any> DataColumn<T?>.convertToLocalTime(): DataColumn<LocalTime?> = convertTo()
121121

122+
@JvmName("convertToByteFromT")
123+
public fun <T : Any> DataColumn<T>.convertToByte(): DataColumn<Byte> = convertTo()
124+
public fun <T : Any> DataColumn<T?>.convertToByte(): DataColumn<Byte?> = convertTo()
125+
126+
@JvmName("convertToShortFromT")
127+
public fun <T : Any> DataColumn<T>.convertToShort(): DataColumn<Short> = convertTo()
128+
public fun <T : Any> DataColumn<T?>.convertToShort(): DataColumn<Short?> = convertTo()
129+
122130
@JvmName("convertToIntFromT")
123131
public fun <T : Any> DataColumn<T>.convertToInt(): DataColumn<Int> = convertTo()
124132
public fun <T : Any> DataColumn<T?>.convertToInt(): DataColumn<Int?> = convertTo()

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriter.kt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,20 @@ import org.apache.arrow.vector.util.Text
4040
import org.jetbrains.kotlinx.dataframe.AnyCol
4141
import org.jetbrains.kotlinx.dataframe.AnyFrame
4242
import org.jetbrains.kotlinx.dataframe.DataFrame
43-
import org.jetbrains.kotlinx.dataframe.api.convertTo
4443
import org.jetbrains.kotlinx.dataframe.api.convertToBigDecimal
4544
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
45+
import org.jetbrains.kotlinx.dataframe.api.convertToByte
4646
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
4747
import org.jetbrains.kotlinx.dataframe.api.convertToFloat
4848
import org.jetbrains.kotlinx.dataframe.api.convertToInt
4949
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate
5050
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDateTime
5151
import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime
5252
import org.jetbrains.kotlinx.dataframe.api.convertToLong
53+
import org.jetbrains.kotlinx.dataframe.api.convertToShort
5354
import org.jetbrains.kotlinx.dataframe.api.convertToString
5455
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
56+
import org.jetbrains.kotlinx.dataframe.api.map
5557
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
5658
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
5759
import org.jetbrains.kotlinx.dataframe.typeClass
@@ -182,14 +184,14 @@ public class ArrowWriter(
182184
private fun convertColumnToTarget(column: AnyCol?, targetFieldType: ArrowType): AnyCol? {
183185
if (column == null) return null
184186
return when (targetFieldType) {
185-
ArrowType.Utf8() -> column.convertToString()
186-
ArrowType.LargeUtf8() -> column.convertToString()
187+
ArrowType.Utf8() -> column.map { it.toString() }
188+
ArrowType.LargeUtf8() -> column.map { it.toString() }
187189
ArrowType.Binary(), ArrowType.LargeBinary() -> TODO("Saving var binary is currently not implemented")
188190
ArrowType.Bool() -> column.convertToBoolean()
189-
ArrowType.Int(8, true) -> column.convertTo<Byte>()
190-
ArrowType.Int(16, true) -> column.convertTo<Short>()
191-
ArrowType.Int(32, true) -> column.convertTo<Int>()
192-
ArrowType.Int(64, true) -> column.convertTo<Long>()
191+
ArrowType.Int(8, true) -> column.convertToByte()
192+
ArrowType.Int(16, true) -> column.convertToShort()
193+
ArrowType.Int(32, true) -> column.convertToInt()
194+
ArrowType.Int(64, true) -> column.convertToLong()
193195
// ArrowType.Int(8, false), ArrowType.Int(16, false), ArrowType.Int(32, false), ArrowType.Int(64, false) -> todo
194196
is ArrowType.Decimal -> column.convertToBigDecimal()
195197
ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) -> column.convertToFloat()

docs/StardustDocs/topics/read.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,16 +399,13 @@ implementation("org.jetbrains.kotlinx:dataframe-arrow:$dataframe_version")
399399
Make sure to follow [Apache Arrow Java compatibility](https://arrow.apache.org/docs/java/install.html#java-compatibility) guide when using Java 9+
400400
</warning>
401401

402-
Dataframe supports reading
403-
from [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
402+
Dataframe supports reading [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
404403
and [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
405-
404+
from raw Channel (ReadableByteChannel for streaming and SeekableByteChannel for random access), InputStream, File or ByteArray.
406405
<!---FUN readArrowFeather-->
407406

408407
```kotlin
409408
val df = DataFrame.readArrowFeather(file)
410409
```
411410

412411
<!---END-->
413-
414-

docs/StardustDocs/topics/write.md

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[//]: # (title: Write)
22
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Write-->
33

4-
`DataFrame` can be saved into CSV, TSV, JSON and XLS, XLSX formats.
4+
`DataFrame` can be saved into CSV, TSV, JSON, XLS, XLSX and Apache Arrow formats.
55

66
### Writing to CSV
77

@@ -49,7 +49,7 @@ val jsonStr = df.toJson(prettyPrint = true)
4949

5050
<!---END-->
5151

52-
### Write to excel spreadsheet
52+
### Write to Excel spreadsheet
5353

5454
Add dependency:
5555

@@ -116,3 +116,84 @@ wb.close()
116116
```
117117

118118
<!---END-->
119+
120+
### Writing to Apache Arrow formats
121+
122+
Add dependency:
123+
124+
```kotlin
125+
implementation("org.jetbrains.kotlinx:dataframe-arrow:$dataframe_version")
126+
```
127+
128+
<warning>
129+
Make sure to follow [Apache Arrow Java compatibility](https://arrow.apache.org/docs/java/install.html#java-compatibility) guide when using Java 9+
130+
</warning>
131+
132+
Dataframe supports writing [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
133+
and [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
134+
to raw WritableByteChannel, OutputStream, File or ByteArray.
135+
136+
Data may be saved "as is" or converted to match some target [Schema](https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html)
137+
if you have it.
138+
139+
The first is quite easy:
140+
<!---FUN writeArrowFile-->
141+
142+
```kotlin
143+
df.writeArrowIPC(file)
144+
// or
145+
df.writeArrowFeather(file)
146+
```
147+
148+
<!---END-->
149+
(writing to file, opened stream or channel),
150+
<!---FUN writeArrowByteArray-->
151+
152+
```kotlin
153+
val ipcByteArray: ByteArray = df.saveArrowIPCToByteArray()
154+
// or
155+
val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
156+
```
157+
158+
<!---END-->
159+
(creating a byte array). Nested frames and columns with mixed or unsupported types will be saved as String.
160+
161+
The second is a bit trickier. You have to specify the schema itself and the casting behavior mode as `ArrowWriter` parameters.
162+
Behavior `Mode` has four independent switchers: `restrictWidening`, `restrictNarrowing`, `strictType`, `strictNullable`.
163+
You can use `Mode.STRICT` (this is default), `Mode.LOYAL` or any combination you want.
164+
The `ArrowWriter` object should be closed after use because Arrow uses random access buffers not managed by the Java GC.
165+
Finally, you can specify a callback to be invoked if some data is lost or cannot be saved according to your schema.
166+
167+
Here is a full example:
168+
<!---FUN writeArrowPerSchema-->
169+
170+
```kotlin
171+
// Get schema from anywhere you want. It can be deserialized from JSON, generated from another dataset
172+
// (including DataFrame.columns().toArrowSchema() method), created manually and so on.
173+
val schema = Schema.fromJSON(schemaJson)
174+
175+
df.arrowWriter(
176+
// Specify your schema
177+
schema,
178+
// Specify desired behavior mode
179+
ArrowWriter.Companion.Mode(
180+
restrictWidening = true,
181+
restrictNarrowing = true,
182+
strictType = true,
183+
strictNullable = false
184+
),
185+
// Specify warning subscriber
186+
writeWarningMessage
187+
).use { writer ->
188+
// Save to any format and sink, like in previous example
189+
writer.writeArrowFeather(file)
190+
}
191+
```
192+
193+
<!---END-->
194+
On execution, you should get two warnings:
195+
>Column "city" contains nulls but expected not nullable
196+
197+
and
198+
199+
>Column "isHappy" is not described in target schema and was ignored

tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Write.kt

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
11
package org.jetbrains.kotlinx.dataframe.samples.api
22

33
import io.kotest.matchers.string.shouldStartWith
4+
import org.apache.arrow.vector.types.pojo.Schema
45
import org.apache.commons.csv.CSVFormat
56
import org.apache.poi.ss.usermodel.Sheet
67
import org.apache.poi.ss.usermodel.WorkbookFactory
78
import org.jetbrains.kotlinx.dataframe.api.filter
89
import org.jetbrains.kotlinx.dataframe.api.remove
10+
import org.jetbrains.kotlinx.dataframe.io.ArrowWriter
11+
import org.jetbrains.kotlinx.dataframe.io.arrowWriter
12+
import org.jetbrains.kotlinx.dataframe.io.saveArrowFeatherToByteArray
13+
import org.jetbrains.kotlinx.dataframe.io.saveArrowIPCToByteArray
914
import org.jetbrains.kotlinx.dataframe.io.toCsv
1015
import org.jetbrains.kotlinx.dataframe.io.toJson
16+
import org.jetbrains.kotlinx.dataframe.io.writeArrowFeather
17+
import org.jetbrains.kotlinx.dataframe.io.writeArrowIPC
1118
import org.jetbrains.kotlinx.dataframe.io.writeCSV
1219
import org.jetbrains.kotlinx.dataframe.io.writeExcel
1320
import org.jetbrains.kotlinx.dataframe.io.writeJson
21+
import org.jetbrains.kotlinx.dataframe.io.writeWarningMessage
1422
import org.junit.Test
1523
import java.io.File
1624
import kotlin.io.path.deleteExisting
@@ -121,6 +129,91 @@ class Write : TestBase() {
121129
}
122130
}
123131

132+
@Test
133+
fun writeArrowFile() {
134+
useTempFile { file ->
135+
// SampleStart
136+
df.writeArrowIPC(file)
137+
// or
138+
df.writeArrowFeather(file)
139+
// SampleEnd
140+
}
141+
}
142+
143+
@Test
144+
fun writeArrowByteArray() {
145+
// SampleStart
146+
val ipcByteArray: ByteArray = df.saveArrowIPCToByteArray()
147+
// or
148+
val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
149+
// SampleEnd
150+
}
151+
152+
@Test
153+
fun writeArrowPerSchema() {
154+
useTempFile { file ->
155+
val schemaJson =
156+
"""{
157+
"fields" : [ {
158+
"name" : "name",
159+
"nullable" : true,
160+
"type" : {
161+
"name" : "utf8"
162+
},
163+
"children" : [ ]
164+
}, {
165+
"name" : "age",
166+
"nullable" : false,
167+
"type" : {
168+
"name" : "int",
169+
"bitWidth" : 32,
170+
"isSigned" : true
171+
},
172+
"children" : [ ]
173+
}, {
174+
"name" : "city",
175+
"nullable" : false,
176+
"type" : {
177+
"name" : "utf8"
178+
},
179+
"children" : [ ]
180+
}, {
181+
"name" : "weight",
182+
"nullable" : true,
183+
"type" : {
184+
"name" : "floatingpoint",
185+
"precision" : "DOUBLE"
186+
},
187+
"children" : [ ]
188+
} ]
189+
}
190+
"""
191+
192+
// SampleStart
193+
// Get schema from anywhere you want. It can be deserialized from JSON, generated from another dataset
194+
// (including DataFrame.columns().toArrowSchema() method), created manually and so on.
195+
val schema = Schema.fromJSON(schemaJson)
196+
197+
df.arrowWriter(
198+
// Specify your schema
199+
schema,
200+
// Specify desired behavior mode
201+
ArrowWriter.Companion.Mode(
202+
restrictWidening = true,
203+
restrictNarrowing = true,
204+
strictType = true,
205+
strictNullable = false
206+
),
207+
// Specify warning subscriber
208+
writeWarningMessage
209+
).use { writer ->
210+
// Save to any format and sink, like in previous example
211+
writer.writeArrowFeather(file)
212+
}
213+
// SampleEnd
214+
}
215+
}
216+
124217
companion object {
125218
private fun String.rejoinWithSystemLineSeparator() = rejoinWithLineSeparator(System.lineSeparator())
126219

0 commit comments

Comments
 (0)