Skip to content

Commit 96e209f

Browse files
committed
korro docs
1 parent 0895584 commit 96e209f

File tree

5 files changed

+195
-14
lines changed
  • core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api
  • dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io
  • docs/StardustDocs/topics
  • tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api

5 files changed

+195
-14
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ public fun <T : Any> DataColumn<T?>.convertToLocalDate(): DataColumn<LocalDate?>
119119
public fun <T : Any> DataColumn<T>.convertToLocalTime(): DataColumn<LocalTime> = convertTo()
120120
public fun <T : Any> DataColumn<T?>.convertToLocalTime(): DataColumn<LocalTime?> = convertTo()
121121

122+
@JvmName("convertToByteFromT")
123+
public fun <T : Any> DataColumn<T>.convertToByte(): DataColumn<Byte> = convertTo()
124+
public fun <T : Any> DataColumn<T?>.convertToByte(): DataColumn<Byte?> = convertTo()
125+
126+
@JvmName("convertToShortFromT")
127+
public fun <T : Any> DataColumn<T>.convertToShort(): DataColumn<Short> = convertTo()
128+
public fun <T : Any> DataColumn<T?>.convertToShort(): DataColumn<Short?> = convertTo()
129+
122130
@JvmName("convertToIntFromT")
123131
public fun <T : Any> DataColumn<T>.convertToInt(): DataColumn<Int> = convertTo()
124132
public fun <T : Any> DataColumn<T?>.convertToInt(): DataColumn<Int?> = convertTo()

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriter.kt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,20 @@ import org.apache.arrow.vector.util.Text
4040
import org.jetbrains.kotlinx.dataframe.AnyCol
4141
import org.jetbrains.kotlinx.dataframe.AnyFrame
4242
import org.jetbrains.kotlinx.dataframe.DataFrame
43-
import org.jetbrains.kotlinx.dataframe.api.convertTo
4443
import org.jetbrains.kotlinx.dataframe.api.convertToBigDecimal
4544
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
45+
import org.jetbrains.kotlinx.dataframe.api.convertToByte
4646
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
4747
import org.jetbrains.kotlinx.dataframe.api.convertToFloat
4848
import org.jetbrains.kotlinx.dataframe.api.convertToInt
4949
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDate
5050
import org.jetbrains.kotlinx.dataframe.api.convertToLocalDateTime
5151
import org.jetbrains.kotlinx.dataframe.api.convertToLocalTime
5252
import org.jetbrains.kotlinx.dataframe.api.convertToLong
53+
import org.jetbrains.kotlinx.dataframe.api.convertToShort
5354
import org.jetbrains.kotlinx.dataframe.api.convertToString
5455
import org.jetbrains.kotlinx.dataframe.api.forEachIndexed
56+
import org.jetbrains.kotlinx.dataframe.api.map
5557
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
5658
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
5759
import org.jetbrains.kotlinx.dataframe.typeClass
@@ -182,14 +184,14 @@ public class ArrowWriter(
182184
private fun convertColumnToTarget(column: AnyCol?, targetFieldType: ArrowType): AnyCol? {
183185
if (column == null) return null
184186
return when (targetFieldType) {
185-
ArrowType.Utf8() -> column.convertToString()
186-
ArrowType.LargeUtf8() -> column.convertToString()
187+
ArrowType.Utf8() -> column.map { it.toString() }
188+
ArrowType.LargeUtf8() -> column.map { it.toString() }
187189
ArrowType.Binary(), ArrowType.LargeBinary() -> TODO("Saving var binary is currently not implemented")
188190
ArrowType.Bool() -> column.convertToBoolean()
189-
ArrowType.Int(8, true) -> column.convertTo<Byte>()
190-
ArrowType.Int(16, true) -> column.convertTo<Short>()
191-
ArrowType.Int(32, true) -> column.convertTo<Int>()
192-
ArrowType.Int(64, true) -> column.convertTo<Long>()
191+
ArrowType.Int(8, true) -> column.convertToByte()
192+
ArrowType.Int(16, true) -> column.convertToShort()
193+
ArrowType.Int(32, true) -> column.convertToInt()
194+
ArrowType.Int(64, true) -> column.convertToLong()
193195
// ArrowType.Int(8, false), ArrowType.Int(16, false), ArrowType.Int(32, false), ArrowType.Int(64, false) -> todo
194196
is ArrowType.Decimal -> column.convertToBigDecimal()
195197
ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) -> column.convertToFloat()

docs/StardustDocs/topics/read.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,16 +399,13 @@ implementation("org.jetbrains.kotlinx:dataframe-arrow:$dataframe_version")
399399
Make sure to follow [Apache Arrow Java compatibility](https://arrow.apache.org/docs/java/install.html#java-compatibility) guide when using Java 9+
400400
</warning>
401401

402-
Dataframe supports reading
403-
from [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
402+
Dataframe supports reading [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
404403
and [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
405-
404+
from raw Channel (ReadableByteChannel for streaming and SeekableByteChannel for random access), InputStream, File or ByteArray.
406405
<!---FUN readArrowFeather-->
407406

408407
```kotlin
409408
val df = DataFrame.readArrowFeather(file)
410409
```
411410

412411
<!---END-->
413-
414-

docs/StardustDocs/topics/write.md

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[//]: # (title: Write)
22
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Write-->
33

4-
`DataFrame` can be saved into CSV, TSV, JSON and XLS, XLSX formats.
4+
`DataFrame` can be saved into CSV, TSV, JSON, XLS, XLSX and Apache Arrow formats.
55

66
### Writing to CSV
77

@@ -49,7 +49,7 @@ val jsonStr = df.toJson(prettyPrint = true)
4949

5050
<!---END-->
5151

52-
### Write to excel spreadsheet
52+
### Write to Excel spreadsheet
5353

5454
Add dependency:
5555

@@ -116,3 +116,84 @@ wb.close()
116116
```
117117

118118
<!---END-->
119+
120+
### Writing to Apache Arrow formats
121+
122+
Add dependency:
123+
124+
```kotlin
125+
implementation("org.jetbrains.kotlinx:dataframe-arrow:$dataframe_version")
126+
```
127+
128+
<warning>
129+
Make sure to follow [Apache Arrow Java compatibility](https://arrow.apache.org/docs/java/install.html#java-compatibility) guide when using Java 9+
130+
</warning>
131+
132+
Dataframe supports writing [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format)
133+
and [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files)
134+
to raw WritableByteChannel, OutputStream, File or ByteArray.
135+
136+
Data may be saved "as is" or converted to match some target [Schema](https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html)
137+
if you have it.
138+
139+
The first is quite easy:
140+
<!---FUN writeArrowFile-->
141+
142+
```kotlin
143+
df.writeArrowIPC(file)
144+
// or
145+
df.writeArrowFeather(file)
146+
```
147+
148+
<!---END-->
149+
(writing to file, opened stream or channel),
150+
<!---FUN writeArrowByteArray-->
151+
152+
```kotlin
153+
val ipcByteArray: ByteArray = df.saveArrowIPCToByteArray()
154+
// or
155+
val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
156+
```
157+
158+
<!---END-->
159+
(creating a byte array). Nested frames and columns with mixed or unsupported types will be saved as String.
160+
161+
The second is a bit trickier. You have to specify the schema itself and the casting behavior mode as `ArrowWriter` parameters.
162+
Behavior `Mode` has four independent switchers: `restrictWidening`, `restrictNarrowing`, `strictType`, `strictNullable`.
163+
You can use `Mode.STRICT` (this is default), `Mode.LOYAL` or any combination you want.
164+
The `ArrowWriter` object should be closed after use because Arrow uses random access buffers not managed by the Java GC.
165+
Finally, you can specify a callback to be invoked if some data is lost or cannot be saved according to your schema.
166+
167+
Here is a full example:
168+
<!---FUN writeArrowPerSchema-->
169+
170+
```kotlin
171+
// Get schema from anywhere you want. It can be deserialized from JSON, generated from another dataset
172+
// (including DataFrame.columns().toArrowSchema() method), created manually and so on.
173+
val schema = Schema.fromJSON(schemaJson)
174+
175+
df.arrowWriter(
176+
// Specify your schema
177+
schema,
178+
// Specify desired behavior mode
179+
ArrowWriter.Companion.Mode(
180+
restrictWidening = true,
181+
restrictNarrowing = true,
182+
strictType = true,
183+
strictNullable = false
184+
),
185+
// Specify warning subscriber
186+
writeWarningMessage
187+
).use { writer ->
188+
// Save to any format and sink, like in previous example
189+
writer.writeArrowFeather(file)
190+
}
191+
```
192+
193+
<!---END-->
194+
On execution, you should get two warnings:
195+
>Column "city" contains nulls but expected not nullable
196+
197+
and
198+
199+
>Column "isHappy" is not described in target schema and was ignored

tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Write.kt

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
11
package org.jetbrains.kotlinx.dataframe.samples.api
22

33
import io.kotest.matchers.string.shouldStartWith
4+
import org.apache.arrow.vector.types.pojo.Schema
45
import org.apache.commons.csv.CSVFormat
56
import org.apache.poi.ss.usermodel.Sheet
67
import org.apache.poi.ss.usermodel.WorkbookFactory
78
import org.jetbrains.kotlinx.dataframe.api.filter
89
import org.jetbrains.kotlinx.dataframe.api.remove
10+
import org.jetbrains.kotlinx.dataframe.io.ArrowWriter
11+
import org.jetbrains.kotlinx.dataframe.io.arrowWriter
12+
import org.jetbrains.kotlinx.dataframe.io.saveArrowFeatherToByteArray
13+
import org.jetbrains.kotlinx.dataframe.io.saveArrowIPCToByteArray
914
import org.jetbrains.kotlinx.dataframe.io.toCsv
1015
import org.jetbrains.kotlinx.dataframe.io.toJson
16+
import org.jetbrains.kotlinx.dataframe.io.writeArrowFeather
17+
import org.jetbrains.kotlinx.dataframe.io.writeArrowIPC
1118
import org.jetbrains.kotlinx.dataframe.io.writeCSV
1219
import org.jetbrains.kotlinx.dataframe.io.writeExcel
1320
import org.jetbrains.kotlinx.dataframe.io.writeJson
21+
import org.jetbrains.kotlinx.dataframe.io.writeWarningMessage
1422
import org.junit.Test
1523
import java.io.File
1624
import kotlin.io.path.deleteExisting
@@ -121,6 +129,91 @@ class Write : TestBase() {
121129
}
122130
}
123131

132+
@Test
133+
fun writeArrowFile() {
134+
useTempFile { file ->
135+
// SampleStart
136+
df.writeArrowIPC(file)
137+
// or
138+
df.writeArrowFeather(file)
139+
// SampleEnd
140+
}
141+
}
142+
143+
@Test
144+
fun writeArrowByteArray() {
145+
// SampleStart
146+
val ipcByteArray: ByteArray = df.saveArrowIPCToByteArray()
147+
// or
148+
val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
149+
// SampleEnd
150+
}
151+
152+
@Test
153+
fun writeArrowPerSchema() {
154+
useTempFile { file ->
155+
val schemaJson =
156+
"""{
157+
"fields" : [ {
158+
"name" : "name",
159+
"nullable" : true,
160+
"type" : {
161+
"name" : "utf8"
162+
},
163+
"children" : [ ]
164+
}, {
165+
"name" : "age",
166+
"nullable" : false,
167+
"type" : {
168+
"name" : "int",
169+
"bitWidth" : 32,
170+
"isSigned" : true
171+
},
172+
"children" : [ ]
173+
}, {
174+
"name" : "city",
175+
"nullable" : false,
176+
"type" : {
177+
"name" : "utf8"
178+
},
179+
"children" : [ ]
180+
}, {
181+
"name" : "weight",
182+
"nullable" : true,
183+
"type" : {
184+
"name" : "floatingpoint",
185+
"precision" : "DOUBLE"
186+
},
187+
"children" : [ ]
188+
} ]
189+
}
190+
"""
191+
192+
// SampleStart
193+
// Get schema from anywhere you want. It can be deserialized from JSON, generated from another dataset
194+
// (including DataFrame.columns().toArrowSchema() method), created manually and so on.
195+
val schema = Schema.fromJSON(schemaJson)
196+
197+
df.arrowWriter(
198+
// Specify your schema
199+
schema,
200+
// Specify desired behavior mode
201+
ArrowWriter.Companion.Mode(
202+
restrictWidening = true,
203+
restrictNarrowing = true,
204+
strictType = true,
205+
strictNullable = false
206+
),
207+
// Specify warning subscriber
208+
writeWarningMessage
209+
).use { writer ->
210+
// Save to any format and sink, like in previous example
211+
writer.writeArrowFeather(file)
212+
}
213+
// SampleEnd
214+
}
215+
}
216+
124217
companion object {
125218
private fun String.rejoinWithSystemLineSeparator() = rejoinWithLineSeparator(System.lineSeparator())
126219

0 commit comments

Comments
 (0)