Skip to content

Commit 6eea98d

Browse files
committed
Review-requested modifications
1 parent 9ff9de2 commit 6eea98d

File tree

5 files changed

+61
-40
lines changed

5 files changed

+61
-40
lines changed

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriter.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public interface ArrowWriter : AutoCloseable {
5050
* If [strictType] is true, [dataFrame] columns described in [targetSchema] with a non-compatible type will produce an exception (otherwise, they would be saved as is).
5151
* If [strictNullable] is true, [targetSchema] fields that are not nullable and contain nulls in [dataFrame] will produce an exception (otherwise, they would be saved as is with nullable = true).
5252
*/
53-
public class Mode(
53+
public data class Mode(
5454
public val restrictWidening: Boolean,
5555
public val restrictNarrowing: Boolean,
5656
public val strictType: Boolean,

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriterImpl.kt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,15 @@ internal class ArrowWriterImpl(
7171
when (vector) {
7272
is FixedWidthVector -> vector.allocateNew(size)
7373
is VariableWidthVector -> vector.allocateNew(size)
74-
else -> TODO("Not implemented for ${vector.javaClass.canonicalName}")
74+
else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
7575
}
7676
}
7777

7878
private fun infillWithNulls(vector: FieldVector, size: Int) {
7979
when (vector) {
8080
is BaseFixedWidthVector -> for (i in 0 until size) { vector.setNull(i) }
8181
is BaseVariableWidthVector -> for (i in 0 until size) { vector.setNull(i) }
82-
else -> TODO("Not implemented for ${vector.javaClass.canonicalName}")
82+
else -> throw IllegalArgumentException("Can not infill ${vector.javaClass.canonicalName}")
8383
}
8484
vector.valueCount = size
8585
}
@@ -89,7 +89,7 @@ internal class ArrowWriterImpl(
8989
return when (targetFieldType) {
9090
ArrowType.Utf8() -> column.map { it?.toString() }
9191
ArrowType.LargeUtf8() -> column.map { it?.toString() }
92-
ArrowType.Binary(), ArrowType.LargeBinary() -> TODO("Saving var binary is currently not implemented")
92+
ArrowType.Binary(), ArrowType.LargeBinary() -> throw NotImplementedError("Saving var binary is currently not implemented")
9393
ArrowType.Bool() -> column.convertToBoolean()
9494
ArrowType.Int(8, true) -> column.convertToByte()
9595
ArrowType.Int(16, true) -> column.convertToShort()
@@ -105,7 +105,7 @@ internal class ArrowWriterImpl(
105105
// is ArrowType.Duration -> todo
106106
// is ArrowType.Struct -> todo
107107
else -> {
108-
TODO("Saving ${targetFieldType.javaClass.canonicalName} is not implemented")
108+
throw NotImplementedError("Saving ${targetFieldType.javaClass.canonicalName} is currently not implemented")
109109
}
110110
}
111111
}
@@ -140,7 +140,7 @@ internal class ArrowWriterImpl(
140140
is TimeSecVector -> column.convertToLocalTime().forEachIndexed { i, value -> value?.let { vector.set(i, (value.toNanoOfDay() / 1000 / 1000 / 1000).toInt()); value } ?: vector.setNull(i) }
141141
// is StructVector -> todo
142142
else -> {
143-
TODO("Saving to ${vector.javaClass.canonicalName} is not implemented")
143+
throw NotImplementedError("Saving to ${vector.javaClass.canonicalName} is currently not implemented")
144144
}
145145
}
146146

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

33
import org.apache.arrow.memory.RootAllocator
4-
import org.apache.arrow.vector.ipc.ArrowFileReader
5-
import org.apache.arrow.vector.ipc.ArrowStreamReader
64
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel
75
import org.jetbrains.kotlinx.dataframe.AnyFrame
86
import org.jetbrains.kotlinx.dataframe.DataFrame
97
import org.jetbrains.kotlinx.dataframe.api.NullabilityOptions
10-
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
118
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
129
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
1310
import java.io.File
@@ -54,41 +51,15 @@ public fun DataFrame.Companion.readArrowIPC(
5451
channel: ReadableByteChannel,
5552
allocator: RootAllocator = Allocator.ROOT,
5653
nullability: NullabilityOptions = NullabilityOptions.Infer,
57-
): AnyFrame {
58-
ArrowStreamReader(channel, allocator).use { reader ->
59-
val dfs = buildList {
60-
val root = reader.vectorSchemaRoot
61-
val schema = root.schema
62-
while (reader.loadNextBatch()) {
63-
val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
64-
add(df)
65-
}
66-
}
67-
return dfs.concatKeepingSchema()
68-
}
69-
}
70-
54+
): AnyFrame = readArrowIPCImpl(channel, allocator, nullability)
7155
/**
7256
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
7357
*/
7458
public fun DataFrame.Companion.readArrowFeather(
7559
channel: SeekableByteChannel,
7660
allocator: RootAllocator = Allocator.ROOT,
7761
nullability: NullabilityOptions = NullabilityOptions.Infer,
78-
): AnyFrame {
79-
ArrowFileReader(channel, allocator).use { reader ->
80-
val dfs = buildList {
81-
reader.recordBlocks.forEach { block ->
82-
reader.loadRecordBatch(block)
83-
val root = reader.vectorSchemaRoot
84-
val schema = root.schema
85-
val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
86-
add(df)
87-
}
88-
}
89-
return dfs.concatKeepingSchema()
90-
}
91-
}
62+
): AnyFrame = readArrowFeatherImpl(channel, allocator, nullability)
9263

9364
// IPC reading block
9465

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReadingImpl.kt

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import org.apache.arrow.memory.RootAllocator
34
import org.apache.arrow.vector.BigIntVector
45
import org.apache.arrow.vector.BitVector
56
import org.apache.arrow.vector.DateDayVector
@@ -26,9 +27,12 @@ import org.apache.arrow.vector.VarBinaryVector
2627
import org.apache.arrow.vector.VarCharVector
2728
import org.apache.arrow.vector.VectorSchemaRoot
2829
import org.apache.arrow.vector.complex.StructVector
30+
import org.apache.arrow.vector.ipc.ArrowFileReader
31+
import org.apache.arrow.vector.ipc.ArrowStreamReader
2932
import org.apache.arrow.vector.types.pojo.Field
3033
import org.apache.arrow.vector.util.DateUtility
3134
import org.jetbrains.kotlinx.dataframe.AnyBaseCol
35+
import org.jetbrains.kotlinx.dataframe.AnyFrame
3236
import org.jetbrains.kotlinx.dataframe.DataColumn
3337
import org.jetbrains.kotlinx.dataframe.DataFrame
3438
import org.jetbrains.kotlinx.dataframe.api.Infer
@@ -39,9 +43,12 @@ import org.jetbrains.kotlinx.dataframe.api.cast
3943
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
4044
import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame
4145
import org.jetbrains.kotlinx.dataframe.api.getColumn
46+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
4247
import org.jetbrains.kotlinx.dataframe.impl.asList
4348
import java.math.BigDecimal
4449
import java.math.BigInteger
50+
import java.nio.channels.ReadableByteChannel
51+
import java.nio.channels.SeekableByteChannel
4552
import java.time.Duration
4653
import java.time.LocalDate
4754
import java.time.LocalDateTime
@@ -197,11 +204,54 @@ internal fun readField(root: VectorSchemaRoot, field: Field, nullability: Nullab
197204
is TimeSecVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
198205
is StructVector -> vector.values(range).withTypeNullable(field.isNullable, nullability)
199206
else -> {
200-
TODO("not fully implemented")
207+
throw NotImplementedError("reading from ${vector.javaClass.canonicalName} is not implemented")
201208
}
202209
}
203210
return DataColumn.createValueColumn(field.name, list, type, Infer.None)
204211
} catch (unexpectedNull: NullabilityException) {
205212
throw IllegalArgumentException("Column `${field.name}` should be not nullable but has nulls")
206213
}
207214
}
215+
216+
/**
217+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from existing [channel]
218+
*/
219+
public fun DataFrame.Companion.readArrowIPCImpl(
220+
channel: ReadableByteChannel,
221+
allocator: RootAllocator = Allocator.ROOT,
222+
nullability: NullabilityOptions = NullabilityOptions.Infer,
223+
): AnyFrame {
224+
ArrowStreamReader(channel, allocator).use { reader ->
225+
val dfs = buildList {
226+
val root = reader.vectorSchemaRoot
227+
val schema = root.schema
228+
while (reader.loadNextBatch()) {
229+
val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
230+
add(df)
231+
}
232+
}
233+
return dfs.concatKeepingSchema()
234+
}
235+
}
236+
237+
/**
238+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [channel]
239+
*/
240+
public fun DataFrame.Companion.readArrowFeatherImpl(
241+
channel: SeekableByteChannel,
242+
allocator: RootAllocator = Allocator.ROOT,
243+
nullability: NullabilityOptions = NullabilityOptions.Infer,
244+
): AnyFrame {
245+
ArrowFileReader(channel, allocator).use { reader ->
246+
val dfs = buildList {
247+
reader.recordBlocks.forEach { block ->
248+
reader.loadRecordBatch(block)
249+
val root = reader.vectorSchemaRoot
250+
val schema = root.schema
251+
val df = schema.fields.map { f -> readField(root, f, nullability) }.toDataFrame()
252+
add(df)
253+
}
254+
}
255+
return dfs.concatKeepingSchema()
256+
}
257+
}

docs/StardustDocs/topics/write.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[//]: # (title: Write)
22
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Write-->
33

4-
`DataFrame` can be saved into CSV, TSV, JSON, XLS and XLSX, Apache Arrow formats.
4+
`DataFrame` instances can be saved in the following formats: CSV, TSV, JSON, XLS(X) and Apache Arrow.
55

66
### Writing to CSV
77

@@ -161,7 +161,7 @@ val featherByteArray: ByteArray = df.saveArrowFeatherToByteArray()
161161
The second way is a bit more tricky. You have to specify the schema itself and the casting behavior mode as `ArrowWriter` parameters.
162162
Behavior `Mode` has four independent switchers: `restrictWidening`, `restrictNarrowing`, `strictType`, `strictNullable`.
163163
You can use `Mode.STRICT` (this is the default), `Mode.LOYAL`, or any combination you want.
164-
`ArrowWriter` object should be closed after using because Arrow uses random access buffers not managed by Java GC.
164+
The `ArrowWriter` object should be closed after use because Arrow uses random-access buffers not managed by the Java GC.
165165
Finally, you can specify a callback to be invoked if some data is lost or cannot be saved according to your schema.
166166

167167
Here is a full example:

0 commit comments

Comments
 (0)