Skip to content

Commit c3b9939

Browse files
Kopilov
authored and committed
Fix allocating VariableWidthVector in Arrow
1 parent 13f3c4b commit c3b9939

File tree

2 files changed

+20
-4
lines changed

2 files changed

+20
-4
lines changed

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriterImpl.kt

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,10 @@ import org.jetbrains.kotlinx.dataframe.api.map
5353
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
5454
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
5555
import org.jetbrains.kotlinx.dataframe.name
56+
import org.jetbrains.kotlinx.dataframe.values
57+
import java.nio.charset.Charset
58+
import kotlin.reflect.full.isSubtypeOf
59+
import kotlin.reflect.typeOf
5660

5761
/**
5862
* Save [dataFrame] content in Apache Arrow format (can be written to File, ByteArray, OutputStream or raw Channel) with [targetSchema].
@@ -67,14 +71,25 @@ internal class ArrowWriterImpl(
6771

6872
private val allocator = RootAllocator()
6973

70-
private fun allocateVector(vector: FieldVector, size: Int) {
74+
private fun allocateVector(vector: FieldVector, size: Int, totalBytes: Long? = null) {
7175
when (vector) {
7276
is FixedWidthVector -> vector.allocateNew(size)
73-
is VariableWidthVector -> vector.allocateNew(size)
77+
is VariableWidthVector -> totalBytes?.let { vector.allocateNew(it, size) } ?: vector.allocateNew(size)
7478
else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
7579
}
7680
}
7781

82+
/**
 * Estimates how many bytes of data buffer the values of [column] may need, so that a
 * variable-width vector can be allocated with enough room before it is filled.
 *
 * Only columns whose type is a subtype of `String?` are estimated: each value is rendered with
 * `toString()` and charged 4 bytes per character of the result (`String.length`).
 * A null cell is rendered by `toString()` as the text "null" and is charged accordingly,
 * so the result is an upper-bound style estimate rather than an exact size.
 *
 * @param column the column whose cells are about to be written.
 * @return the summed estimate in bytes, or `null` for every other column type (no estimate is made).
 */
private fun countTotalBytes(column: AnyCol): Long? {
    val columnType = column.type()
    return when {
        columnType.isSubtypeOf(typeOf<String?>()) ->
            column.values.fold(0L) { totalBytes, value ->
                // Multiply as Long: an Int product would silently wrap around for very long strings.
                totalBytes + value.toString().length * 4L
            }
        else -> null
    }
}
92+
7893
private fun infillWithNulls(vector: FieldVector, size: Int) {
7994
when (vector) {
8095
is BaseFixedWidthVector -> for (i in 0 until size) { vector.setNull(i) }
@@ -189,11 +204,12 @@ internal class ArrowWriterImpl(
189204
actualField.createVector(allocator)!!
190205
}
191206

192-
allocateVector(vector, dataFrame.rowsCount())
193207
if (convertedColumn == null) {
194208
check(actualField.isNullable)
209+
allocateVector(vector, dataFrame.rowsCount())
195210
infillWithNulls(vector, dataFrame.rowsCount())
196211
} else {
212+
allocateVector(vector, dataFrame.rowsCount(), countTotalBytes(convertedColumn))
197213
infillVector(vector, convertedColumn)
198214
}
199215
return vector

gradle/libs.versions.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ openapi = "2.1.13"
2323
junit = "4.13.2"
2424
kotestAsserions = "4.6.3"
2525
jsoup = "1.14.3"
26-
arrow = "10.0.0"
26+
arrow = "11.0.0"
2727
docProcessor = "0.1.6"
2828
simpleGit = "2.0.1"
2929

0 commit comments

Comments
 (0)