Skip to content

Commit 8b371dc

Browse files
authored
Merge pull request #350 from Kopilov/VariableWidthVector-fix
Variable width vector fix
2 parents 923dc0f + c3b9939 commit 8b371dc

File tree

4 files changed

+63
-4
lines changed

4 files changed

+63
-4
lines changed

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriterImpl.kt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ import org.jetbrains.kotlinx.dataframe.api.map
5353
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
5454
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
5555
import org.jetbrains.kotlinx.dataframe.name
56+
import org.jetbrains.kotlinx.dataframe.values
57+
import java.nio.charset.Charset
58+
import kotlin.reflect.full.isSubtypeOf
59+
import kotlin.reflect.typeOf
5660

5761
/**
5862
* Save [dataFrame] content in Apache Arrow format (can be written to File, ByteArray, OutputStream or raw Channel) with [targetSchema].
@@ -67,14 +71,25 @@ internal class ArrowWriterImpl(
6771

6872
private val allocator = RootAllocator()
6973

70-
private fun allocateVector(vector: FieldVector, size: Int) {
74+
private fun allocateVector(vector: FieldVector, size: Int, totalBytes: Long? = null) {
7175
when (vector) {
7276
is FixedWidthVector -> vector.allocateNew(size)
73-
is VariableWidthVector -> vector.allocateNew(size)
77+
is VariableWidthVector -> totalBytes?.let { vector.allocateNew(it, size) } ?: vector.allocateNew(size)
7478
else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
7579
}
7680
}
7781

82+
/**
83+
* Calculate buffer size for VariableWidthVector (return null for FixedWidthVector)
84+
*/
85+
private fun countTotalBytes(column: AnyCol): Long? {
86+
val columnType = column.type()
87+
return when {
88+
columnType.isSubtypeOf(typeOf<String?>()) -> column.values.fold(0L) {totalBytes, value -> totalBytes + value.toString().length * 4}
89+
else -> null
90+
}
91+
}
92+
7893
private fun infillWithNulls(vector: FieldVector, size: Int) {
7994
when (vector) {
8095
is BaseFixedWidthVector -> for (i in 0 until size) { vector.setNull(i) }
@@ -189,11 +204,12 @@ internal class ArrowWriterImpl(
189204
actualField.createVector(allocator)!!
190205
}
191206

192-
allocateVector(vector, dataFrame.rowsCount())
193207
if (convertedColumn == null) {
194208
check(actualField.isNullable)
209+
allocateVector(vector, dataFrame.rowsCount())
195210
infillWithNulls(vector, dataFrame.rowsCount())
196211
} else {
212+
allocateVector(vector, dataFrame.rowsCount(), countTotalBytes(convertedColumn))
197213
infillVector(vector, convertedColumn)
198214
}
199215
return vector

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowKtTest.kt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,4 +266,11 @@ internal class ArrowKtTest {
266266
Locale.setDefault(currentLocale)
267267
}
268268
}
269+
270+
@Test
271+
fun testBigStringColumn() {
272+
val dataFrame = dataFrameOf(bigStringColumn)
273+
val data = dataFrame.saveArrowFeatherToByteArray()
274+
DataFrame.readArrowFeather(data) shouldBe dataFrame
275+
}
269276
}

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/examplesToWrite.kt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,39 @@ val citiesExampleSchema = """{
154154
} ]
155155
}
156156
""".trimIndent()
157+
158+
/**
159+
* String column (variable length vector) with size >1 MiB
160+
*/
161+
val bigStringColumn = run {
162+
val list = ArrayList<String>()
163+
for (i in 0 until 1024) {
164+
val row = StringBuilder()
165+
for (j in 0 until 64) {
166+
row.append("abcd")
167+
}
168+
list.add(row.toString())
169+
}
170+
for (i in 0 until 1024) {
171+
val row = StringBuilder()
172+
for (j in 0 until 64) {
173+
row.append("гдёж")
174+
}
175+
list.add(row.toString())
176+
}
177+
for (i in 0 until 1024) {
178+
val row = StringBuilder()
179+
for (j in 0 until 64) {
180+
row.append("αβγδ")
181+
}
182+
list.add(row.toString())
183+
}
184+
for (i in 0 until 1024) {
185+
val row = StringBuilder()
186+
for (j in 0 until 64) {
187+
row.append("正体字")
188+
}
189+
list.add(row.toString())
190+
}
191+
DataColumn.createValueColumn("bigStringColumn", list)
192+
}

gradle/libs.versions.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ openapi = "2.1.13"
2323
junit = "4.13.2"
2424
kotestAsserions = "4.6.3"
2525
jsoup = "1.14.3"
26-
arrow = "10.0.0"
26+
arrow = "11.0.0"
2727
docProcessor = "0.1.6"
2828
simpleGit = "2.0.1"
2929

0 commit comments

Comments
 (0)