Skip to content

Commit efb8a04

Browse files
committed
Merge branch 'master' into dfs-rename3
# Conflicts: # examples/jupyter-notebooks/titanic/Titanic.ipynb
2 parents 4bf8219 + 8b371dc commit efb8a04

File tree

24 files changed

+46856
-9663
lines changed

24 files changed

+46856
-9663
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/TypeConversions.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ public fun <T> DataColumn<T>.castToNullable(): DataColumn<T?> = cast()
9595

9696
public fun <T> ColumnReference<T>.castToNullable(): ColumnReference<T?> = cast()
9797

98+
/**
 * Switches the nullability of this column's element type.
 *
 * Delegates to [castToNullable] when [nullable] is `true`, otherwise to [castToNotNullable].
 *
 * @param nullable whether the returned column should be typed as nullable.
 * @return the result of the selected cast.
 */
public fun AnyCol.setNullable(nullable: Boolean): AnyCol =
    when (nullable) {
        true -> castToNullable()
        false -> castToNotNullable()
    }
105+
98106
// region to array
99107

100108
public inline fun <reified T> DataColumn<T>.toTypedArray(): Array<T> = toList().toTypedArray()

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ import java.time.LocalTime
3636
import java.util.*
3737
import kotlin.reflect.KProperty
3838
import kotlin.reflect.KType
39+
import kotlin.reflect.full.isSubtypeOf
40+
import kotlin.reflect.full.withNullability
3941
import kotlin.reflect.typeOf
4042

4143
public fun <T, C> DataFrame<T>.convert(columns: ColumnsSelector<T, C>): Convert<T, C> =
@@ -103,8 +105,9 @@ public fun <T, C> Convert<T, C>.to(columnConverter: DataFrame<T>.(DataColumn<C>)
103105

104106
public inline fun <reified C> AnyCol.convertTo(): DataColumn<C> = convertTo(typeOf<C>()) as DataColumn<C>
105107
public fun AnyCol.convertTo(newType: KType): AnyCol {
106-
if (this.type() == typeOf<String>() && newType == typeOf<Double>()) return (this as DataColumn<String>).convertToDouble()
107-
if (this.type() == typeOf<String?>() && newType == typeOf<Double?>()) return (this as DataColumn<String?>).convertToDouble()
108+
if (this.type().withNullability(true).isSubtypeOf(typeOf<String?>()) && newType.withNullability(true) == typeOf<Double?>()) {
109+
return (this as DataColumn<String?>).convertToDouble().setNullable(newType.isMarkedNullable)
110+
}
108111
return convertToTypeImpl(newType)
109112
}
110113

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,14 +132,14 @@ class ParserTests {
132132
fun `converting String to Double in different locales`() {
133133
val currentLocale = Locale.getDefault()
134134
try {
135-
// Test 36 behaviour combinations:
135+
// Test 45 behaviour combinations:
136136

137137
// 3 source columns
138138
val columnDot = columnOf("12.345", "67.890")
139139
val columnComma = columnOf("12,345", "67,890")
140140
val columnMixed = columnOf("12.345", "67,890")
141141
// *
142-
// (3 locales as converting parameter + original converting)
142+
// (3 locales as converting parameter + original converting + original converting to nullable)
143143
val parsingLocaleNotDefined: Locale? = null
144144
val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US")
145145
val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU")
@@ -152,6 +152,10 @@ class ParserTests {
152152
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
153153
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
154154

155+
columnDot.convertTo<Double?>().shouldBe(columnOf(12.345, 67.89))
156+
columnComma.convertTo<Double?>().shouldBe(columnOf(12345.0, 67890.0))
157+
columnMixed.convertTo<Double?>().shouldBe(columnOf(12.345, 67890.0))
158+
155159
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
156160
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
157161
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))
@@ -170,6 +174,10 @@ class ParserTests {
170174
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
171175
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
172176

177+
columnDot.convertTo<Double?>().shouldBe(columnOf(12.345, 67.89))
178+
columnComma.convertTo<Double?>().shouldBe(columnOf(12345.0, 67890.0))
179+
columnMixed.convertTo<Double?>().shouldBe(columnOf(12.345, 67890.0))
180+
173181
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
174182
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
175183
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))
@@ -188,6 +196,10 @@ class ParserTests {
188196
columnComma.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
189197
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
190198

199+
columnDot.convertTo<Double?>().shouldBe(columnOf(12.345, 67.89))
200+
columnComma.convertTo<Double?>().shouldBe(columnOf(12.345, 67.89))
201+
columnMixed.convertTo<Double?>().shouldBe(columnOf(12.345, 67890.0))
202+
191203
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
192204
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
193205
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/SampleNotebooksTests.kt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ class SampleNotebooksTests : DataFrameJupyterTest() {
109109
cleanup: () -> Unit = {},
110110
) {
111111
val fileName = if (notebookName == null) "$dir.ipynb" else "$notebookName.ipynb"
112-
doTest("$jupyterExamplesPath/$dir/$fileName", replacer, cellClause, cleanup)
112+
doTest("$notebookExamplesPath/$dir/$fileName", replacer, cellClause, cleanup)
113113
}
114114

115115
data class CodeCellData(
@@ -119,8 +119,8 @@ class SampleNotebooksTests : DataFrameJupyterTest() {
119119

120120
companion object {
121121
const val ideaExamplesPath = "../examples/idea-examples"
122-
const val jupyterExamplesPath = "../examples/jupyter-notebooks"
122+
const val notebookExamplesPath = "../examples/notebooks"
123123

124-
fun testFile(folder: String, fileName: String) = fileName to "$jupyterExamplesPath/$folder/$fileName"
124+
fun testFile(folder: String, fileName: String) = fileName to "$notebookExamplesPath/$folder/$fileName"
125125
}
126126
}

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowWriterImpl.kt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ import org.jetbrains.kotlinx.dataframe.api.map
5353
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
5454
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
5555
import org.jetbrains.kotlinx.dataframe.name
56+
import org.jetbrains.kotlinx.dataframe.values
57+
import java.nio.charset.Charset
58+
import kotlin.reflect.full.isSubtypeOf
59+
import kotlin.reflect.typeOf
5660

5761
/**
5862
* Save [dataFrame] content in Apache Arrow format (can be written to File, ByteArray, OutputStream or raw Channel) with [targetSchema].
@@ -67,14 +71,25 @@ internal class ArrowWriterImpl(
6771

6872
private val allocator = RootAllocator()
6973

70-
private fun allocateVector(vector: FieldVector, size: Int) {
74+
private fun allocateVector(vector: FieldVector, size: Int, totalBytes: Long? = null) {
7175
when (vector) {
7276
is FixedWidthVector -> vector.allocateNew(size)
73-
is VariableWidthVector -> vector.allocateNew(size)
77+
is VariableWidthVector -> totalBytes?.let { vector.allocateNew(it, size) } ?: vector.allocateNew(size)
7478
else -> throw IllegalArgumentException("Can not allocate ${vector.javaClass.canonicalName}")
7579
}
7680
}
7781

82+
/**
 * Estimates the number of data bytes to reserve for [column].
 *
 * For a column whose type is a subtype of `String?`, the estimate is the sum over all values of
 * `value.toString().length * 4` (presumably a worst-case bytes-per-character bound for the
 * encoded text; confirm against the vector-filling code). Null values are counted through their
 * `toString()` text, so the result is an over-estimate for them.
 *
 * @return the estimated byte count, or `null` for any other column type, in which case the
 * caller is expected to fall back to the default allocation.
 */
private fun countTotalBytes(column: AnyCol): Long? =
    if (column.type().isSubtypeOf(typeOf<String?>())) {
        // Multiply as Long: `length * 4` in Int arithmetic would silently wrap for very long strings.
        column.values.fold(0L) { totalBytes, value -> totalBytes + value.toString().length * 4L }
    } else {
        null
    }
92+
7893
private fun infillWithNulls(vector: FieldVector, size: Int) {
7994
when (vector) {
8095
is BaseFixedWidthVector -> for (i in 0 until size) { vector.setNull(i) }
@@ -189,11 +204,12 @@ internal class ArrowWriterImpl(
189204
actualField.createVector(allocator)!!
190205
}
191206

192-
allocateVector(vector, dataFrame.rowsCount())
193207
if (convertedColumn == null) {
194208
check(actualField.isNullable)
209+
allocateVector(vector, dataFrame.rowsCount())
195210
infillWithNulls(vector, dataFrame.rowsCount())
196211
} else {
212+
allocateVector(vector, dataFrame.rowsCount(), countTotalBytes(convertedColumn))
197213
infillVector(vector, convertedColumn)
198214
}
199215
return vector

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowKtTest.kt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ package org.jetbrains.kotlinx.dataframe.io
33
import io.kotest.assertions.throwables.shouldThrow
44
import io.kotest.matchers.collections.shouldContain
55
import io.kotest.matchers.shouldBe
6+
import org.apache.arrow.vector.types.FloatingPointPrecision
7+
import org.apache.arrow.vector.types.pojo.ArrowType
8+
import org.apache.arrow.vector.types.pojo.Field
9+
import org.apache.arrow.vector.types.pojo.FieldType
610
import org.apache.arrow.vector.types.pojo.Schema
711
import org.apache.arrow.vector.util.Text
812
import org.jetbrains.kotlinx.dataframe.DataColumn
@@ -23,6 +27,7 @@ import java.io.File
2327
import java.net.URL
2428
import java.time.LocalDate
2529
import java.time.LocalDateTime
30+
import java.util.Locale
2631
import kotlin.reflect.typeOf
2732

2833
internal class ArrowKtTest {
@@ -238,4 +243,34 @@ internal class ArrowKtTest {
238243
DataFrame.readArrowFeather(testLoyalNullable)["settled"].type() shouldBe typeOf<LocalDateTime?>()
239244
DataFrame.readArrowFeather(testLoyalNullable)["settled"].values() shouldBe arrayOfNulls<LocalDate>(frameRenaming.rowsCount()).asList()
240245
}
246+
247+
/**
 * Writes string columns against a non-nullable double target schema under two different
 * default locales and checks the values read back for each locale.
 */
@Test
fun testParsing() {
    val doubleField = FieldType.notNullable(ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE))
    val schema = Schema(
        listOf(
            Field("columnDot", doubleField, emptyList()),
            Field("columnComma", doubleField, emptyList()),
        ),
    )
    val source = dataFrameOf("columnDot", "columnComma")(
        columnOf("12.345", "67.890"),
        columnOf("12,345", "67,890"),
    )

    val fractional = columnOf(12.345, 67.890)
    val whole = columnOf(12345.0, 67890.0)
    // Default-locale language tag -> frame expected after the write/read round trip.
    val expectedByLocale = listOf(
        "en-US" to dataFrameOf("columnDot", "columnComma")(fractional, whole),
        "ru-RU" to dataFrameOf("columnDot", "columnComma")(fractional, fractional),
    )

    val savedLocale = Locale.getDefault()
    try {
        for ((languageTag, expected) in expectedByLocale) {
            Locale.setDefault(Locale.forLanguageTag(languageTag))
            val serialized = source.arrowWriter(schema).saveArrowFeatherToByteArray()
            DataFrame.readArrowFeather(serialized) shouldBe expected
        }
    } finally {
        // Restore the JVM-wide default so other tests are not affected.
        Locale.setDefault(savedLocale)
    }
}
269+
270+
/** Saves a frame built from [bigStringColumn] to an Arrow Feather byte array and expects an equal frame when read back. */
@Test
fun testBigStringColumn() {
    val original = dataFrameOf(bigStringColumn)
    val restored = DataFrame.readArrowFeather(original.saveArrowFeatherToByteArray())
    restored shouldBe original
}
241276
}

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/examplesToWrite.kt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,39 @@ val citiesExampleSchema = """{
154154
} ]
155155
}
156156
""".trimIndent()
157+
158+
/**
 * String column (variable length vector) with size >1 MiB.
 *
 * Holds 4096 rows: 1024 consecutive rows for each of four text fragments, in the order listed
 * below, where every row is its fragment repeated 64 times.
 */
val bigStringColumn = run {
    val fragments = listOf("abcd", "гдёж", "αβγδ", "正体字")
    // flatMap keeps fragment order, so rows come out grouped exactly as four sequential loops would produce them.
    val rows = fragments.flatMap { fragment -> List(1024) { fragment.repeat(64) } }
    DataColumn.createValueColumn("bigStringColumn", rows)
}

examples/README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,27 +11,27 @@
1111
* people [Datalore](https://datalore.jetbrains.com/view/notebook/aOTioEClQQrsZZBKeUPAQj)
1212
Small artificial dataset used in [DataFrame API examples](https://kotlin.github.io/dataframe/operations.html)
1313

14-
* puzzles ([Jupyter](jupyter-notebooks/puzzles/40%20puzzles.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/CVp3br3CDXjUGaxxqfJjFF)) &ndash;
14+
* puzzles ([notebook](notebooks/puzzles/40%20puzzles.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/CVp3br3CDXjUGaxxqfJjFF)) &ndash;
1515
Inspired [by 100 pandas puzzles](https://github.com/ajcr/100-pandas-puzzles). You will go from the simplest tasks to
1616
complex problems where you need to think. This notebook will show you how to solve these tasks with the Kotlin
1717
Dataframe in a laconic, beautiful style.
1818
___
19-
* movies ([Jupyter](jupyter-notebooks/movies/movies.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/89IMYb1zbHZxHfwAta6eKP)) &ndash;
19+
* movies ([notebook](notebooks/movies/movies.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/89IMYb1zbHZxHfwAta6eKP)) &ndash;
2020
In this notebook you can see the basic operations of the Kotlin Dataframe on data from [movielens](https://movielens.org/).
2121
You can take the data from the [link](https://grouplens.org/datasets/movielens/latest/).
2222
___
23-
* netflix ([Jupyter](jupyter-notebooks/netflix/netflix.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/wB6Vq1oKU3GniCi1i05l2X)) &ndash;
23+
* netflix ([notebook](notebooks/netflix/netflix.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/xSJ4rx49hcH71pPnFgZBCq)) &ndash;
2424
Explore TV shows and movies from Netflix with the powerful Kotlin Dataframe API and beautiful
2525
visualizations from [lets-plot](https://github.com/JetBrains/lets-plot-kotlin).
2626
___
27-
* github ([Jupyter](jupyter-notebooks/github/github.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/wGlYql3ObFCloN0YpWR1Xw)) &ndash;
27+
* github ([notebook](notebooks/github/github.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/P9n6jYL4mmY1gx3phz5TsX)) &ndash;
2828
This notebook shows what hierarchical dataframes look like and how to work with them.
2929
___
30-
* titanic ([Jupyter](jupyter-notebooks/titanic/Titanic.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/B5YeMMONSAR78FgKQ9yJyW)) &ndash;
30+
* titanic ([notebook](notebooks/titanic/Titanic.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/B5YeMMONSAR78FgKQ9yJyW)) &ndash;
3131
Let's see how the new library will show itself on the famous Titanic dataset.
3232
___
33-
* wine ([Jupyter](jupyter-notebooks/wine/WineNetWIthKotlinDL.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/aK9vYHH8pCA8H1KbKB5WsI)) &ndash;
33+
* wine ([notebook](notebooks/wine/WineNetWIthKotlinDL.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/aK9vYHH8pCA8H1KbKB5WsI)) &ndash;
3434
Wine. Kotlin Dataframe. KotlinDL. What came out of this can be seen in this notebook.
3535
___
36-
* youtube ([Jupyter](jupyter-notebooks/youtube/Youtube.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/uXH0VfIM6qrrmwPJnLBi0j)) &ndash;
36+
* youtube ([notebook](notebooks/youtube/Youtube.ipynb)/[Datalore](https://datalore.jetbrains.com/view/notebook/uXH0VfIM6qrrmwPJnLBi0j)) &ndash;
3737
Explore YouTube videos with YouTube REST API and Kotlin Dataframe

0 commit comments

Comments
 (0)