Skip to content

Commit fb9fa11

Browse files
authored
Merge pull request #115 from Kotlin/extract-arrow
Extract arrow
2 parents 0d3f633 + c65731e commit fb9fa11

File tree

26 files changed

+488
-333
lines changed

26 files changed

+488
-333
lines changed

build.gradle.kts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,7 @@ dependencies {
4444
implementation(libs.poi.ooxml)
4545

4646
implementation(libs.kotlin.datetimeJvm)
47-
48-
implementation(libs.arrow.vector)
49-
implementation(libs.arrow.format)
50-
implementation(libs.arrow.memory)
47+
implementation("com.squareup:kotlinpoet:1.11.0")
5148

5249
testImplementation(libs.junit)
5350
testImplementation(libs.kotestAssertions) {
@@ -67,6 +64,7 @@ tasks.withType<JavaCompile> {
6764
}
6865

6966
tasks.withType<KotlinCompile> {
67+
dependsOn(tasks.lintKotlin)
7068
kotlinOptions {
7169
freeCompilerArgs = freeCompilerArgs + listOf("-Xinline-classes", "-Xopt-in=kotlin.RequiresOptIn")
7270
}

dataframe-arrow/build.gradle.kts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
plugins {
2+
kotlin("jvm")
3+
kotlin("libs.publisher")
4+
}
5+
6+
group = "org.jetbrains.kotlinx"
7+
8+
dependencies {
9+
api(project(":"))
10+
11+
implementation(libs.arrow.vector)
12+
implementation(libs.arrow.format)
13+
implementation(libs.arrow.memory)
14+
implementation(libs.commonsCompress)
15+
16+
testApi(project(":"))
17+
testImplementation(libs.junit)
18+
testImplementation(libs.kotestAssertions) {
19+
exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
20+
}
21+
}
22+
23+
kotlinPublications {
24+
publication {
25+
publicationName.set("dataframeArrow")
26+
artifactId.set(project.name)
27+
description.set("Apache Arrow support for Kotlin Dataframe")
28+
packageName.set(artifactId)
29+
}
30+
}
31+
32+
kotlin {
33+
explicitApi()
34+
}

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrow.kt renamed to dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrow.kt

Lines changed: 94 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package org.jetbrains.kotlinx.dataframe.io
22

33
import org.apache.arrow.memory.RootAllocator
44
import org.apache.arrow.vector.BigIntVector
5+
import org.apache.arrow.vector.BitVector
56
import org.apache.arrow.vector.Decimal256Vector
67
import org.apache.arrow.vector.DecimalVector
78
import org.apache.arrow.vector.DurationVector
@@ -27,13 +28,16 @@ import org.apache.arrow.vector.complex.StructVector
2728
import org.apache.arrow.vector.ipc.ArrowFileReader
2829
import org.apache.arrow.vector.ipc.ArrowStreamReader
2930
import org.apache.arrow.vector.types.pojo.Field
31+
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel
3032
import org.jetbrains.kotlinx.dataframe.AnyBaseColumn
3133
import org.jetbrains.kotlinx.dataframe.AnyFrame
3234
import org.jetbrains.kotlinx.dataframe.DataColumn
3335
import org.jetbrains.kotlinx.dataframe.DataFrame
3436
import org.jetbrains.kotlinx.dataframe.api.Infer
3537
import org.jetbrains.kotlinx.dataframe.api.concat
3638
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
39+
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
40+
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
3741
import java.io.File
3842
import java.io.InputStream
3943
import java.math.BigDecimal
@@ -47,13 +51,34 @@ import java.time.Duration
4751
import java.time.LocalDateTime
4852
import kotlin.reflect.typeOf
4953

54+
public class ArrowFeather : SupportedFormat {
55+
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readArrowFeather(stream)
56+
57+
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readArrowFeather(file)
58+
59+
override fun acceptsExtension(ext: String): Boolean = ext == "feather"
60+
61+
override val testOrder: Int = 50000
62+
63+
override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod {
64+
return DefaultReadArrowMethod(pathRepresentation)
65+
}
66+
}
67+
68+
private const val readArrowFeather = "readArrowFeather"
69+
70+
private class DefaultReadArrowMethod(path: String?) : AbstractDefaultReadMethod(path, MethodArguments.EMPTY, readArrowFeather)
71+
5072
internal object Allocator {
5173
val ROOT by lazy {
5274
RootAllocator(Long.MAX_VALUE)
5375
}
5476
}
5577

56-
private fun readArrow(channel: ReadableByteChannel, allocator: RootAllocator = Allocator.ROOT): AnyFrame {
78+
/**
79+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from an existing [channel]
80+
*/
81+
public fun readArrowIPC(channel: ReadableByteChannel, allocator: RootAllocator = Allocator.ROOT): AnyFrame {
5782
ArrowStreamReader(channel, allocator).use { reader ->
5883
val dfs = buildList {
5984
val root = reader.vectorSchemaRoot
@@ -67,7 +92,10 @@ private fun readArrow(channel: ReadableByteChannel, allocator: RootAllocator = A
6792
}
6893
}
6994

70-
private fun readArrow(channel: SeekableByteChannel, allocator: RootAllocator = Allocator.ROOT): AnyFrame {
95+
/**
96+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from an existing [channel]
97+
*/
98+
public fun readArrowFeather(channel: SeekableByteChannel, allocator: RootAllocator = Allocator.ROOT): AnyFrame {
7199
ArrowFileReader(channel, allocator).use { reader ->
72100
val dfs = buildList {
73101
reader.recordBlocks.forEach { block ->
@@ -82,6 +110,8 @@ private fun readArrow(channel: SeekableByteChannel, allocator: RootAllocator = A
82110
}
83111
}
84112

113+
private fun BitVector.values(range: IntRange): List<Boolean?> = range.map { getObject(it) }
114+
85115
private fun UInt1Vector.values(range: IntRange): List<Byte?> = range.map { getObject(it) }
86116
private fun UInt2Vector.values(range: IntRange): List<Char?> = range.map { getObject(it) }
87117
private fun UInt4Vector.values(range: IntRange): List<Long?> = range.map { getObjectNoOverflow(it) }
@@ -146,6 +176,7 @@ private fun readField(root: VectorSchemaRoot, field: Field): AnyBaseColumn {
146176
is LargeVarCharVector -> vector.values(range).withType()
147177
is VarBinaryVector -> vector.values(range).withType()
148178
is LargeVarBinaryVector -> vector.values(range).withType()
179+
is BitVector -> vector.values(range).withType()
149180
is SmallIntVector -> vector.values(range).withType()
150181
is TinyIntVector -> vector.values(range).withType()
151182
is UInt1Vector -> vector.values(range).withType()
@@ -171,23 +202,75 @@ private fun readField(root: VectorSchemaRoot, field: Field): AnyBaseColumn {
171202
return DataColumn.createValueColumn(field.name, list, type, Infer.Nulls)
172203
}
173204

174-
public fun DataFrame.Companion.readArrow(file: File): AnyFrame {
175-
return Files.newByteChannel(file.toPath()).use { readArrow(it) }
205+
// IPC reading block
206+
207+
/**
208+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from an existing [file]
209+
*/
210+
public fun DataFrame.Companion.readArrowIPC(file: File): AnyFrame = Files.newByteChannel(file.toPath()).use { readArrowIPC(it) }
211+
212+
/**
213+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from an existing [byteArray]
214+
*/
215+
public fun DataFrame.Companion.readArrowIPC(byteArray: ByteArray): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowIPC(it) }
216+
217+
/**
218+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from an existing [stream]
219+
*/
220+
public fun DataFrame.Companion.readArrowIPC(stream: InputStream): AnyFrame = Channels.newChannel(stream).use { readArrowIPC(it) }
221+
222+
/**
223+
* Read [Arrow interprocess streaming format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-streaming-format) data from an existing [url]
224+
*/
225+
public fun DataFrame.Companion.readArrowIPC(url: URL): AnyFrame =
226+
when {
227+
isFile(url) -> readArrowIPC(urlAsFile(url))
228+
isProtocolSupported(url) -> url.openStream().use { readArrowIPC(it) }
229+
else -> {
230+
throw IllegalArgumentException("Invalid protocol for url $url")
231+
}
232+
}
233+
234+
public fun DataFrame.Companion.readArrowIPC(path: String): AnyFrame = if (isURL(path)) {
235+
readArrowIPC(URL(path))
236+
} else {
237+
readArrowIPC(File(path))
176238
}
177239

178-
public fun DataFrame.Companion.readArrow(stream: InputStream): AnyFrame = Channels.newChannel(stream).use { readArrow(it) }
240+
// Feather reading block
241+
242+
/**
243+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from an existing [file]
244+
*/
245+
public fun DataFrame.Companion.readArrowFeather(file: File): AnyFrame = Files.newByteChannel(file.toPath()).use { readArrowFeather(it) }
246+
247+
/**
248+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from an existing [byteArray]
249+
*/
250+
public fun DataFrame.Companion.readArrowFeather(byteArray: ByteArray): AnyFrame = SeekableInMemoryByteChannel(byteArray).use { readArrowFeather(it) }
251+
252+
/**
253+
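* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from an existing [stream].
* The whole stream is read into memory first, because this format is parsed through a seekable channel.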
254+
*/
255+
public fun DataFrame.Companion.readArrowFeather(stream: InputStream): AnyFrame =
    // Kotlin's InputStream.readBytes() is used instead of InputStream.readAllBytes(): the latter exists
    // only on JDK 9+, while readBytes() reads the stream to its end in the same way and also runs on JDK 8.
    readArrowFeather(stream.readBytes())
179256

180-
public fun DataFrame.Companion.readArrow(url: URL): AnyFrame =
257+
/**
258+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from existing [url]
259+
*/
260+
public fun DataFrame.Companion.readArrowFeather(url: URL): AnyFrame =
181261
when {
182-
isFile(url) -> readArrow(urlAsFile(url))
183-
isProtocolSupported(url) -> url.openStream().use { readArrow(it) }
262+
isFile(url) -> readArrowFeather(urlAsFile(url))
263+
isProtocolSupported(url) -> readArrowFeather(url.readBytes())
184264
else -> {
185265
throw IllegalArgumentException("Invalid protocol for url $url")
186266
}
187267
}
188268

189-
public fun DataFrame.Companion.readArrow(path: String): AnyFrame = if (isURL(path)) {
190-
readArrow(URL(path))
269+
/**
270+
* Read [Arrow random access format](https://arrow.apache.org/docs/java/ipc.html#writing-and-reading-random-access-files) data from an existing [path]
271+
*/
272+
public fun DataFrame.Companion.readArrowFeather(path: String): AnyFrame = if (isURL(path)) {
273+
readArrowFeather(URL(path))
191274
} else {
192-
readArrow(File(path))
275+
readArrowFeather(File(path))
193276
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org.jetbrains.kotlinx.dataframe.io.ArrowFeather

src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowKtTest.kt renamed to dataframe-arrow/src/test/kotlin/ArrowKtTest.kt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
1-
package org.jetbrains.kotlinx.dataframe.io
2-
31
import io.kotest.matchers.shouldBe
42
import org.apache.arrow.vector.util.Text
3+
import org.jetbrains.kotlinx.dataframe.DataFrame
54
import org.jetbrains.kotlinx.dataframe.api.columnOf
65
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
76
import org.jetbrains.kotlinx.dataframe.api.toColumn
8-
import org.jetbrains.kotlinx.dataframe.testArrowFeather
7+
import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
98
import org.junit.Test
9+
import java.net.URL
1010

1111
internal class ArrowKtTest {
12+
13+
fun testResource(resourcePath: String): URL = ArrowKtTest::class.java.classLoader.getResource(resourcePath)!!
14+
15+
fun testArrowFeather(name: String) = testResource("$name.feather")
16+
1217
@Test
1318
fun testReadingFromFile() {
1419
val feather = testArrowFeather("data-arrow_2.0.0_uncompressed")
15-
val df = feather.readDataFrame()
20+
val df = DataFrame.readArrowFeather(feather)
1621
val a by columnOf("one")
1722
val b by columnOf(2.0)
1823
val c by listOf(

gradle/libs.versions.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dataframe = "0.8.0-dev-939"
99
korro = "0.1.1-dev-29"
1010

1111
commonsCsv = "1.8"
12+
commonsCompress = "1.21"
1213
klaxon = "5.5"
1314
fuel = "2.3.1"
1415
poi = "5.2.0"
@@ -17,7 +18,7 @@ kotlinDatetime = "0.3.1"
1718
junit = "4.13.2"
1819
kotestAsserions = "4.6.3"
1920
jsoup = "1.14.3"
20-
arrow = "7.0.0"
21+
arrow = "8.0.0"
2122

2223
[libraries]
2324
ksp-gradle = { group = "com.google.devtools.ksp", name = "symbol-processing-gradle-plugin", version.ref = "ksp" }
@@ -30,6 +31,7 @@ kotlin-stdlib-jdk8 = { group = "org.jetbrains.kotlin", name = "kotlin-stdlib-jdk
3031
kotlin-reflect = { group = "org.jetbrains.kotlin", name = "kotlin-reflect", version.ref = "kotlin" }
3132
kotlin-scriptingJvm = { group = "org.jetbrains.kotlin", name = "kotlin-scripting-jvm", version.ref = "kotlin" }
3233
commonsCsv = { module = "org.apache.commons:commons-csv", version.ref = "commonsCsv" }
34+
commonsCompress = { module = "org.apache.commons:commons-compress", version.ref = "commonsCompress" }
3335
klaxon = { module = "com.beust:klaxon", version.ref = "klaxon" }
3436
fuel = { module = "com.github.kittinunf.fuel:fuel", version.ref = "fuel" }
3537
poi = { module = "org.apache.poi:poi", version.ref = "poi" }

plugins/dataframe-gradle-plugin/build.gradle.kts

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,23 @@ repositories {
1313

1414
group = "org.jetbrains.kotlin"
1515

16+
dependencies {
17+
implementation(project(":"))
18+
implementation(project(":dataframe-arrow"))
19+
implementation(kotlin("gradle-plugin-api"))
20+
implementation(kotlin("gradle-plugin"))
21+
implementation("com.beust:klaxon:5.5")
22+
implementation(libs.ksp.gradle)
23+
implementation(libs.ksp.api)
24+
25+
testImplementation("junit:junit:4.12")
26+
testImplementation("io.kotest:kotest-assertions-core:4.6.0")
27+
testImplementation("com.android.tools.build:gradle-api:4.1.1")
28+
testImplementation("com.android.tools.build:gradle:4.1.1")
29+
testImplementation("io.ktor:ktor-server-netty:1.6.7")
30+
testImplementation(gradleApi())
31+
}
32+
1633
tasks.withType<ProcessResources> {
1734
filesMatching("**/plugin.properties") {
1835
filter {
@@ -69,22 +86,6 @@ tasks.withType<JavaCompile>().all {
6986
targetCompatibility = JavaVersion.VERSION_1_8.toString()
7087
}
7188

72-
dependencies {
73-
implementation(project(":"))
74-
implementation(kotlin("gradle-plugin-api"))
75-
implementation(kotlin("gradle-plugin"))
76-
implementation("com.beust:klaxon:5.5")
77-
implementation(libs.ksp.gradle)
78-
implementation(libs.ksp.api)
79-
80-
testImplementation("junit:junit:4.12")
81-
testImplementation("io.kotest:kotest-assertions-core:4.6.0")
82-
testImplementation("com.android.tools.build:gradle-api:4.1.1")
83-
testImplementation("com.android.tools.build:gradle:4.1.1")
84-
testImplementation("io.ktor:ktor-server-netty:1.6.7")
85-
testImplementation(gradleApi())
86-
}
87-
8889
sourceSets {
8990
create("integrationTest") {
9091
withConvention(org.jetbrains.kotlin.gradle.plugin.KotlinSourceSet::class) {
@@ -103,7 +104,9 @@ val integrationTestConfiguration by configurations.creating {
103104

104105
val integrationTestTask = task<Test>("integrationTest") {
105106
dependsOn(":plugins:symbol-processor:publishToMavenLocal")
107+
dependsOn(":dataframe-arrow:publishToMavenLocal")
106108
dependsOn(":publishApiPublicationToMavenLocal")
109+
dependsOn(":dataframe-arrow:publishDataframeArrowPublicationToMavenLocal")
107110
description = "Runs integration tests."
108111
group = "verification"
109112

plugins/dataframe-gradle-plugin/src/integrationTest/kotlin/org/jetbrains/dataframe/gradle/ApiChangesDetectionTest.kt

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,14 @@ package org.jetbrains.dataframe.gradle
33
import io.kotest.matchers.shouldBe
44
import org.gradle.testkit.runner.TaskOutcome
55
import org.jetbrains.kotlinx.dataframe.DataFrame
6-
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadCsvMethod
7-
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadJsonMethod
86
import org.junit.Test
97
import java.io.File
10-
import kotlin.reflect.KClass
11-
12-
annotation class RelatedGenerator(vararg val clazz: KClass<*>)
138

149
class ApiChangesDetectionTest : AbstractDataFramePluginIntegrationTest() {
15-
@RelatedGenerator(
16-
GenerateDataSchemaTask::class,
17-
DefaultReadCsvMethod::class,
18-
DefaultReadJsonMethod::class
19-
)
10+
11+
// Related generators: GenerateDataSchemaTask::class,
12+
// DefaultReadCsvMethod::class,
13+
// DefaultReadJsonMethod::class
2014
@Test
2115
fun `cast api`() {
2216
compiles {
@@ -34,10 +28,8 @@ class ApiChangesDetectionTest : AbstractDataFramePluginIntegrationTest() {
3428
}
3529
}
3630

37-
@RelatedGenerator(
38-
GenerateDataSchemaTask::class,
39-
DefaultReadJsonMethod::class
40-
)
31+
// Related generators: GenerateDataSchemaTask::class,
32+
// DefaultReadJsonMethod::class
4133
@Test
4234
fun `read json api`() {
4335
compiles {
@@ -51,11 +43,8 @@ class ApiChangesDetectionTest : AbstractDataFramePluginIntegrationTest() {
5143
""".trimIndent()
5244
}
5345
}
54-
55-
@RelatedGenerator(
56-
GenerateDataSchemaTask::class,
57-
DefaultReadCsvMethod::class,
58-
)
46+
// Related generators: GenerateDataSchemaTask::class,
47+
// DefaultReadCsvMethod::class,
5948
@Test
6049
fun `read csv api`() {
6150
compiles {

0 commit comments

Comments
 (0)