Skip to content

Commit 076c26d

Browse files
committed
Implement format guessing with ServiceLoader
The previous approach, a single enum listing all supported formats, no longer works now that Arrow support lives in a separate module.
1 parent cc8816f commit 076c26d

File tree

9 files changed

+101
-25
lines changed

9 files changed

+101
-25
lines changed

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrow.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,16 @@ import java.time.Duration
4949
import java.time.LocalDateTime
5050
import kotlin.reflect.typeOf
5151

52+
/**
 * [SupportedFormat] implementation for Arrow Feather input.
 *
 * Both `readDataFrame` overloads delegate to `DataFrame.readArrowFeather`. The `header`
 * argument is accepted to satisfy the interface but is not forwarded to the reader.
 */
public class ArrowFeather : SupportedFormat {
53+
// Reads a frame from the stream; `header` is ignored.
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readArrowFeather(stream)
54+
55+
// Reads a frame from the file; `header` is ignored.
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readArrowFeather(file)
56+
57+
// Exact, case-sensitive match against "feather".
override fun acceptsExtension(ext: String): Boolean = ext == "feather"
58+
59+
// Sort key used when formats are probed in ascending `testOrder`; 50000 is the largest
// value among the formats visible in this commit, so this format would be tried last.
override val testOrder: Int = 50000
60+
}
61+
5262
internal object Allocator {
5363
val ROOT by lazy {
5464
RootAllocator(Long.MAX_VALUE)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org.jetbrains.kotlinx.dataframe.io.ArrowFeather

src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/codeGen/SchemaReader.kt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
1010
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadExcelMethod
1111
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadJsonMethod
1212
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadTsvMethod
13+
import org.jetbrains.kotlinx.dataframe.io.CSV
14+
import org.jetbrains.kotlinx.dataframe.io.Excel
15+
import org.jetbrains.kotlinx.dataframe.io.JSON
1316
import org.jetbrains.kotlinx.dataframe.io.SupportedFormats
17+
import org.jetbrains.kotlinx.dataframe.io.TSV
1418
import org.jetbrains.kotlinx.dataframe.io.guessFormat
1519
import org.jetbrains.kotlinx.dataframe.io.readCSV
1620
import org.jetbrains.kotlinx.dataframe.io.readExcel
@@ -38,11 +42,11 @@ public val CodeGenerator.Companion.urlReader: (url: URL, csvOptions: CsvOptions)
3842
fun readExcel(url: URL) = DfReadResult.Success(DataFrame.readExcel(url), SupportedFormats.EXCEL, csvOptions)
3943
try {
4044
val res = when (guessFormat(url.path)) {
41-
SupportedFormats.CSV -> readCSV(url)
42-
SupportedFormats.TSV -> readTSV(url)
43-
SupportedFormats.JSON -> readJson(url)
45+
is CSV -> readCSV(url)
46+
is TSV -> readTSV(url)
47+
is JSON -> readJson(url)
4448
// SupportedFormats.ARROW -> readArrow(url)
45-
SupportedFormats.EXCEL -> readExcel(url)
49+
is Excel -> readExcel(url)
4650
null -> try {
4751
readExcel(url)
4852
} catch (e: Exception) {
@@ -56,6 +60,7 @@ public val CodeGenerator.Companion.urlReader: (url: URL, csvOptions: CsvOptions)
5660
}
5761
}
5862
}
63+
else -> TODO()
5964
}
6065
res
6166
} catch (e: Throwable) {

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@ import kotlin.reflect.KClass
3232
import kotlin.reflect.full.withNullability
3333
import kotlin.reflect.typeOf
3434

35+
/**
 * [SupportedFormat] implementation for CSV input.
 *
 * Both `readDataFrame` overloads delegate to `DataFrame.readCSV`, forwarding `header`.
 */
public class CSV : SupportedFormat {
36+
// Reads a frame from the stream, passing `header` through to `readCSV`.
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readCSV(stream, header = header)
37+
38+
// Reads a frame from the file, passing `header` through to `readCSV`.
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readCSV(file, header = header)
39+
40+
// Exact, case-sensitive match against "csv".
override fun acceptsExtension(ext: String): Boolean = ext == "csv"
41+
42+
// Sort key used when formats are probed in ascending `testOrder`.
override val testOrder: Int = 20000
43+
}
44+
3545
public enum class CSVType(public val format: CSVFormat) {
3646
DEFAULT(CSVFormat.DEFAULT.withAllowMissingColumnNames().withIgnoreSurroundingSpaces()),
3747
TDF(CSVFormat.TDF.withAllowMissingColumnNames())

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess.kt

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,19 @@ import java.io.File
1010
import java.io.FileNotFoundException
1111
import java.io.InputStream
1212
import java.net.URL
13+
import java.util.ServiceLoader
14+
15+
/**
 * Contract for a data format that `DataFrame.read` can use to load input.
 *
 * Implementations are discovered at runtime through `ServiceLoader` (see `supportedFormats`).
 */
public interface SupportedFormat {
16+
// Reads a frame from [stream]. Whether and how [header] is used is up to the implementation.
public fun readDataFrame(stream: InputStream, header: List<String> = emptyList()): AnyFrame
17+
18+
// Reads a frame from [file]. Whether and how [header] is used is up to the implementation.
public fun readDataFrame(file: File, header: List<String> = emptyList()): AnyFrame
19+
20+
// Returns true if this format claims the given extension (passed without the leading dot).
public fun acceptsExtension(ext: String): Boolean
21+
22+
// The `DataFrame.Companion.read` methods use this to sort the list of all supported formats in ascending order (e.g. -1, 2, 10).
23+
// The sorted list is then used to test, format by format, whether any of them can read the given input.
24+
public val testOrder: Int
25+
}
1326

1427
public enum class SupportedFormats {
1528
CSV {
@@ -56,21 +69,19 @@ public enum class SupportedFormats {
5669
internal abstract fun acceptsExtension(ext: String): Boolean
5770
}
5871

59-
private val testOrder get() = listOf(
60-
SupportedFormats.JSON,
61-
SupportedFormats.CSV,
62-
SupportedFormats.TSV,
63-
SupportedFormats.EXCEL,
64-
// SupportedFormats.ARROW,
65-
)
72+
// All SupportedFormat implementations found by ServiceLoader. The lookup runs once, on first
// access, and the resulting list is reused afterwards. Elements are in the order ServiceLoader
// yields them; no sorting is applied here.
internal val supportedFormats: List<SupportedFormat> by lazy {
73+
ServiceLoader.load(SupportedFormat::class.java).toList()
74+
}
75+
76+
// The discovered formats sorted ascending by their `testOrder` value; computed once, lazily.
// This is the order in which formats are tried when the format of the input is not known.
internal val testOrder: List<SupportedFormat> by lazy { supportedFormats.sortedBy { it.testOrder } }
6677

67-
internal fun guessFormatForExtension(ext: String) = SupportedFormats.values().firstOrNull { it.acceptsExtension(ext) }
78+
// Returns the first discovered format whose `acceptsExtension(ext)` is true, or null if none matches.
// `ext` is compared as given; callers are responsible for any case normalization.
internal fun guessFormatForExtension(ext: String) = supportedFormats.firstOrNull { it.acceptsExtension(ext) }
6879

69-
internal fun guessFormat(file: File): SupportedFormats? = file.extension.lowercase().let { guessFormatForExtension(it) }
80+
// Guesses the format from the lowercased extension of `file`; returns null when no format accepts it.
internal fun guessFormat(file: File): SupportedFormat? = file.extension.lowercase().let { guessFormatForExtension(it) }
7081

71-
internal fun guessFormat(url: URL): SupportedFormats? = guessFormat(url.path)
82+
// Guesses the format from the path component of `url` by delegating to the String overload.
internal fun guessFormat(url: URL): SupportedFormat? = guessFormat(url.path)
7283

73-
internal fun guessFormat(url: String): SupportedFormats? = guessFormatForExtension(url.substringAfterLast("."))
84+
/**
 * Guesses the format of a resource from the extension at the end of [url].
 *
 * The extension is the text after the last `.`. It is lowercased before matching, so the
 * result is consistent with the `File` overload (`data.CSV` and `data.csv` resolve alike).
 * A string that contains no `.` has no extension and yields an empty extension, instead of
 * the whole string being treated as one.
 *
 * @param url a URL or path string; query strings and fragments are not stripped here
 * @return the first format accepting the extension, or `null` if none does
 */
internal fun guessFormat(url: String): SupportedFormat? =
    guessFormatForExtension(url.substringAfterLast('.', missingDelimiterValue = "").lowercase())
7485

7586
private class NotCloseableStream(val src: InputStream) : InputStream() {
7687
override fun read(): Int = src.read()
@@ -85,10 +96,10 @@ private class NotCloseableStream(val src: InputStream) : InputStream() {
8596

8697
internal fun DataFrame.Companion.read(
8798
stream: InputStream,
88-
format: SupportedFormats? = null,
99+
format: SupportedFormat? = null,
89100
header: List<String> = emptyList()
90-
): AnyFrame {
91-
if (format != null) return format.readDataFrame(stream, header = header)
101+
): ReadAnyFrame {
102+
if (format != null) return format to format.readDataFrame(stream, header = header)
92103
val input = NotCloseableStream(if (stream.markSupported()) stream else BufferedInputStream(stream))
93104
try {
94105
val readLimit = 10000
@@ -97,7 +108,7 @@ internal fun DataFrame.Companion.read(
97108
testOrder.forEach {
98109
try {
99110
input.reset()
100-
return it.readDataFrame(input, header = header)
111+
return it to it.readDataFrame(input, header = header)
101112
} catch (e: Exception) {
102113
}
103114
}
@@ -109,24 +120,28 @@ internal fun DataFrame.Companion.read(
109120

110121
internal fun DataFrame.Companion.read(
111122
file: File,
112-
format: SupportedFormats? = null,
123+
format: SupportedFormat? = null,
113124
header: List<String> = emptyList()
114-
): AnyFrame {
115-
if (format != null) return format.readDataFrame(file, header = header)
125+
): ReadAnyFrame {
126+
if (format != null) return format to format.readDataFrame(file, header = header)
116127
testOrder.forEach {
117128
try {
118-
return it.readDataFrame(file, header = header)
129+
return it to it.readDataFrame(file, header = header)
119130
} catch (e: FileNotFoundException) { throw e } catch (e: Exception) { }
120131
}
121132
throw IllegalArgumentException("Unknown file format")
122133
}
123134

124-
public fun DataFrame.Companion.read(file: File, header: List<String> = emptyList()): AnyFrame = read(file, guessFormat(file), header)
135+
// Result of a read: the frame together with the format that was used to read it.
internal data class ReadAnyFrame(val format: SupportedFormat, val df: AnyFrame)
136+
137+
// Pairs a format with the frame it produced, so `format to df` builds a ReadAnyFrame.
// NOTE(review): for a SupportedFormat receiver and an AnyFrame argument this is presumably picked
// over the standard-library `to`, yielding ReadAnyFrame rather than Pair — confirm that is intended.
internal infix fun SupportedFormat.to(df: AnyFrame) = ReadAnyFrame(this, df)
138+
139+
// Reads a frame from `file`. The format is guessed from the file extension and passed to the
// internal `read`; only the frame (`.df`) of the result is returned, the detected format is dropped.
public fun DataFrame.Companion.read(file: File, header: List<String> = emptyList()): AnyFrame = read(file, guessFormat(file), header).df
125140
// Reads `file` as a frame via `DataFrame.read` and returns the result of `single()` on it.
// NOTE(review): presumably this fails unless the frame has exactly one row — confirm against `single()`.
public fun DataRow.Companion.read(file: File, header: List<String> = emptyList()): AnyRow = DataFrame.read(file, header).single()
126141

127142
public fun DataFrame.Companion.read(url: URL, header: List<String> = emptyList()): AnyFrame = when {
128143
isFile(url) -> read(urlAsFile(url), header)
129-
isProtocolSupported(url) -> catchHttpResponse(url) { read(it, guessFormat(url), header) }
144+
isProtocolSupported(url) -> catchHttpResponse(url) { read(it, guessFormat(url), header).df }
130145
else -> throw IllegalArgumentException("Invalid protocol for url $url")
131146
}
132147

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ import kotlin.reflect.KTypeProjection
3939
import kotlin.reflect.full.createType
4040
import kotlin.reflect.typeOf
4141

42+
/**
 * [SupportedFormat] implementation for JSON input.
 *
 * Both `readDataFrame` overloads delegate to `DataFrame.readJson`, forwarding `header`.
 */
public class JSON : SupportedFormat {
43+
// Reads a frame from the stream, passing `header` through to `readJson`.
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readJson(stream, header = header)
44+
45+
// Reads a frame from the file, passing `header` through to `readJson`.
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readJson(file, header = header)
46+
47+
// Exact, case-sensitive match against "json".
override fun acceptsExtension(ext: String): Boolean = ext == "json"
48+
49+
// Sort key used when formats are probed in ascending `testOrder`; 10000 is the smallest
// value among the formats visible in this commit, so this format would be tried first.
override val testOrder: Int = 10000
50+
}
51+
4252
// Reads JSON from `file` by converting it to a URL and delegating to the URL-based `readJson`.
public fun DataFrame.Companion.readJson(file: File, header: List<String> = emptyList()): AnyFrame = readJson(file.toURI().toURL(), header)
4353
// Reads JSON from `file` as a frame and returns the result of `single()` on it.
// NOTE(review): presumably this fails unless the frame has exactly one row — confirm against `single()`.
public fun DataRow.Companion.readJson(file: File, header: List<String> = emptyList()): AnyRow = DataFrame.readJson(file, header).single()
4454

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import org.jetbrains.kotlinx.dataframe.AnyFrame
34
import org.jetbrains.kotlinx.dataframe.DataFrame
45
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
56
import java.io.File
@@ -8,6 +9,16 @@ import java.io.InputStream
89
import java.net.URL
910
import java.nio.charset.Charset
1011

12+
/**
 * [SupportedFormat] implementation for TSV (tab-separated) input.
 *
 * Both `readDataFrame` overloads delegate to `DataFrame.readTSV`, forwarding `header`.
 */
public class TSV : SupportedFormat {
13+
// Reads a frame from the stream, passing `header` through to `readTSV`.
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readTSV(stream, header = header)
14+
15+
// Reads a frame from the file, passing `header` through to `readTSV`.
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readTSV(file, header = header)
16+
17+
// Exact, case-sensitive match against "tsv".
override fun acceptsExtension(ext: String): Boolean = ext == "tsv"
18+
19+
// Sort key used when formats are probed in ascending `testOrder`.
override val testOrder: Int = 30000
20+
}
21+
1122
// The tab character; presumably the delimiter used by `readTSV` below (its usage is not visible here).
private val tabChar = '\t'
1223

1324
public fun DataFrame.Companion.readTSV(

src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ import java.time.LocalDate
2929
import java.time.LocalDateTime
3030
import java.util.*
3131

32+
/**
 * [SupportedFormat] implementation for Excel workbooks.
 *
 * Both `readDataFrame` overloads delegate to `DataFrame.readExcel`. The `header`
 * argument is accepted to satisfy the interface but is not forwarded to the reader.
 */
public class Excel : SupportedFormat {
33+
// Reads a frame from the stream; `header` is ignored.
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame = DataFrame.readExcel(stream)
34+
35+
// Reads a frame from the file; `header` is ignored.
override fun readDataFrame(file: File, header: List<String>): AnyFrame = DataFrame.readExcel(file)
36+
37+
// Exact, case-sensitive match against "xls" or "xlsx".
override fun acceptsExtension(ext: String): Boolean = ext == "xls" || ext == "xlsx"
38+
39+
// Sort key used when formats are probed in ascending `testOrder`.
override val testOrder: Int = 40000
40+
}
41+
3242
public fun DataFrame.Companion.readExcel(
3343
url: URL,
3444
sheetName: String? = null,
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
org.jetbrains.kotlinx.dataframe.io.CSV
2+
org.jetbrains.kotlinx.dataframe.io.Excel
3+
org.jetbrains.kotlinx.dataframe.io.JSON
4+
org.jetbrains.kotlinx.dataframe.io.TSV

0 commit comments

Comments
 (0)