Skip to content

Commit 9870305

Browse files
authored
Merge pull request #831 from Kotlin/csv-with-bom-fix
Fixes reading CSV files with BOM characters
2 parents 4caf141 + 8d20f1a commit 9870305

File tree

5 files changed

+29
-15
lines changed

5 files changed

+29
-15
lines changed

core/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ dependencies {
6666
implementation(libs.kotlin.stdlib.jdk8)
6767

6868
api(libs.commonsCsv)
69+
implementation(libs.commonsIo)
6970
implementation(libs.serialization.core)
7071
implementation(libs.serialization.json)
7172

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ package org.jetbrains.kotlinx.dataframe.io
22

33
import org.apache.commons.csv.CSVFormat
44
import org.apache.commons.csv.CSVRecord
5+
import org.apache.commons.io.input.BOMInputStream
56
import org.jetbrains.kotlinx.dataframe.AnyFrame
67
import org.jetbrains.kotlinx.dataframe.AnyRow
78
import org.jetbrains.kotlinx.dataframe.DataColumn
@@ -19,6 +20,7 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
1920
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
2021
import org.jetbrains.kotlinx.dataframe.impl.api.parse
2122
import org.jetbrains.kotlinx.dataframe.values
23+
import java.io.BufferedInputStream
2224
import java.io.BufferedReader
2325
import java.io.File
2426
import java.io.FileInputStream
@@ -272,21 +274,20 @@ public fun DataFrame.Companion.readDelim(
272274
duplicate: Boolean = true,
273275
charset: Charset = defaultCharset,
274276
parserOptions: ParserOptions? = null,
275-
): AnyFrame =
276-
if (isCompressed) {
277-
InputStreamReader(GZIPInputStream(inStream), charset)
278-
} else {
279-
BufferedReader(InputStreamReader(inStream, charset))
280-
}.run {
281-
readDelim(
282-
this,
283-
getFormat(csvType, delimiter, header, duplicate),
284-
colTypes,
285-
skipLines,
286-
readLines,
287-
parserOptions,
288-
)
289-
}
277+
): AnyFrame {
278+
val bufferedInStream = BufferedInputStream(if (isCompressed) GZIPInputStream(inStream) else inStream)
279+
val bomIn = BOMInputStream.builder().setInputStream(bufferedInStream).get()
280+
val bufferedReader = BufferedReader(InputStreamReader(bomIn, charset))
281+
282+
return readDelim(
283+
reader = bufferedReader,
284+
format = getFormat(csvType, delimiter, header, duplicate),
285+
colTypes = colTypes,
286+
skipLines = skipLines,
287+
readLines = readLines,
288+
parserOptions = parserOptions,
289+
)
290+
}
290291

291292
public enum class ColType {
292293
Int,

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -276,10 +276,17 @@ class CsvTests {
276276
df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3)
277277
}
278278

279+
@Test
280+
fun `file with BOM`() {
281+
val df = DataFrame.readCSV(withBomCsv, delimiter = ';')
282+
df.columnNames() shouldBe listOf("Column1", "Column2")
283+
}
284+
279285
companion object {
280286
private val simpleCsv = testCsv("testCSV")
281287
private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale")
282288
private val wineCsv = testCsv("wine")
283289
private val durationCsv = testCsv("duration")
290+
private val withBomCsv = testCsv("with-bom")
284291
}
285292
}

core/src/test/resources/with-bom.csv

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Column1;Column2
2+
0,25;18
3+
1,24;19

gradle/libs.versions.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ kover = "0.6.1"
2424

2525
commonsCsv = "1.10.0"
2626
commonsCompress = "1.26.0"
27+
commonsIo = "2.16.1"
2728
serialization = "1.7.0"
2829
fuel = "2.3.1"
2930
poi = "5.2.5"
@@ -71,6 +72,7 @@ kotlin-reflect = { group = "org.jetbrains.kotlin", name = "kotlin-reflect", vers
7172
kotlin-scriptingJvm = { group = "org.jetbrains.kotlin", name = "kotlin-scripting-jvm", version.ref = "kotlin" }
7273
commonsCsv = { group = "org.apache.commons", name = "commons-csv", version.ref = "commonsCsv" }
7374
commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref = "commonsCompress" }
75+
commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" }
7476
# Serialization
7577
serialization-core = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-core", version.ref = "serialization" }
7678
serialization-json = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-json", version.ref = "serialization" }

0 commit comments

Comments
 (0)