Skip to content

Commit acc932d

Browse files
authored
Merge pull request #743 from Kotlin/read-csv-update
Add delimiter parameter to readDelimStr
2 parents e0dfae9 + 3c9201b commit acc932d

File tree

11 files changed

+358
-21
lines changed

11 files changed

+358
-21
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ public class CSV(private val delimiter: Char = ',') : SupportedDataFrameFormat {
5757
}
5858

5959
public enum class CSVType(public val format: CSVFormat) {
60-
DEFAULT(CSVFormat.DEFAULT.withAllowMissingColumnNames().withIgnoreSurroundingSpaces()),
61-
TDF(CSVFormat.TDF.withAllowMissingColumnNames())
60+
DEFAULT(CSVFormat.DEFAULT.builder().setAllowMissingColumnNames(true).setIgnoreSurroundingSpaces(true).build()),
61+
TDF(CSVFormat.TDF.builder().setAllowMissingColumnNames(true).build())
6262
}
6363

6464
private val defaultCharset = Charsets.UTF_8
@@ -73,11 +73,15 @@ internal fun isCompressed(url: URL) = isCompressed(url.path)
7373
@Interpretable("ReadDelimStr")
7474
public fun DataFrame.Companion.readDelimStr(
7575
text: String,
76+
delimiter: Char = ',',
7677
colTypes: Map<String, ColType> = mapOf(),
7778
skipLines: Int = 0,
7879
readLines: Int? = null,
7980
): DataFrame<*> =
80-
StringReader(text).use { readDelim(it, CSVType.DEFAULT.format.withHeader(), colTypes, skipLines, readLines) }
81+
StringReader(text).use {
82+
val format = CSVType.DEFAULT.format.builder().setHeader().setDelimiter(delimiter).build()
83+
readDelim(it, format, colTypes, skipLines, readLines)
84+
}
8185

8286
public fun DataFrame.Companion.read(
8387
fileOrUrl: String,
@@ -212,7 +216,7 @@ public fun asURL(fileOrUrl: String): URL = (
212216
).toURL()
213217

214218
private fun getFormat(type: CSVType, delimiter: Char, header: List<String>, duplicate: Boolean): CSVFormat =
215-
type.format.withDelimiter(delimiter).withHeader(*header.toTypedArray()).withAllowDuplicateHeaderNames(duplicate)
219+
type.format.builder().setDelimiter(delimiter).setHeader(*header.toTypedArray()).setAllowDuplicateHeaderNames(duplicate).build()
216220

217221
public fun DataFrame.Companion.readDelim(
218222
inStream: InputStream,
@@ -268,7 +272,7 @@ public fun ColType.toType(): KClass<out Any> = when (this) {
268272

269273
public fun DataFrame.Companion.readDelim(
270274
reader: Reader,
271-
format: CSVFormat = CSVFormat.DEFAULT.withHeader(),
275+
format: CSVFormat = CSVFormat.DEFAULT.builder().setHeader().build(),
272276
colTypes: Map<String, ColType> = mapOf(),
273277
skipLines: Int = 0,
274278
readLines: Int? = null,

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ class CsvTests {
242242
)
243243
df.writeCSV(
244244
"src/test/resources/without_header.csv",
245-
CSVFormat.DEFAULT.withSkipHeaderRecord(),
245+
CSVFormat.DEFAULT.builder().setSkipHeaderRecord(true).build(),
246246
)
247247
val producedFile = File("src/test/resources/without_header.csv")
248248
producedFile.exists() shouldBe true
@@ -258,6 +258,16 @@ class CsvTests {
258258
df shouldBe DataFrame.readCSV("../data/jetbrains repositories.csv")
259259
}
260260

261+
@Test
262+
fun `readDelimStr delimiter`() {
263+
val tsv = """
264+
a	b	c
265+
1	2	3
266+
""".trimIndent()
267+
val df = DataFrame.readDelimStr(tsv, '\t')
268+
df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3)
269+
}
270+
261271
companion object {
262272
private val simpleCsv = testCsv("testCSV")
263273
private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale")

docs/StardustDocs/topics/write.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ df.writeCSV(file)
2121
<!---FUN writeCsvStr-->
2222

2323
```kotlin
24-
val csvStr = df.toCsv(CSVFormat.DEFAULT.withDelimiter(';').withRecordSeparator(System.lineSeparator()))
24+
val format = CSVFormat.DEFAULT.builder().setDelimiter(';').setRecordSeparator(System.lineSeparator()).build()
25+
val csvStr = df.toCsv(format)
2526
```
2627

2728
<!---END-->
@@ -104,8 +105,10 @@ val wb = WorkbookFactory.create(true)
104105

105106
// Create different sheets from different data frames in the workbook
106107
val allPersonsSheet = df.writeExcel(wb, sheetName = "allPersons")
107-
val happyPersonsSheet = df.filter { person -> person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "happyPersons")
108-
val unhappyPersonsSheet = df.filter { person -> !person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "unhappyPersons")
108+
val happyPersonsSheet =
109+
df.filter { person -> person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "happyPersons")
110+
val unhappyPersonsSheet =
111+
df.filter { person -> !person.isHappy }.remove("isHappy").writeExcel(wb, sheetName = "unhappyPersons")
109112

110113
// Do anything you want by POI
111114
listOf(happyPersonsSheet, unhappyPersonsSheet).forEach { setStyles(it) }
@@ -125,9 +128,11 @@ Add new sheets without using Apache POI directly by using a parameter to keep us
125128
// Create a new Excel workbook with a single sheet called "allPersons", replacing the file if it already exists -> Current sheets: allPersons
126129
df.writeExcel(file, sheetName = "allPersons")
127130
// Add a new sheet to the previous file without replacing it, by setting keepFile = true -> Current sheets: allPersons, happyPersons
128-
df.filter { person -> person.isHappy }.remove("isHappy").writeExcel(file, sheetName = "happyPersons", keepFile = true)
131+
df.filter { person -> person.isHappy }.remove("isHappy")
132+
.writeExcel(file, sheetName = "happyPersons", keepFile = true)
129133
// Add a new sheet to the previous file without replacing it, by setting keepFile = true -> Current sheets: allPersons, happyPersons, unhappyPersons
130-
df.filter { person -> !person.isHappy }.remove("isHappy").writeExcel(file, sheetName = "unhappyPersons", keepFile = true)
134+
df.filter { person -> !person.isHappy }.remove("isHappy")
135+
.writeExcel(file, sheetName = "unhappyPersons", keepFile = true)
131136
```
132137

133138
<!---END-->
@@ -203,7 +208,7 @@ df.arrowWriter(
203208
// Specify mismatch subscriber
204209
mismatchSubscriber = writeMismatchMessage,
205210

206-
).use { writer: ArrowWriter ->
211+
).use { writer: ArrowWriter ->
207212

208213
// Save to any format and sink, like in the previous example
209214
writer.writeArrowFeather(file)

plugins/kotlin-dataframe/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/read.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,10 @@ private fun resolveFile(resolutionPath: String?, path: String): File? {
9090

9191
internal class ReadDelimStr : AbstractInterpreter<PluginDataFrameSchema>() {
9292
val Arguments.text: String by arg()
93+
val Arguments.delimiter: Char by arg(defaultValue = Present(','))
9394

9495
override fun Arguments.interpret(): PluginDataFrameSchema {
95-
return DataFrame.readDelimStr(text).schema().toPluginDataFrameSchema()
96+
return DataFrame.readDelimStr(text, delimiter).schema().toPluginDataFrameSchema()
9697
}
9798
}
9899

plugins/kotlin-dataframe/testData/box/diff.fir.ir.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ FILE fqName:org.jetbrains.kotlinx.dataframe fileName:/diff.kt
4949
FUN name:box visibility:public modality:FINAL <> () returnType:kotlin.String
5050
BLOCK_BODY
5151
VAR name:df type:org.jetbrains.kotlinx.dataframe.DataFrame<*> [val]
52-
CALL 'public final fun readDelimStr (text: kotlin.String, colTypes: kotlin.collections.Map<kotlin.String, org.jetbrains.kotlinx.dataframe.io.ColType>, skipLines: kotlin.Int, readLines: kotlin.Int?): org.jetbrains.kotlinx.dataframe.DataFrame<*> declared in org.jetbrains.kotlinx.dataframe.io' type=org.jetbrains.kotlinx.dataframe.DataFrame<*> origin=null
52+
CALL 'public final fun readDelimStr (text: kotlin.String, delimiter: kotlin.Char, colTypes: kotlin.collections.Map<kotlin.String, org.jetbrains.kotlinx.dataframe.io.ColType>, skipLines: kotlin.Int, readLines: kotlin.Int?): org.jetbrains.kotlinx.dataframe.DataFrame<*> declared in org.jetbrains.kotlinx.dataframe.io' type=org.jetbrains.kotlinx.dataframe.DataFrame<*> origin=null
5353
$receiver: GET_OBJECT 'CLASS OBJECT name:Companion modality:FINAL visibility:public [companion] superTypes:[kotlin.Any]' type=org.jetbrains.kotlinx.dataframe.DataFrame.Companion
5454
text: CALL 'public final fun trimIndent (): kotlin.String declared in kotlin.text' type=kotlin.String origin=null
5555
$receiver: CONST String type=kotlin.String value="\n char,level,race,charclass,zone,guild,timestamp\n 59425,1,Orc,Rogue,Orgrimmar,165,01/01/08 00:02:04\n 65494,9,Orc,Hunter,Durotar,-1,01/01/08 00:02:04\n "

plugins/kotlin-dataframe/testData/box/flexibleReturnType.fir.ir.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ FILE fqName:org.jetbrains.kotlinx.dataframe fileName:/flexibleReturnType.kt
2727
FUN name:box visibility:public modality:FINAL <> () returnType:kotlin.String
2828
BLOCK_BODY
2929
VAR name:df type:org.jetbrains.kotlinx.dataframe.DataFrame<*> [val]
30-
CALL 'public final fun readDelimStr (text: kotlin.String, colTypes: kotlin.collections.Map<kotlin.String, org.jetbrains.kotlinx.dataframe.io.ColType>, skipLines: kotlin.Int, readLines: kotlin.Int?): org.jetbrains.kotlinx.dataframe.DataFrame<*> declared in org.jetbrains.kotlinx.dataframe.io' type=org.jetbrains.kotlinx.dataframe.DataFrame<*> origin=null
30+
CALL 'public final fun readDelimStr (text: kotlin.String, delimiter: kotlin.Char, colTypes: kotlin.collections.Map<kotlin.String, org.jetbrains.kotlinx.dataframe.io.ColType>, skipLines: kotlin.Int, readLines: kotlin.Int?): org.jetbrains.kotlinx.dataframe.DataFrame<*> declared in org.jetbrains.kotlinx.dataframe.io' type=org.jetbrains.kotlinx.dataframe.DataFrame<*> origin=null
3131
$receiver: GET_OBJECT 'CLASS OBJECT name:Companion modality:FINAL visibility:public [companion] superTypes:[kotlin.Any]' type=org.jetbrains.kotlinx.dataframe.DataFrame.Companion
3232
text: CALL 'public final fun trimIndent (): kotlin.String declared in kotlin.text' type=kotlin.String origin=null
3333
$receiver: CONST String type=kotlin.String value="\n char,level,race,charclass,zone,guild,timestamp\n 59425,1,Orc,Rogue,Orgrimmar,165,01/01/08 00:02:04\n 65494,9,Orc,Hunter,Durotar,-1,01/01/08 00:02:04\n "

0 commit comments

Comments
 (0)