Skip to content

Commit 2e3acfe

Browse files
authored
Merge pull request #745 from Kotlin/excel-string-value
Add an option to read Excel cell values as a String regardless of their content type
2 parents 9d5f0a7 + c980e81 commit 2e3acfe

File tree

5 files changed

+97
-39
lines changed
  • dataframe-excel/src
  • docs/StardustDocs/topics
  • tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api

5 files changed

+97
-39
lines changed

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 68 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import kotlinx.datetime.toKotlinLocalDateTime
66
import org.apache.poi.hssf.usermodel.HSSFWorkbook
77
import org.apache.poi.ss.usermodel.Cell
88
import org.apache.poi.ss.usermodel.CellType
9+
import org.apache.poi.ss.usermodel.DataFormatter
910
import org.apache.poi.ss.usermodel.DateUtil
1011
import org.apache.poi.ss.usermodel.RichTextString
1112
import org.apache.poi.ss.usermodel.Row
@@ -83,6 +84,8 @@ private fun setWorkbookTempDirectory() {
8384
/**
8485
* @param sheetName sheet to read. By default, the first sheet in the document
8586
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
87+
* @param stringColumns range of columns to read as String regardless of a cell type.
88+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
8689
* @param skipRows number of rows before header
8790
* @param rowsCount number of rows to read.
8891
* @param nameRepairStrategy handling of column names.
@@ -93,17 +96,22 @@ public fun DataFrame.Companion.readExcel(
9396
sheetName: String? = null,
9497
skipRows: Int = 0,
9598
columns: String? = null,
99+
stringColumns: StringColumns? = null,
96100
rowsCount: Int? = null,
97101
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
98102
): AnyFrame {
99103
setWorkbookTempDirectory()
100104
val wb = WorkbookFactory.create(url.openStream())
101-
return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
105+
return wb.use {
106+
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
107+
}
102108
}
103109

104110
/**
105111
* @param sheetName sheet to read. By default, the first sheet in the document
106112
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
113+
* @param stringColumns range of columns to read as String regardless of a cell type.
114+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
107115
* @param skipRows number of rows before header
108116
* @param rowsCount number of rows to read.
109117
* @param nameRepairStrategy handling of column names.
@@ -114,17 +122,22 @@ public fun DataFrame.Companion.readExcel(
114122
sheetName: String? = null,
115123
skipRows: Int = 0,
116124
columns: String? = null,
125+
stringColumns: StringColumns? = null,
117126
rowsCount: Int? = null,
118127
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
119128
): AnyFrame {
120129
setWorkbookTempDirectory()
121130
val wb = WorkbookFactory.create(file)
122-
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
131+
return wb.use {
132+
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
133+
}
123134
}
124135

125136
/**
126137
* @param sheetName sheet to read. By default, the first sheet in the document
127138
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
139+
* @param stringColumns range of columns to read as String regardless of a cell type.
140+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
128141
* @param skipRows number of rows before header
129142
* @param rowsCount number of rows to read.
130143
* @param nameRepairStrategy handling of column names.
@@ -135,13 +148,17 @@ public fun DataFrame.Companion.readExcel(
135148
sheetName: String? = null,
136149
skipRows: Int = 0,
137150
columns: String? = null,
151+
stringColumns: StringColumns? = null,
138152
rowsCount: Int? = null,
139153
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
140-
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
154+
): AnyFrame =
155+
readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy)
141156

142157
/**
143158
* @param sheetName sheet to read. By default, the first sheet in the document
144159
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
160+
* @param stringColumns range of columns to read as String regardless of a cell type.
161+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
145162
* @param skipRows number of rows before header
146163
* @param rowsCount number of rows to read.
147164
* @param nameRepairStrategy handling of column names.
@@ -152,17 +169,23 @@ public fun DataFrame.Companion.readExcel(
152169
sheetName: String? = null,
153170
skipRows: Int = 0,
154171
columns: String? = null,
172+
stringColumns: StringColumns? = null,
155173
rowsCount: Int? = null,
156174
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
157175
): AnyFrame {
158176
setWorkbookTempDirectory()
159177
val wb = WorkbookFactory.create(inputStream)
160-
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
178+
return wb.use {
179+
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
180+
}
161181
}
162182

163183
/**
164184
* @param sheetName sheet to read. By default, the first sheet in the document
165185
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
186+
* @param formattingOptions columns to read as String regardless of a cell type, together with the [DataFormatter] used to format them.
187+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
188+
* See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
166189
* @param skipRows number of rows before header
167190
* @param rowsCount number of rows to read.
168191
* @param nameRepairStrategy handling of column names.
@@ -173,18 +196,39 @@ public fun DataFrame.Companion.readExcel(
173196
sheetName: String? = null,
174197
skipRows: Int = 0,
175198
columns: String? = null,
199+
formattingOptions: FormattingOptions? = null,
176200
rowsCount: Int? = null,
177201
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
178202
): AnyFrame {
179203
val sheet: Sheet = sheetName
180204
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
181205
?: wb.getSheetAt(0)
182-
return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy)
206+
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy)
207+
}
208+
209+
/**
 * Type-safe wrapper for the `stringColumns` argument of `readExcel`.
 *
 * @property range comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F")
 */
@JvmInline
public value class StringColumns(
    public val range: String,
)
214+
215+
/**
 * Builds [FormattingOptions] from this instance's [StringColumns.range] and the given [formatter].
 *
 * @param formatter forwarded unchanged to [FormattingOptions]; a new [DataFormatter] by default
 */
public fun StringColumns.toFormattingOptions(formatter: DataFormatter = DataFormatter()): FormattingOptions {
    return FormattingOptions(range = this.range, formatter = formatter)
}
217+
218+
/**
 * Selects the columns whose cells are read through [formatter] instead of by their cell type.
 *
 * @param range comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F")
 * @param formatter [DataFormatter] whose `formatCellValue` is applied to cells of the selected columns
 */
public class FormattingOptions(
    range: String,
    public val formatter: DataFormatter = DataFormatter(),
) {
    /** Distinct column indices obtained by passing the constructor's `range` to `getColumnIndices`. */
    public val columnIndices: Set<Int>

    init {
        columnIndices = getColumnIndices(range).toSet()
    }
}
184225

185226
/**
186227
* @param sheet sheet to read.
187228
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
229+
* @param formattingOptions columns to read as String regardless of a cell's type, together with the [DataFormatter] used to format them.
230+
* For example, by default a numeric cell with value 3 is parsed as a Double (3.0). With this option, it is read as the String "3".
231+
* See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
188232
* @param skipRows number of rows before header
189233
* @param rowsCount number of rows to read.
190234
* @param nameRepairStrategy handling of column names.
@@ -193,19 +237,13 @@ public fun DataFrame.Companion.readExcel(
193237
public fun DataFrame.Companion.readExcel(
194238
sheet: Sheet,
195239
columns: String? = null,
240+
formattingOptions: FormattingOptions? = null,
196241
skipRows: Int = 0,
197242
rowsCount: Int? = null,
198243
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
199244
): AnyFrame {
200245
val columnIndexes: Iterable<Int> = if (columns != null) {
201-
columns.split(",").flatMap {
202-
if (it.contains(":")) {
203-
val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) }
204-
start..end
205-
} else {
206-
listOf(CellReference.convertColStringToIndex(it))
207-
}
208-
}
246+
getColumnIndices(columns)
209247
} else {
210248
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
211249
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
@@ -235,17 +273,32 @@ public fun DataFrame.Companion.readExcel(
235273
val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
236274
columnNameCounters[nameFromCell] =
237275
columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
276+
val getCellValue: (Cell?) -> Any? = when {
277+
formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? ->
278+
formattingOptions.formatter.formatCellValue(cell)
279+
}
238280

281+
else -> { cell -> cell.cellValue(sheet.sheetName) }
282+
}
239283
val values: List<Any?> = valueRowsRange.map {
240284
val row: Row? = sheet.getRow(it)
241285
val cell: Cell? = row?.getCell(index)
242-
cell.cellValue(sheet.sheetName)
286+
getCellValue(cell)
243287
}
244288
DataColumn.createWithTypeInference(name, values)
245289
}
246290
return dataFrameOf(columns)
247291
}
248292

293+
/**
 * Converts a comma separated list of Excel column letters and column ranges
 * (e.g. "A:E" or "A,C,E:F") into the list of column indices returned by
 * [CellReference.convertColStringToIndex].
 *
 * Whitespace around each letter is ignored, so "A, C, E : F" is accepted.
 * A range whose start lies after its end (e.g. "E:A") contributes no indices.
 *
 * @param columns the column specification to parse
 * @return indices in the order they are listed; duplicates are kept
 * @throws IllegalArgumentException if an item is blank or is not of the form "COL" or "START:END"
 */
private fun getColumnIndices(columns: String): List<Int> = columns.split(",").flatMap { item ->
    // Trim every bound before conversion: the raw pieces of "A, C" still carry the space after the comma.
    val bounds = item.split(":").map { it.trim() }
    require(bounds.size in 1..2 && bounds.none { it.isEmpty() }) {
        "Invalid column reference '$item' in '$columns': expected a column letter or a START:END range"
    }
    // A single letter is handled as the one-element range START..START.
    val start = CellReference.convertColStringToIndex(bounds.first())
    val end = CellReference.convertColStringToIndex(bounds.last())
    start..end
}
301+
249302
/**
250303
* This is a universal function for name repairing
251304
* and should be moved to the API module later,
@@ -324,7 +377,7 @@ public fun <T> DataFrame<T>.writeExcel(
324377
keepFile: Boolean = false,
325378
) {
326379
val factory =
327-
if (keepFile){
380+
if (keepFile) {
328381
when (workBookType) {
329382
WorkBookType.XLS -> HSSFWorkbook(file.inputStream())
330383
WorkBookType.XLSX -> XSSFWorkbook(file.inputStream())

dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,17 @@ class XlsxTest {
4545
df shouldBe dataFrameOf("col1", "col2", "C")(1.0, null, 3.0)
4646
}
4747

48+
// Reads columns A:C of sample2.xlsx with all three forced to String;
// the expected row holds "1", an empty string and "3".
@Test
fun `column with empty header and with formatting`() {
    val range = "A:C"
    val expected = dataFrameOf("col1", "col2", "C")("1", "", "3")

    val actual = DataFrame.readExcel(
        testResource("sample2.xlsx"),
        "Sheet1",
        columns = range,
        stringColumns = StringColumns(range),
    )

    actual shouldBe expected
}
58+
4859
@Test
4960
fun `limit row number`() {
5061
val df = DataFrame.readExcel(testResource("sample4.xls"), "Sheet1", rowsCount = 5)
@@ -179,4 +190,14 @@ class XlsxTest {
179190
val df = DataFrame.readExcel(testResource("formula_cell.xlsx"))
180191
df.columnNames() shouldBe listOf("Number", "Greater than 5", "Multiplied by 10", "Divided by 5")
181192
}
193+
194+
// Column A of mixed_column.xlsx is requested as String, so the whole column
// is expected to have type String, including the value "100".
@Test
fun `read mixed column`() {
    val asStrings = StringColumns("A")
    val frame = DataFrame.readExcel(testResource("mixed_column.xlsx"), stringColumns = asStrings)

    frame["col1"].type() shouldBe typeOf<String>()
    frame shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100")
}
182203
}
Binary file not shown.

docs/StardustDocs/topics/read.md

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -401,28 +401,20 @@ Sometimes cells can have the wrong format in an Excel file. For example, you exp
401401

402402
```text
403403
IDS
404-
100 <-- Intended to be String, but has wrong cell format in original .xlsx file
404+
100 <-- Intended to be String, but has numeric cell format in original .xlsx file
405405
A100
406406
B100
407407
C100
408408
```
409409

410410
You will get a column of `Serializable` instead (the common parent of `Double` and `String`).
411411

412-
You can fix it using the `.convert()` function:
412+
You can fix it by providing an additional parameter:
413413

414414
<!---FUN fixMixedColumn-->
415415

416416
```kotlin
417-
val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
418-
val df1 = df.convert("IDS").with(Infer.Type) {
419-
if (it is Double) {
420-
it.toLong().toString()
421-
} else {
422-
it
423-
}
424-
}
425-
df1["IDS"].type() shouldBe typeOf<String>()
417+
val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
426418
```
427419

428420
<!---END-->

tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,19 @@ package org.jetbrains.kotlinx.dataframe.samples.api
33
import io.kotest.matchers.shouldBe
44
import org.jetbrains.kotlinx.dataframe.DataFrame
55
import org.jetbrains.kotlinx.dataframe.DataRow
6-
import org.jetbrains.kotlinx.dataframe.api.Infer
76
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
87
import org.jetbrains.kotlinx.dataframe.api.columnNames
98
import org.jetbrains.kotlinx.dataframe.api.columnTypes
10-
import org.jetbrains.kotlinx.dataframe.api.convert
11-
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
12-
import org.jetbrains.kotlinx.dataframe.api.with
139
import org.jetbrains.kotlinx.dataframe.io.ColType
10+
import org.jetbrains.kotlinx.dataframe.io.StringColumns
1411
import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
1512
import org.jetbrains.kotlinx.dataframe.io.readCSV
13+
import org.jetbrains.kotlinx.dataframe.io.readExcel
1614
import org.jetbrains.kotlinx.dataframe.io.readJson
1715
import org.jetbrains.kotlinx.dataframe.testArrowFeather
1816
import org.jetbrains.kotlinx.dataframe.testCsv
1917
import org.jetbrains.kotlinx.dataframe.testJson
18+
import org.junit.Ignore
2019
import org.junit.Test
2120
import java.util.*
2221
import kotlin.reflect.typeOf
@@ -63,17 +62,10 @@ class Read {
6362
}
6463

6564
@Test
65+
@Ignore
6666
fun fixMixedColumn() {
6767
// SampleStart
68-
val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
69-
val df1 = df.convert("IDS").with(Infer.Type) {
70-
if (it is Double) {
71-
it.toLong().toString()
72-
} else {
73-
it
74-
}
75-
}
76-
df1["IDS"].type() shouldBe typeOf<String>()
68+
val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
7769
// SampleEnd
7870
}
7971

0 commit comments

Comments
 (0)