Add an option to read Excel cell values as a String regardless of their content type

koperagen · koperagen · commit 2e97bd211b85 · 2024-06-20T16:14:48.000+03:00
fixes #669
diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt
@@ -6,6 +6,7 @@ import kotlinx.datetime.toKotlinLocalDateTime
 import org.apache.poi.hssf.usermodel.HSSFWorkbook
 import org.apache.poi.ss.usermodel.Cell
 import org.apache.poi.ss.usermodel.CellType
+import org.apache.poi.ss.usermodel.DataFormatter
 import org.apache.poi.ss.usermodel.DateUtil
 import org.apache.poi.ss.usermodel.RichTextString
 import org.apache.poi.ss.usermodel.Row
@@ -83,6 +84,8 @@ private fun setWorkbookTempDirectory() {
 /**
  * @param sheetName sheet to read. By default, the first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param stringColumns range of columns to read as String regardless of the cell type.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -93,17 +96,22 @@ public fun DataFrame.Companion.readExcel(
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
+    stringColumns: StringColumns? = null,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     setWorkbookTempDirectory()
     val wb = WorkbookFactory.create(url.openStream())
-    return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
+    return wb.use {
+        readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
+    }
 }
 
 /**
  * @param sheetName sheet to read. By default, the first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param stringColumns range of columns to read as String regardless of the cell type.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -114,17 +122,22 @@ public fun DataFrame.Companion.readExcel(
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
+    stringColumns: StringColumns? = null,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     setWorkbookTempDirectory()
     val wb = WorkbookFactory.create(file)
-    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
+    return wb.use {
+        readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
+    }
 }
 
 /**
  * @param sheetName sheet to read. By default, the first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param stringColumns range of columns to read as String regardless of the cell type.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -135,13 +148,17 @@ public fun DataFrame.Companion.readExcel(
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
+    stringColumns: StringColumns? = null,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
-): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
+): AnyFrame =
+    readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy)
 
 /**
  * @param sheetName sheet to read. By default, the first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param stringColumns range of columns to read as String regardless of the cell type.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -152,17 +169,23 @@ public fun DataFrame.Companion.readExcel(
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
+    stringColumns: StringColumns? = null,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     setWorkbookTempDirectory()
     val wb = WorkbookFactory.create(inputStream)
-    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
+    return wb.use {
+        readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
+    }
 }
 
 /**
  * @param sheetName sheet to read. By default, the first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param formattingOptions columns to read as String regardless of the cell type, and the formatter used to render them.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
+ * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -173,18 +196,37 @@ public fun DataFrame.Companion.readExcel(
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
+    formattingOptions: FormattingOptions? = null,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val sheet: Sheet = sheetName
         ?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
         ?: wb.getSheetAt(0)
-    return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy)
+    return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy)
+}
+
+/** Selects the columns that `readExcel` reads as String regardless of the cell type.
+ * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ */
+public class StringColumns(public val range: String)
+
+public fun StringColumns.toFormattingOptions(formatter: DataFormatter = DataFormatter()): FormattingOptions =
+    FormattingOptions(range, formatter) // same column range, paired with the formatter that renders the cells
+
+/** Columns to read as String together with the [formatter] applied to their cells.
+ * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param formatter produces the String value of each cell in those columns (via [DataFormatter.formatCellValue]) */
+public class FormattingOptions(range: String, public val formatter: DataFormatter = DataFormatter()) {
+    public val columnIndices: Set<Int> = getColumnIndices(range).toSet()
 }
 
 /**
  * @param sheet sheet to read.
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
+ * @param formattingOptions columns to read as String regardless of the cell type, and the formatter used to render them.
+ * For example, by default a numeric cell with value "3" is parsed as a Double with value 3.0. With this option, it is read as the String "3".
+ * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
  * @param nameRepairStrategy handling of column names.
@@ -193,19 +235,13 @@ public fun DataFrame.Companion.readExcel(
 public fun DataFrame.Companion.readExcel(
     sheet: Sheet,
     columns: String? = null,
+    formattingOptions: FormattingOptions? = null,
     skipRows: Int = 0,
     rowsCount: Int? = null,
     nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val columnIndexes: Iterable<Int> = if (columns != null) {
-        columns.split(",").flatMap {
-            if (it.contains(":")) {
-                val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) }
-                start..end
-            } else {
-                listOf(CellReference.convertColStringToIndex(it))
-            }
-        }
+        getColumnIndices(columns)
     } else {
         val headerRow = checkNotNull(sheet.getRow(skipRows)) {
             "Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
@@ -235,17 +271,32 @@ public fun DataFrame.Companion.readExcel(
         val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
         columnNameCounters[nameFromCell] =
             columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
+        val getCellValue: (Cell?) -> Any? = when {
+            formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? ->
+                formattingOptions.formatter.formatCellValue(cell)
+            }
 
+            else -> { cell -> cell.cellValue(sheet.sheetName) }
+        }
         val values: List<Any?> = valueRowsRange.map {
             val row: Row? = sheet.getRow(it)
             val cell: Cell? = row?.getCell(index)
-            cell.cellValue(sheet.sheetName)
+            getCellValue(cell)
         }
         DataColumn.createWithTypeInference(name, values)
     }
     return dataFrameOf(columns)
 }
 
+// Parses a comma separated list of Excel column letters and ranges (e.g. "A:E" or "A,C,E:F")
+// into column indices; whitespace around the letters is ignored.
+private fun getColumnIndices(columns: String): List<Int> = columns.split(",").flatMap { token ->
+    // Trim every letter group so that "A, C" or "A : C" resolves like "A,C" / "A:C".
+    val bounds = token.split(":").map { CellReference.convertColStringToIndex(it.trim()) }
+    // A lone letter gives one index; "X:Y" gives the inclusive range between both columns.
+    if (bounds.size == 1) bounds else bounds[0]..bounds[1]
+}
+
 /**
  * This is a universal function for name repairing
  * and should be moved to the API module later,
@@ -324,7 +375,7 @@ public fun <T> DataFrame<T>.writeExcel(
     keepFile: Boolean = false,
 ) {
     val factory =
-        if (keepFile){
+        if (keepFile) {
             when (workBookType) {
                 WorkBookType.XLS -> HSSFWorkbook(file.inputStream())
                 WorkBookType.XLSX -> XSSFWorkbook(file.inputStream())
diff --git a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt
@@ -45,6 +45,17 @@ class XlsxTest {
         df shouldBe dataFrameOf("col1", "col2", "C")(1.0, null, 3.0)
     }
 
+    @Test
+    fun `column with empty header and with formatting`() {
+        val df = DataFrame.readExcel(
+            testResource("sample2.xlsx"),
+            "Sheet1",
+            columns = "A:C",
+            stringColumns = StringColumns("A:C") // every selected column is read as String
+        )
+        df shouldBe dataFrameOf("col1", "col2", "C")("1", "", "3") // all values are Strings; the middle one is "" rather than null
+    }
+
     @Test
     fun `limit row number`() {
         val df = DataFrame.readExcel(testResource("sample4.xls"), "Sheet1", rowsCount = 5)
@@ -179,4 +190,14 @@ class XlsxTest {
         val df = DataFrame.readExcel(testResource("formula_cell.xlsx"))
         df.columnNames() shouldBe listOf("Number", "Greater than 5", "Multiplied by 10", "Divided by 5")
     }
+
+    @Test
+    fun `read mixed column`() {
+        val df = DataFrame.readExcel(
+            testResource("mixed_column.xlsx"),
+            stringColumns = StringColumns("A") // read column A as String whatever its cell types are
+        )
+        df["col1"].type() shouldBe typeOf<String>() // the column type itself must be String
+        df shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100")
+    }
 }
diff --git a/dataframe-excel/src/test/resources/mixed_column.xlsx b/dataframe-excel/src/test/resources/mixed_column.xlsx
diff --git a/docs/StardustDocs/topics/read.md b/docs/StardustDocs/topics/read.md
@@ -401,28 +401,20 @@ Sometimes cells can have the wrong format in an Excel file. For example, you exp
 
 ```text
 IDS
-100 <-- Intended to be String, but has wrong cell format in original .xlsx file
+100 <-- Intended to be String, but has numeric cell format in original .xlsx file
 A100
 B100
 C100
 ```
 
 You will get column of `Serializable` instead (common parent for `Double` and `String`).
 
-You can fix it using the `.convert()` function:
+You can fix it by passing the `stringColumns` parameter to `readExcel`:
 
 <!---FUN fixMixedColumn-->
 
 ```kotlin
-val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
-val df1 = df.convert("IDS").with(Infer.Type) {
-    if (it is Double) {
-        it.toLong().toString()
-    } else {
-        it
-    }
-}
-df1["IDS"].type() shouldBe typeOf<String>()
+val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
 ```
 
 <!---END-->
diff --git a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt
@@ -3,20 +3,19 @@ package org.jetbrains.kotlinx.dataframe.samples.api
 import io.kotest.matchers.shouldBe
 import org.jetbrains.kotlinx.dataframe.DataFrame
 import org.jetbrains.kotlinx.dataframe.DataRow
-import org.jetbrains.kotlinx.dataframe.api.Infer
 import org.jetbrains.kotlinx.dataframe.api.ParserOptions
 import org.jetbrains.kotlinx.dataframe.api.columnNames
 import org.jetbrains.kotlinx.dataframe.api.columnTypes
-import org.jetbrains.kotlinx.dataframe.api.convert
-import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
-import org.jetbrains.kotlinx.dataframe.api.with
 import org.jetbrains.kotlinx.dataframe.io.ColType
+import org.jetbrains.kotlinx.dataframe.io.StringColumns
 import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
 import org.jetbrains.kotlinx.dataframe.io.readCSV
+import org.jetbrains.kotlinx.dataframe.io.readExcel
 import org.jetbrains.kotlinx.dataframe.io.readJson
 import org.jetbrains.kotlinx.dataframe.testArrowFeather
 import org.jetbrains.kotlinx.dataframe.testCsv
 import org.jetbrains.kotlinx.dataframe.testJson
+import org.junit.Ignore
 import org.junit.Test
 import java.util.*
 import kotlin.reflect.typeOf
@@ -63,17 +62,10 @@ class Read {
     }
 
     @Test
+    @Ignore
     fun fixMixedColumn() {
         // SampleStart
-        val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
-        val df1 = df.convert("IDS").with(Infer.Type) {
-            if (it is Double) {
-                it.toLong().toString()
-            } else {
-                it
-            }
-        }
-        df1["IDS"].type() shouldBe typeOf<String>()
+        val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
         // SampleEnd
     }