Added some name repairing strategies (#386)

zaleslaw · web-flow · commit 4875411e8724 · 2023-06-06T16:17:08.000+02:00
* Added tests, signatures, docs

* Added implementation

* Added implementation

* Fixed bug and updated documentation

* Fixed review
diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/NameRepairStrategy.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/NameRepairStrategy.kt
@@ -0,0 +1,16 @@
+package org.jetbrains.kotlinx.dataframe.io
+
+/**
+ * Defines how repeated column names are handled
+ * when a new dataframe is created from an IO source.
+ */
+public enum class NameRepairStrategy {
+    /** Take no action: column names are kept as is. */
+    DO_NOTHING,
+
+    /** Check that the column names are unique, without repairing them. */
+    CHECK_UNIQUE,
+
+    /** Check that the column names are unique and repair any repeated ones. */
+    MAKE_UNIQUE
+}
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/NameRepairStrategy.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/NameRepairStrategy.kt
@@ -0,0 +1,16 @@
+package org.jetbrains.kotlinx.dataframe.io
+
+/**
+ * Defines how repeated column names are handled
+ * when a new dataframe is created from an IO source.
+ */
+public enum class NameRepairStrategy {
+    /** Take no action: column names are kept as is. */
+    DO_NOTHING,
+
+    /** Check that the column names are unique, without repairing them. */
+    CHECK_UNIQUE,
+
+    /** Check that the column names are unique and repair any repeated ones. */
+    MAKE_UNIQUE
+}
diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt
@@ -26,6 +26,7 @@ import org.jetbrains.kotlinx.dataframe.api.forEach
 import org.jetbrains.kotlinx.dataframe.api.select
 import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
 import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
+import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
 import java.io.File
 import java.io.InputStream
 import java.io.OutputStream
@@ -60,96 +61,114 @@ private const val readExcel = "readExcel"
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     url: URL,
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
     rowsCount: Int? = null,
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val wb = WorkbookFactory.create(url.openStream())
-    return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount) }
+    return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
 }
 
 /**
  * @param sheetName sheet to read. By default, first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     file: File,
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
     rowsCount: Int? = null,
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val wb = WorkbookFactory.create(file)
-    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount) }
+    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
 }
 
 /**
  * @param sheetName sheet to read. By default, first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     fileOrUrl: String,
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
     rowsCount: Int? = null,
-): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount)
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
+): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
 
 /**
  * @param sheetName sheet to read. By default, first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     inputStream: InputStream,
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
     rowsCount: Int? = null,
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val wb = WorkbookFactory.create(inputStream)
-    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount) }
+    return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
 }
 
 /**
  * @param sheetName sheet to read. By default, first sheet in the document
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     wb: Workbook,
     sheetName: String? = null,
     skipRows: Int = 0,
     columns: String? = null,
     rowsCount: Int? = null,
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val sheet: Sheet = sheetName
         ?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
         ?: wb.getSheetAt(0)
-    return readExcel(sheet, columns, skipRows, rowsCount)
+    return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy)
 }
 
 /**
  * @param sheet sheet to read.
  * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
  * @param skipRows number of rows before header
  * @param rowsCount number of rows to read.
+ * @param nameRepairStrategy how repeated column names in the header row are handled.
+ * Defaults to [NameRepairStrategy.CHECK_UNIQUE], which throws [DuplicateColumnNamesException] on a repeated name.
  */
 public fun DataFrame.Companion.readExcel(
     sheet: Sheet,
     columns: String? = null,
     skipRows: Int = 0,
     rowsCount: Int? = null,
+    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
 ): AnyFrame {
     val columnIndexes: Iterable<Int> = if (columns != null) {
         columns.split(",").flatMap {
@@ -176,15 +195,19 @@ public fun DataFrame.Companion.readExcel(
     val last = rowsCount?.let { first + it - 1 } ?: sheet.lastRowNum
     val valueRowsRange = (first..last)
 
+    val columnNameCounters = mutableMapOf<String, Int>()
     val columns = columnIndexes.map { index ->
         val headerCell = headerRow?.getCell(index)
-        val name = if (headerCell?.cellType == CellType.NUMERIC) {
+        val nameFromCell = if (headerCell?.cellType == CellType.NUMERIC) {
             headerCell.numericCellValue.toString() // Support numeric-named columns
         } else {
             headerCell?.stringCellValue
                 ?: CellReference.convertNumToColString(index) // Use Excel column names if no data
         }
 
+        val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
+        columnNameCounters[nameFromCell] = columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
+
         val values: List<Any?> = valueRowsRange.map {
             val row: Row? = sheet.getRow(it)
             val cell: Cell? = row?.getCell(index)
@@ -195,6 +218,31 @@ public fun DataFrame.Companion.readExcel(
     return dataFrameOf(columns)
 }
 
+/**
+ * Applies [nameRepairStrategy] to the header name [nameFromCell]. [columnNameCounters] maps each name
+ * seen so far to its number of occurrences; the caller increments the entry for [nameFromCell] afterwards.
+ * DO_NOTHING returns the name as is, CHECK_UNIQUE throws [DuplicateColumnNamesException] on a repeated name,
+ * MAKE_UNIQUE appends a numeric suffix to a repeated name and records the generated name in the map.
+ * Universal helper: should move to the API module once name repairing is enabled for all IO sources.
+ * TODO: https://github.com/Kotlin/dataframe/issues/387
+ */
+private fun repairNameIfRequired(nameFromCell: String, columnNameCounters: MutableMap<String, Int>, nameRepairStrategy: NameRepairStrategy): String {
+    return when (nameRepairStrategy) {
+        NameRepairStrategy.DO_NOTHING -> nameFromCell
+        // Append the repeated name: the map keys alone are distinct, so the reported list would contain no duplicate.
+        NameRepairStrategy.CHECK_UNIQUE -> if (nameFromCell in columnNameCounters) throw DuplicateColumnNamesException(columnNameCounters.keys.toList() + nameFromCell) else nameFromCell
+        NameRepairStrategy.MAKE_UNIQUE -> {
+            val baseName = nameFromCell.ifEmpty { "Unknown column" } // presumably "" comes from a blank header cell — TODO confirm
+            // Occurrences are counted under nameFromCell (not baseName), so that is the key to look up.
+            var suffix = columnNameCounters[nameFromCell] ?: 0
+            var candidate = if (suffix == 0) baseName else "$baseName$suffix"
+            while (candidate in columnNameCounters) candidate = "$baseName${++suffix}" // skip names already taken
+            if (candidate != nameFromCell) columnNameCounters[candidate] = 1 // reserve the generated name
+            candidate
+        }
+    }
+}
+
 private fun Cell?.cellValue(sheetName: String): Any? =
     when (this?.cellType) {
         CellType._NONE -> error("Cell $address of sheet $sheetName has a CellType that should only be used internally. This is a bug, please report https://github.com/Kotlin/dataframe/issues")
diff --git a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt
@@ -8,6 +8,7 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
 import org.jetbrains.kotlinx.dataframe.api.concat
 import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
 import org.jetbrains.kotlinx.dataframe.api.toColumn
+import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
 import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
 import org.jetbrains.kotlinx.dataframe.size
 import org.junit.Test
@@ -109,4 +110,15 @@ class XlsxTest {
             DataFrame.readExcel(testResource("xlsx6.xlsx"), skipRows = 4)
         }
     }
+
+    @Test
+    fun `read xlsx file with duplicated columns and repair column names`() {
+        // Default strategy (CHECK_UNIQUE) rejects the duplicated headers; MAKE_UNIQUE suffixes the repetitions.
+        shouldThrow<DuplicateColumnNamesException> { DataFrame.readExcel(testResource("iris_duplicated_column.xlsx")) }
+
+        val repaired = DataFrame.readExcel(testResource("iris_duplicated_column.xlsx"), nameRepairStrategy = NameRepairStrategy.MAKE_UNIQUE)
+        val expectedNames = listOf("Sepal.Length", "Sepal.Width", "C", "Petal.Length", "Petal.Width", "Species",
+            "Other.Width", "Species1", "I", "Other.Width1", "Species2")
+        repaired.columnNames() shouldBe expectedNames
+    }
 }
diff --git a/dataframe-excel/src/test/resources/iris_duplicated_column.xlsx b/dataframe-excel/src/test/resources/iris_duplicated_column.xlsx