Skip to content

Commit 4875411

Browse files
authored
Added some name repairing strategies (#386)
* Added tests, signatures, docs * Added implementation * Added implementation * Fixed bug and updated documentation * Fixed review
1 parent 6b43324 commit 4875411

File tree

5 files changed

+98
-6
lines changed

5 files changed

+98
-6
lines changed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package org.jetbrains.kotlinx.dataframe.io
2+
3+
/**
4+
* This strategy defines how repeated column names will be handled
5+
* during the creation of a new dataframe from IO sources.
6+
*/
7+
public enum class NameRepairStrategy {
8+
/** No actions, keep as is. */
9+
DO_NOTHING,
10+
11+
/** Check that the column names are unique and fail on a duplicate; names are never changed. */
12+
CHECK_UNIQUE,
13+
14+
/** Check that the column names are unique and rename duplicates to make them unique. */
15+
MAKE_UNIQUE
16+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package org.jetbrains.kotlinx.dataframe.io
2+
3+
/**
4+
* This strategy defines how repeated column names will be handled
5+
* during the creation of a new dataframe from IO sources.
6+
*/
7+
public enum class NameRepairStrategy {
8+
/** No actions, keep as is. */
9+
DO_NOTHING,
10+
11+
/** Check that the column names are unique and fail on a duplicate; names are never changed. */
12+
CHECK_UNIQUE,
13+
14+
/** Check that the column names are unique and rename duplicates to make them unique. */
15+
MAKE_UNIQUE
16+
}

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import org.jetbrains.kotlinx.dataframe.api.forEach
2626
import org.jetbrains.kotlinx.dataframe.api.select
2727
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
2828
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
29+
import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
2930
import java.io.File
3031
import java.io.InputStream
3132
import java.io.OutputStream
@@ -60,96 +61,114 @@ private const val readExcel = "readExcel"
6061
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
6162
* @param skipRows number of rows before header
6263
* @param rowsCount number of rows to read.
64+
* @param nameRepairStrategy handling of column names.
65+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
6366
*/
6467
public fun DataFrame.Companion.readExcel(
6568
url: URL,
6669
sheetName: String? = null,
6770
skipRows: Int = 0,
6871
columns: String? = null,
6972
rowsCount: Int? = null,
73+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
7074
): AnyFrame {
7175
val wb = WorkbookFactory.create(url.openStream())
72-
return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount) }
76+
return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
7377
}
7478

7579
/**
7680
* @param sheetName sheet to read. By default, first sheet in the document
7781
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
7882
* @param skipRows number of rows before header
7983
* @param rowsCount number of rows to read.
84+
* @param nameRepairStrategy handling of column names.
85+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
8086
*/
8187
public fun DataFrame.Companion.readExcel(
8288
file: File,
8389
sheetName: String? = null,
8490
skipRows: Int = 0,
8591
columns: String? = null,
8692
rowsCount: Int? = null,
93+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
8794
): AnyFrame {
8895
val wb = WorkbookFactory.create(file)
89-
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount) }
96+
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
9097
}
9198

9299
/**
93100
* @param sheetName sheet to read. By default, first sheet in the document
94101
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
95102
* @param skipRows number of rows before header
96103
* @param rowsCount number of rows to read.
104+
* @param nameRepairStrategy handling of column names.
105+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
97106
*/
98107
public fun DataFrame.Companion.readExcel(
99108
fileOrUrl: String,
100109
sheetName: String? = null,
101110
skipRows: Int = 0,
102111
columns: String? = null,
103112
rowsCount: Int? = null,
104-
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount)
113+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
114+
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
105115

106116
/**
107117
* @param sheetName sheet to read. By default, first sheet in the document
108118
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
109119
* @param skipRows number of rows before header
110120
* @param rowsCount number of rows to read.
121+
* @param nameRepairStrategy handling of column names.
122+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
111123
*/
112124
public fun DataFrame.Companion.readExcel(
113125
inputStream: InputStream,
114126
sheetName: String? = null,
115127
skipRows: Int = 0,
116128
columns: String? = null,
117129
rowsCount: Int? = null,
130+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
118131
): AnyFrame {
119132
val wb = WorkbookFactory.create(inputStream)
120-
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount) }
133+
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
121134
}
122135

123136
/**
124137
* @param sheetName sheet to read. By default, first sheet in the document
125138
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
126139
* @param skipRows number of rows before header
127140
* @param rowsCount number of rows to read.
141+
* @param nameRepairStrategy handling of column names.
142+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
128143
*/
129144
public fun DataFrame.Companion.readExcel(
130145
wb: Workbook,
131146
sheetName: String? = null,
132147
skipRows: Int = 0,
133148
columns: String? = null,
134149
rowsCount: Int? = null,
150+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
135151
): AnyFrame {
136152
val sheet: Sheet = sheetName
137153
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
138154
?: wb.getSheetAt(0)
139-
return readExcel(sheet, columns, skipRows, rowsCount)
155+
return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy)
140156
}
141157

142158
/**
143159
* @param sheet sheet to read.
144160
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
145161
* @param skipRows number of rows before header
146162
* @param rowsCount number of rows to read.
163+
* @param nameRepairStrategy handling of column names.
164+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
147165
*/
148166
public fun DataFrame.Companion.readExcel(
149167
sheet: Sheet,
150168
columns: String? = null,
151169
skipRows: Int = 0,
152170
rowsCount: Int? = null,
171+
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
153172
): AnyFrame {
154173
val columnIndexes: Iterable<Int> = if (columns != null) {
155174
columns.split(",").flatMap {
@@ -176,15 +195,19 @@ public fun DataFrame.Companion.readExcel(
176195
val last = rowsCount?.let { first + it - 1 } ?: sheet.lastRowNum
177196
val valueRowsRange = (first..last)
178197

198+
val columnNameCounters = mutableMapOf<String, Int>()
179199
val columns = columnIndexes.map { index ->
180200
val headerCell = headerRow?.getCell(index)
181-
val name = if (headerCell?.cellType == CellType.NUMERIC) {
201+
val nameFromCell = if (headerCell?.cellType == CellType.NUMERIC) {
182202
headerCell.numericCellValue.toString() // Support numeric-named columns
183203
} else {
184204
headerCell?.stringCellValue
185205
?: CellReference.convertNumToColString(index) // Use Excel column names if no data
186206
}
187207

208+
val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
209+
columnNameCounters[nameFromCell] = columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
210+
188211
val values: List<Any?> = valueRowsRange.map {
189212
val row: Row? = sheet.getRow(it)
190213
val cell: Cell? = row?.getCell(index)
@@ -195,6 +218,31 @@ public fun DataFrame.Companion.readExcel(
195218
return dataFrameOf(columns)
196219
}
197220

221+
/**
 * Resolves the name a column will get, according to [nameRepairStrategy].
 *
 * This is a universal function for name repairing
 * and should be moved to the API module later,
 * when the functionality is enabled for all IO sources.
 *
 * TODO: https://github.com/Kotlin/dataframe/issues/387
 *
 * @param nameFromCell name taken from the header cell (or the fallback chosen by the caller).
 * @param columnNameCounters names registered so far. The caller adds one to the entry of
 * [nameFromCell] after every call. With [NameRepairStrategy.MAKE_UNIQUE] this function also
 * registers every name it returns, so that no later column can be given the same name.
 * @param nameRepairStrategy strategy to apply.
 * @return the name to use for the column.
 * @throws DuplicateColumnNamesException with [NameRepairStrategy.CHECK_UNIQUE],
 * when [nameFromCell] has already been registered.
 */
private fun repairNameIfRequired(
    nameFromCell: String,
    columnNameCounters: MutableMap<String, Int>,
    nameRepairStrategy: NameRepairStrategy,
): String =
    when (nameRepairStrategy) {
        NameRepairStrategy.DO_NOTHING -> nameFromCell

        NameRepairStrategy.CHECK_UNIQUE ->
            if (nameFromCell in columnNameCounters) {
                // The key set holds every name only once, so the repeated name is appended
                // to make the reported list actually contain the duplicate.
                throw DuplicateColumnNamesException(columnNameCounters.keys.toList() + nameFromCell)
            } else {
                nameFromCell
            }

        NameRepairStrategy.MAKE_UNIQUE -> {
            // An empty header is replaced by a placeholder, but it is still counted by the
            // caller under its raw (empty) name, so the counter is looked up by nameFromCell.
            val baseName = nameFromCell.ifEmpty { "Unknown column" }
            var suffix = columnNameCounters[nameFromCell] ?: 0
            var candidate = if (suffix == 0) baseName else "$baseName$suffix"

            // Skip names that are already taken, either by an earlier header or by an
            // earlier repaired name (e.g. headers "a", "a", "a1" must not yield "a1" twice).
            while (candidate in columnNameCounters) {
                suffix++
                candidate = "$baseName$suffix"
            }

            // Reserve the returned name; zero keeps the caller's occurrence counting intact.
            columnNameCounters[candidate] = 0
            candidate
        }
    }
245+
198246
private fun Cell?.cellValue(sheetName: String): Any? =
199247
when (this?.cellType) {
200248
CellType._NONE -> error("Cell $address of sheet $sheetName has a CellType that should only be used internally. This is a bug, please report https://github.com/Kotlin/dataframe/issues")

dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
88
import org.jetbrains.kotlinx.dataframe.api.concat
99
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
1010
import org.jetbrains.kotlinx.dataframe.api.toColumn
11+
import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
1112
import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
1213
import org.jetbrains.kotlinx.dataframe.size
1314
import org.junit.Test
@@ -109,4 +110,15 @@ class XlsxTest {
109110
DataFrame.readExcel(testResource("xlsx6.xlsx"), skipRows = 4)
110111
}
111112
}
113+
114+
@Test
fun `read xlsx file with duplicated columns and repair column names`() {
    val fileName = "iris_duplicated_column.xlsx"

    // Without an explicit strategy the duplicated headers must be rejected.
    shouldThrow<DuplicateColumnNamesException> {
        DataFrame.readExcel(testResource(fileName))
    }

    // MAKE_UNIQUE keeps the first occurrence of a name and numbers the following ones.
    val expectedNames = listOf(
        "Sepal.Length",
        "Sepal.Width",
        "C",
        "Petal.Length",
        "Petal.Width",
        "Species",
        "Other.Width",
        "Species1",
        "I",
        "Other.Width1",
        "Species2",
    )
    val repaired = DataFrame.readExcel(
        testResource(fileName),
        nameRepairStrategy = NameRepairStrategy.MAKE_UNIQUE,
    )
    repaired.columnNames() shouldBe expectedNames
}
112124
}
Binary file not shown.

0 commit comments

Comments
 (0)