@@ -26,6 +26,7 @@ import org.jetbrains.kotlinx.dataframe.api.forEach
26
26
import org.jetbrains.kotlinx.dataframe.api.select
27
27
import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod
28
28
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
29
+ import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
29
30
import java.io.File
30
31
import java.io.InputStream
31
32
import java.io.OutputStream
@@ -60,96 +61,114 @@ private const val readExcel = "readExcel"
60
61
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
61
62
* @param skipRows number of rows before header
62
63
* @param rowsCount number of rows to read.
64
+ * @param nameRepairStrategy handling of column names.
65
+ * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
63
66
*/
64
67
public fun DataFrame.Companion.readExcel(
    url: URL,
    sheetName: String? = null,
    skipRows: Int = 0,
    columns: String? = null,
    rowsCount: Int? = null,
    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame =
    // The stream is opened by this function, so this function is responsible for closing it.
    // Wrapping it in `use` guarantees it is released even when the workbook cannot be created
    // (e.g. the URL does not point to a valid Excel document) or reading fails later on.
    url.openStream().use { stream ->
        // The workbook is closed as soon as the sheet has been converted to a data frame.
        WorkbookFactory.create(stream).use { wb ->
            readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
        }
    }
74
78
75
79
/**
 * Opens [file] as an Excel workbook, reads one of its sheets into a data frame
 * and closes the workbook before returning.
 *
 * @param file Excel file to read.
 * @param sheetName sheet to read. By default, first sheet in the document
 * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
 * @param skipRows number of rows before header
 * @param rowsCount number of rows to read.
 * @param nameRepairStrategy handling of column names.
 * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
 */
public fun DataFrame.Companion.readExcel(
    file: File,
    sheetName: String? = null,
    skipRows: Int = 0,
    columns: String? = null,
    rowsCount: Int? = null,
    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame =
    WorkbookFactory.create(file).use { workbook ->
        readExcel(workbook, sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
    }
91
98
92
99
/**
 * Resolves [fileOrUrl] to a URL via `asURL` and reads the Excel document found there.
 *
 * @param fileOrUrl location of the Excel document, given as a string.
 * @param sheetName sheet to read. By default, first sheet in the document
 * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
 * @param skipRows number of rows before header
 * @param rowsCount number of rows to read.
 * @param nameRepairStrategy handling of column names.
 * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
 */
public fun DataFrame.Companion.readExcel(
    fileOrUrl: String,
    sheetName: String? = null,
    skipRows: Int = 0,
    columns: String? = null,
    rowsCount: Int? = null,
    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
    val location = asURL(fileOrUrl)
    return readExcel(location, sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
}
105
115
106
116
/**
 * Creates a workbook from [inputStream], reads one of its sheets into a data frame
 * and closes the workbook before returning.
 *
 * This function does not close [inputStream] itself.
 *
 * @param inputStream stream providing the Excel document.
 * @param sheetName sheet to read. By default, first sheet in the document
 * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
 * @param skipRows number of rows before header
 * @param rowsCount number of rows to read.
 * @param nameRepairStrategy handling of column names.
 * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
 */
public fun DataFrame.Companion.readExcel(
    inputStream: InputStream,
    sheetName: String? = null,
    skipRows: Int = 0,
    columns: String? = null,
    rowsCount: Int? = null,
    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame =
    WorkbookFactory.create(inputStream).use { workbook ->
        readExcel(
            wb = workbook,
            sheetName = sheetName,
            skipRows = skipRows,
            columns = columns,
            rowsCount = rowsCount,
            nameRepairStrategy = nameRepairStrategy,
        )
    }
122
135
123
136
/**
 * Picks a sheet from the already opened workbook [wb] and reads it into a data frame.
 *
 * The workbook is not closed by this function.
 *
 * @param wb workbook to read from.
 * @param sheetName sheet to read. By default, first sheet in the document
 * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
 * @param skipRows number of rows before header
 * @param rowsCount number of rows to read.
 * @param nameRepairStrategy handling of column names.
 * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
 * @throws IllegalStateException if [sheetName] is given but the workbook has no sheet with that name.
 */
public fun DataFrame.Companion.readExcel(
    wb: Workbook,
    sheetName: String? = null,
    skipRows: Int = 0,
    columns: String? = null,
    rowsCount: Int? = null,
    nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
    val selectedSheet: Sheet = if (sheetName != null) {
        // An explicitly requested sheet must exist.
        wb.getSheet(sheetName) ?: error(" Sheet with name $sheetName not found")
    } else {
        // No name given: fall back to the first sheet of the document.
        wb.getSheetAt(0)
    }
    // Note the parameter order of the sheet-based overload: columns come before skipRows.
    return readExcel(selectedSheet, columns, skipRows, rowsCount, nameRepairStrategy)
}
141
157
142
158
/* *
143
159
* @param sheet sheet to read.
144
160
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
145
161
* @param skipRows number of rows before header
146
162
* @param rowsCount number of rows to read.
163
+ * @param nameRepairStrategy handling of column names.
164
+ * The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
147
165
*/
148
166
public fun DataFrame.Companion.readExcel (
149
167
sheet : Sheet ,
150
168
columns : String? = null,
151
169
skipRows : Int = 0,
152
170
rowsCount : Int? = null,
171
+ nameRepairStrategy : NameRepairStrategy = NameRepairStrategy .CHECK_UNIQUE ,
153
172
): AnyFrame {
154
173
val columnIndexes: Iterable <Int > = if (columns != null ) {
155
174
columns.split(" ," ).flatMap {
@@ -176,15 +195,19 @@ public fun DataFrame.Companion.readExcel(
176
195
val last = rowsCount?.let { first + it - 1 } ? : sheet.lastRowNum
177
196
val valueRowsRange = (first.. last)
178
197
198
+ val columnNameCounters = mutableMapOf<String , Int >()
179
199
val columns = columnIndexes.map { index ->
180
200
val headerCell = headerRow?.getCell(index)
181
- val name = if (headerCell?.cellType == CellType .NUMERIC ) {
201
+ val nameFromCell = if (headerCell?.cellType == CellType .NUMERIC ) {
182
202
headerCell.numericCellValue.toString() // Support numeric-named columns
183
203
} else {
184
204
headerCell?.stringCellValue
185
205
? : CellReference .convertNumToColString(index) // Use Excel column names if no data
186
206
}
187
207
208
+ val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
209
+ columnNameCounters[nameFromCell] = columnNameCounters.getOrDefault(nameFromCell, 0 ) + 1 // increase the counter for specific column name
210
+
188
211
val values: List <Any ?> = valueRowsRange.map {
189
212
val row: Row ? = sheet.getRow(it)
190
213
val cell: Cell ? = row?.getCell(index)
@@ -195,6 +218,31 @@ public fun DataFrame.Companion.readExcel(
195
218
return dataFrameOf(columns)
196
219
}
197
220
221
/**
 * Applies [nameRepairStrategy] to a column name taken from a header cell.
 *
 * This is a universal function for name repairing and should be moved to the API module later,
 * once the functionality is enabled for all IO sources.
 *
 * TODO: https://github.com/Kotlin/dataframe/issues/387
 *
 * @param nameFromCell raw column name as read from the header.
 * @param columnNameCounters number of earlier occurrences per name. The caller records every raw
 * [nameFromCell] after this call; this function additionally records uses of the placeholder
 * given to empty names, because the caller only ever sees the empty string.
 * @param nameRepairStrategy strategy to apply.
 * @return the name to use for the column.
 * @throws DuplicateColumnNamesException for [NameRepairStrategy.CHECK_UNIQUE] when [nameFromCell]
 * has already been seen.
 */
private fun repairNameIfRequired(
    nameFromCell: String,
    columnNameCounters: MutableMap<String, Int>,
    nameRepairStrategy: NameRepairStrategy,
): String =
    when (nameRepairStrategy) {
        NameRepairStrategy.DO_NOTHING -> nameFromCell

        NameRepairStrategy.CHECK_UNIQUE ->
            if (nameFromCell in columnNameCounters) {
                // The map keys are unique by construction, so the repeated name has to be appended
                // explicitly; otherwise the reported list would not contain any duplicate at all.
                throw DuplicateColumnNamesException(columnNameCounters.keys.toList() + nameFromCell)
            } else {
                nameFromCell
            }

        NameRepairStrategy.MAKE_UNIQUE ->
            if (nameFromCell.isEmpty()) {
                val emptyName = "Unknown column"
                // Empty headers are counted by the caller under "", never under the placeholder,
                // so the placeholder's usage is tracked here. This keeps repeated empty headers
                // (and a header literally named like the placeholder) from colliding.
                val earlierUses = columnNameCounters.getOrDefault(emptyName, 0)
                columnNameCounters[emptyName] = earlierUses + 1
                if (earlierUses > 0) "${emptyName}${earlierUses}" else emptyName
            } else {
                // Suffix the name with the number of times it has been seen before, if any.
                columnNameCounters[nameFromCell]
                    ?.let { earlierUses -> "${nameFromCell}${earlierUses}" }
                    ?: nameFromCell
            }
    }
245
+
198
246
private fun Cell?.cellValue (sheetName : String ): Any? =
199
247
when (this ?.cellType) {
200
248
CellType ._NONE -> error(" Cell $address of sheet $sheetName has a CellType that should only be used internally. This is a bug, please report https://github.com/Kotlin/dataframe/issues" )
0 commit comments