Skip to content

Commit 2b3361f

Browse files
author
hare
committed
Support reading unstructured Excel files
1 parent cab218c commit 2b3361f

File tree

3 files changed

+111
-22
lines changed

3 files changed

+111
-22
lines changed

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 70 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,11 @@ private fun setWorkbookTempDirectory() {
9393
* @param skipRows number of rows before header
9494
* @param rowsCount number of rows to read.
9595
* @param nameRepairStrategy handling of column names.
96-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
96+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
97+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
98+
* ensuring unique column names are generated for unstructured data.
99+
* @param withDefaultHeader if `true`, do not read column names from a header row;
100+
* default names (Excel column letters: A, B, C, ...) are generated instead.
97101
*/
98102
public fun DataFrame.Companion.readExcel(
99103
url: URL,
@@ -103,11 +107,12 @@ public fun DataFrame.Companion.readExcel(
103107
stringColumns: StringColumns? = null,
104108
rowsCount: Int? = null,
105109
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
110+
withDefaultHeader: Boolean = false,
106111
): AnyFrame {
107112
setWorkbookTempDirectory()
108113
val wb = WorkbookFactory.create(url.openStream())
109114
return wb.use {
110-
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
115+
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
111116
}
112117
}
113118

@@ -119,7 +124,11 @@ public fun DataFrame.Companion.readExcel(
119124
* @param skipRows number of rows before header
120125
* @param rowsCount number of rows to read.
121126
* @param nameRepairStrategy handling of column names.
122-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
127+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
128+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
129+
* ensuring unique column names are generated for unstructured data.
130+
* @param withDefaultHeader if `true`, do not read column names from a header row;
131+
* default names (Excel column letters: A, B, C, ...) are generated instead.
123132
*/
124133
public fun DataFrame.Companion.readExcel(
125134
file: File,
@@ -129,11 +138,12 @@ public fun DataFrame.Companion.readExcel(
129138
stringColumns: StringColumns? = null,
130139
rowsCount: Int? = null,
131140
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
141+
withDefaultHeader: Boolean = false,
132142
): AnyFrame {
133143
setWorkbookTempDirectory()
134144
val wb = WorkbookFactory.create(file)
135145
return wb.use {
136-
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
146+
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
137147
}
138148
}
139149

@@ -145,7 +155,11 @@ public fun DataFrame.Companion.readExcel(
145155
* @param skipRows number of rows before header
146156
* @param rowsCount number of rows to read.
147157
* @param nameRepairStrategy handling of column names.
148-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
158+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
159+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
160+
* ensuring unique column names are generated for unstructured data.
161+
* @param withDefaultHeader if `true`, do not read column names from a header row;
162+
* default names (Excel column letters: A, B, C, ...) are generated instead.
149163
*/
150164
@Refine
151165
@Interpretable("ReadExcel")
@@ -157,7 +171,8 @@ public fun DataFrame.Companion.readExcel(
157171
stringColumns: StringColumns? = null,
158172
rowsCount: Int? = null,
159173
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
160-
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy)
174+
withDefaultHeader: Boolean = false,
175+
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy, withDefaultHeader)
161176

162177
/**
163178
* @param sheetName sheet to read. By default, the first sheet in the document
@@ -167,7 +182,11 @@ public fun DataFrame.Companion.readExcel(
167182
* @param skipRows number of rows before header
168183
* @param rowsCount number of rows to read.
169184
* @param nameRepairStrategy handling of column names.
170-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
185+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
186+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
187+
* ensuring unique column names are generated for unstructured data.
188+
* @param withDefaultHeader if `true`, do not read column names from a header row;
189+
* default names (Excel column letters: A, B, C, ...) are generated instead.
171190
*/
172191
public fun DataFrame.Companion.readExcel(
173192
inputStream: InputStream,
@@ -177,11 +196,12 @@ public fun DataFrame.Companion.readExcel(
177196
stringColumns: StringColumns? = null,
178197
rowsCount: Int? = null,
179198
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
199+
withDefaultHeader: Boolean = false,
180200
): AnyFrame {
181201
setWorkbookTempDirectory()
182202
val wb = WorkbookFactory.create(inputStream)
183203
return wb.use {
184-
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
204+
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
185205
}
186206
}
187207

@@ -194,7 +214,11 @@ public fun DataFrame.Companion.readExcel(
194214
* @param skipRows number of rows before header
195215
* @param rowsCount number of rows to read.
196216
* @param nameRepairStrategy handling of column names.
197-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
217+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
218+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
219+
* ensuring unique column names are generated for unstructured data.
220+
* @param withDefaultHeader if `true`, do not read column names from a header row;
221+
* default names (Excel column letters: A, B, C, ...) are generated instead.
198222
*/
199223
public fun DataFrame.Companion.readExcel(
200224
wb: Workbook,
@@ -204,11 +228,12 @@ public fun DataFrame.Companion.readExcel(
204228
formattingOptions: FormattingOptions? = null,
205229
rowsCount: Int? = null,
206230
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
231+
withDefaultHeader: Boolean = false,
207232
): AnyFrame {
208233
val sheet: Sheet = sheetName
209234
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
210235
?: wb.getSheetAt(0)
211-
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy)
236+
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy, withDefaultHeader)
212237
}
213238

214239
/**
@@ -239,7 +264,11 @@ public class FormattingOptions(range: String, public val formatter: DataFormatte
239264
* @param skipRows number of rows before header
240265
* @param rowsCount number of rows to read.
241266
* @param nameRepairStrategy handling of column names.
242-
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
267+
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE].
268+
* However, when [withDefaultHeader] is `true`, [NameRepairStrategy.MAKE_UNIQUE] is used instead,
269+
* ensuring unique column names are generated for unstructured data.
270+
* @param withDefaultHeader if `true`, do not read column names from a header row;
271+
* default names (Excel column letters: A, B, C, ...) are generated instead.
243272
*/
244273
public fun DataFrame.Companion.readExcel(
245274
sheet: Sheet,
@@ -248,21 +277,36 @@ public fun DataFrame.Companion.readExcel(
248277
skipRows: Int = 0,
249278
rowsCount: Int? = null,
250279
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
280+
withDefaultHeader: Boolean = false,
251281
): AnyFrame {
252-
val columnIndexes: Iterable<Int> = if (columns != null) {
253-
getColumnIndices(columns)
254-
} else {
255-
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
256-
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
282+
val columnIndexes: Iterable<Int> = when{
283+
withDefaultHeader -> {
284+
val notEmptyRow = sheet.rowIterator().asSequence().find { it != null }
285+
checkNotNull(notEmptyRow){
286+
"There are no defined cells"
287+
}
288+
notEmptyRow.firstCellNum until notEmptyRow.lastCellNum
257289
}
258-
val firstCellNum = headerRow.firstCellNum
259-
check(firstCellNum != (-1).toShort()) {
260-
"There are no defined cells on header row number ${skipRows + 1} (1-based index). Pass `columns` argument to specify what columns to read or make sure the index is correct"
290+
columns != null -> getColumnIndices(columns)
291+
else -> {
292+
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
293+
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
294+
}
295+
val firstCellNum = headerRow.firstCellNum
296+
check(firstCellNum != (-1).toShort()) {
297+
"There are no defined cells on header row number ${skipRows + 1} (1-based index). Pass `columns` argument to specify what columns to read or make sure the index is correct"
298+
}
299+
headerRow.firstCellNum until headerRow.lastCellNum
261300
}
262-
headerRow.firstCellNum until headerRow.lastCellNum
263301
}
264302

265-
val headerRow: Row? = sheet.getRow(skipRows)
303+
val headerRow: Row? = if(withDefaultHeader){
304+
sheet.shiftRows(0, sheet.lastRowNum, 1)
305+
sheet.createRow(0)
306+
}else{
307+
sheet.getRow(skipRows)
308+
}
309+
266310
val first = skipRows + 1
267311
val last = rowsCount?.let { first + it - 1 } ?: sheet.lastRowNum
268312
val valueRowsRange = (first..last)
@@ -277,7 +321,11 @@ public fun DataFrame.Companion.readExcel(
277321
?: CellReference.convertNumToColString(index) // Use Excel column names if no data
278322
}
279323

280-
val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
324+
val name = repairNameIfRequired(
325+
nameFromCell,
326+
columnNameCounters,
327+
if (withDefaultHeader) NameRepairStrategy.MAKE_UNIQUE else nameRepairStrategy
328+
)
281329
columnNameCounters[nameFromCell] =
282330
columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
283331
val getCellValue: (Cell?) -> Any? = when {

dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,4 +199,45 @@ class XlsxTest {
199199
df["col1"].type() shouldBe typeOf<String>()
200200
df shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100")
201201
}
202+
203+
@Test
204+
fun `read with default header unstructured excel file`() {
205+
val df = DataFrame.readExcel(
206+
testResource("unstructured_example.xlsx"),
207+
withDefaultHeader = true,
208+
)
209+
df.columnNames() shouldBe
210+
listOf(
211+
"A",
212+
"B",
213+
"C",
214+
"D",
215+
"E",
216+
"F",
217+
"G",
218+
"H",
219+
"I",
220+
)
221+
}
222+
223+
@Test
224+
fun `should work read with default header unstructured excel file with skipRow params`() {
225+
val df = DataFrame.readExcel(
226+
testResource("unstructured_example.xlsx"),
227+
withDefaultHeader = true,
228+
skipRows = 1,
229+
)
230+
df.columnNames() shouldBe
231+
listOf(
232+
"A",
233+
"B",
234+
"C",
235+
"D",
236+
"E",
237+
"F",
238+
"G",
239+
"H",
240+
"I",
241+
)
242+
}
202243
}
Binary file not shown.

0 commit comments

Comments
 (0)