Skip to content

Commit 89d6243

Browse files
authored
Merge pull request #153 from Kotlin/fix-read-excel
use index math for skip rows #132
2 parents bdcee02 + c637909 commit 89d6243

File tree

4 files changed

+60
-24
lines changed

4 files changed

+60
-24
lines changed

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 40 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import org.apache.poi.ss.usermodel.Cell
88
import org.apache.poi.ss.usermodel.CellType
99
import org.apache.poi.ss.usermodel.DateUtil
1010
import org.apache.poi.ss.usermodel.RichTextString
11+
import org.apache.poi.ss.usermodel.Row
1112
import org.apache.poi.ss.usermodel.Sheet
1213
import org.apache.poi.ss.usermodel.Workbook
1314
import org.apache.poi.ss.usermodel.WorkbookFactory
@@ -148,7 +149,7 @@ public fun DataFrame.Companion.readExcel(
148149
skipRows: Int = 0,
149150
rowsCount: Int? = null
150151
): AnyFrame {
151-
val columnIndexes = if (columns != null) {
152+
val columnIndexes: Iterable<Int> = if (columns != null) {
152153
columns.split(",").flatMap {
153154
if (it.contains(":")) {
154155
val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) }
@@ -158,42 +159,58 @@ public fun DataFrame.Companion.readExcel(
158159
}
159160
}
160161
} else {
161-
sheet.getRow(skipRows).map { it.columnIndex }
162+
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
163+
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
164+
}
165+
val firstCellNum = headerRow.firstCellNum
166+
check(firstCellNum != (-1).toShort()) {
167+
"There are no defined cells on header row number ${skipRows + 1} (1-based index). Pass `columns` argument to specify what columns to read or make sure the index is correct"
168+
}
169+
headerRow.firstCellNum until headerRow.lastCellNum
162170
}
163171

164-
val headerRow = sheet.getRow(skipRows)
165-
val valueRows = sheet.drop(1 + skipRows).let { if (rowsCount != null) it.take(rowsCount) else it }
172+
val headerRow: Row? = sheet.getRow(skipRows)
173+
val first = skipRows + 1
174+
val last = rowsCount?.let { first + it - 1 } ?: sheet.lastRowNum
175+
val valueRowsRange = (first..last)
176+
166177
val columns = columnIndexes.map { index ->
167-
val headerCell = headerRow.getCell(index)
178+
val headerCell = headerRow?.getCell(index)
168179
val name = if (headerCell?.cellType == CellType.NUMERIC) {
169180
headerCell.numericCellValue.toString() // Support numeric-named columns
170181
} else {
171182
headerCell?.stringCellValue ?: CellReference.convertNumToColString(index) // Use Excel column names if no data
172183
}
173-
val values: List<Any?> = valueRows.map {
174-
val cell: Cell? = it.getCell(index)
175-
when (cell?.cellType) {
176-
CellType._NONE -> error("Cell ${cell.address} of sheet ${sheet.sheetName} has a CellType that should only be used internally. This is a bug, please report https://github.com/Kotlin/dataframe/issues")
177-
CellType.NUMERIC -> {
178-
val number = cell.numericCellValue
179-
when {
180-
DateUtil.isCellDateFormatted(cell) -> DateUtil.getLocalDateTime(number).toKotlinLocalDateTime()
181-
else -> number
182-
}
183-
}
184-
CellType.STRING -> cell.stringCellValue
185-
CellType.FORMULA -> cell.numericCellValue
186-
CellType.BLANK -> cell.stringCellValue
187-
CellType.BOOLEAN -> cell.booleanCellValue
188-
CellType.ERROR -> cell.errorCellValue
189-
null -> null
190-
}
184+
185+
val values: List<Any?> = valueRowsRange.map {
186+
val row: Row? = sheet.getRow(it)
187+
val cell: Cell? = row?.getCell(index)
188+
cell.cellValue(sheet.sheetName)
191189
}
192190
DataColumn.createWithTypeInference(name, values)
193191
}
194192
return dataFrameOf(columns)
195193
}
196194

195+
/**
 * Converts this POI cell into a plain value suitable for column type inference.
 *
 * - A `null` receiver (no cell defined at that position) yields `null`.
 * - Numeric cells yield a `Double`, or a Kotlin `LocalDateTime` when the cell is date-formatted.
 * - String and blank cells yield their string value; boolean and error cells yield their
 *   boolean / error-code value.
 * - Formula cells yield the cached result of the formula, converted by the same rules as a
 *   regular cell of that result type. Reading the result unconditionally as a number would
 *   make POI throw for formulas that evaluate to text, booleans or errors.
 *
 * @param sheetName name of the sheet the cell belongs to; only used in the error message.
 * @throws IllegalStateException if the cell reports the internal-only [CellType._NONE].
 */
private fun Cell?.cellValue(sheetName: String): Any? {
    if (this == null) return null

    // Local helper so that a formula cell can be re-dispatched on its cached result type.
    fun valueFor(type: CellType?): Any? =
        when (type) {
            CellType._NONE -> error("Cell $address of sheet $sheetName has a CellType that should only be used internally. This is a bug, please report https://github.com/Kotlin/dataframe/issues")
            CellType.NUMERIC -> {
                val number = numericCellValue
                when {
                    DateUtil.isCellDateFormatted(this) -> DateUtil.getLocalDateTime(number).toKotlinLocalDateTime()
                    else -> number
                }
            }

            CellType.STRING -> stringCellValue
            // The cached result type is the type of the formula's last evaluated value.
            CellType.FORMULA -> valueFor(cachedFormulaResultType)
            CellType.BLANK -> stringCellValue
            CellType.BOOLEAN -> booleanCellValue
            CellType.ERROR -> errorCellValue
            null -> null
        }

    return valueFor(cellType)
}
213+
197214
public fun <T> DataFrame<T>.writeExcel(
198215
path: String,
199216
columnsSelector: ColumnsSelector<T, *> = { all() },

dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import io.kotest.assertions.throwables.shouldThrow
34
import io.kotest.matchers.shouldBe
45
import kotlinx.datetime.LocalDateTime
56
import org.apache.poi.ss.usermodel.WorkbookFactory
67
import org.jetbrains.kotlinx.dataframe.DataFrame
78
import org.jetbrains.kotlinx.dataframe.api.concat
89
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
910
import org.jetbrains.kotlinx.dataframe.api.toColumn
11+
import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
12+
import org.jetbrains.kotlinx.dataframe.size
1013
import org.junit.Test
1114
import java.net.URL
1215
import java.nio.file.Files
@@ -83,11 +86,27 @@ class XlsxTest {
8386
fun `read header on second row`() {
8487
val df = DataFrame.readExcel(testResource("custom_header_position.xlsx"), skipRows = 1)
8588
df.columnNames() shouldBe listOf("header1", "header2")
89+
df.size() shouldBe DataFrameSize(2, 3)
8690
}
8791

8892
@Test
8993
fun `consider skipRows when obtaining column indexes`() {
90-
val df = DataFrame.readExcel(testResource("header.xlsx"), skipRows = 6)
94+
val df = DataFrame.readExcel(testResource("header.xlsx"), skipRows = 6, rowsCount = 1)
9195
df.columnNames() shouldBe listOf("Well", "Well Position", "Omit", "Sample Name", "Target Name", "Task", "Reporter", "Quencher")
96+
df shouldBe dataFrameOf("Well", "Well Position", "Omit", "Sample Name", "Target Name", "Task", "Reporter", "Quencher")(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,)
97+
}
98+
99+
@Test
100+
fun `use indexes math to skip rows`() {
101+
val df = DataFrame.readExcel(testResource("repro.xls"), skipRows = 4)
102+
df.columnNames() shouldBe listOf("a")
103+
df.rowsCount() shouldBe 2
104+
}
105+
106+
@Test
107+
fun `throw when there are no defined cells on header row`() {
108+
shouldThrow<IllegalStateException> {
109+
DataFrame.readExcel(testResource("xlsx6.xlsx"), skipRows = 4)
110+
}
92111
}
93112
}
5.5 KB
Binary file not shown.
4.69 KB
Binary file not shown.

0 commit comments

Comments
 (0)