Skip to content

Commit 7a0a377

Browse files
add nulls parse option in readExcel
1 parent 43f3bbe commit 7a0a377

File tree

3 files changed

+50
-6
lines changed

3 files changed

+50
-6
lines changed

dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ private fun setWorkbookTempDirectory() {
9898
* when set to false, it operates as [NameRepairStrategy.MAKE_UNIQUE],
9999
* ensuring unique column names will make the columns be named according to excel columns, like "A", "B", "C" etc.
100100
* for unstructured data.
101+
* @param parseEmptyAsNull when set to true (default), empty strings in cells are parsed as null
102+
* and are ignored when inferring the column’s type; when set to false, empty strings are kept as is.
101103
*/
102104
public fun DataFrame.Companion.readExcel(
103105
url: URL,
@@ -108,6 +110,7 @@ public fun DataFrame.Companion.readExcel(
108110
rowsCount: Int? = null,
109111
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
110112
firstRowIsHeader: Boolean = true,
113+
parseEmptyAsNull: Boolean = true,
111114
): AnyFrame {
112115
setWorkbookTempDirectory()
113116
val wb = WorkbookFactory.create(url.openStream())
@@ -121,6 +124,7 @@ public fun DataFrame.Companion.readExcel(
121124
rowsCount,
122125
nameRepairStrategy,
123126
firstRowIsHeader,
127+
parseEmptyAsNull
124128
)
125129
}
126130
}
@@ -138,6 +142,8 @@ public fun DataFrame.Companion.readExcel(
138142
* when set to false, it operates as [NameRepairStrategy.MAKE_UNIQUE],
139143
* ensuring unique column names will make the columns be named according to excel columns, like "A", "B", "C" etc.
140144
* for unstructured data.
145+
* @param parseEmptyAsNull when set to true (default), empty strings in cells are parsed as null
146+
* and are ignored when inferring the column’s type; when set to false, empty strings are kept as is.
141147
*/
142148
public fun DataFrame.Companion.readExcel(
143149
file: File,
@@ -148,6 +154,7 @@ public fun DataFrame.Companion.readExcel(
148154
rowsCount: Int? = null,
149155
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
150156
firstRowIsHeader: Boolean = true,
157+
parseEmptyAsNull: Boolean = true,
151158
): AnyFrame {
152159
setWorkbookTempDirectory()
153160
@Suppress("ktlint:standard:comment-wrapping")
@@ -162,6 +169,7 @@ public fun DataFrame.Companion.readExcel(
162169
rowsCount,
163170
nameRepairStrategy,
164171
firstRowIsHeader,
172+
parseEmptyAsNull
165173
)
166174
}
167175
}
@@ -179,6 +187,8 @@ public fun DataFrame.Companion.readExcel(
179187
* when set to false, it operates as [NameRepairStrategy.MAKE_UNIQUE],
180188
* ensuring unique column names will make the columns be named according to excel columns, like "A", "B", "C" etc.
181189
* for unstructured data.
190+
* @param parseEmptyAsNull when set to true (default), empty strings in cells are parsed as null
191+
* and are ignored when inferring the column’s type; when set to false, empty strings are kept as is.
182192
*/
183193
public fun DataFrame.Companion.readExcel(
184194
fileOrUrl: String,
@@ -189,6 +199,7 @@ public fun DataFrame.Companion.readExcel(
189199
rowsCount: Int? = null,
190200
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
191201
firstRowIsHeader: Boolean = true,
202+
parseEmptyAsNull: Boolean = true,
192203
): AnyFrame =
193204
readExcel(
194205
asUrl(fileOrUrl),
@@ -199,6 +210,7 @@ public fun DataFrame.Companion.readExcel(
199210
rowsCount,
200211
nameRepairStrategy,
201212
firstRowIsHeader,
213+
parseEmptyAsNull
202214
)
203215

204216
/**
@@ -214,6 +226,8 @@ public fun DataFrame.Companion.readExcel(
214226
* when set to false, it operates as [NameRepairStrategy.MAKE_UNIQUE],
215227
* ensuring unique column names will make the columns be named according to excel columns, like "A", "B", "C" etc.
216228
* for unstructured data.
229+
* @param parseEmptyAsNull when set to true (default), empty strings in cells are parsed as null
230+
* and are ignored when inferring the column’s type; when set to false, empty strings are kept as is.
217231
*/
218232
public fun DataFrame.Companion.readExcel(
219233
inputStream: InputStream,
@@ -224,6 +238,7 @@ public fun DataFrame.Companion.readExcel(
224238
rowsCount: Int? = null,
225239
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
226240
firstRowIsHeader: Boolean = true,
241+
parseEmptyAsNull: Boolean = true,
227242
): AnyFrame {
228243
setWorkbookTempDirectory()
229244
val wb = WorkbookFactory.create(inputStream)
@@ -237,6 +252,7 @@ public fun DataFrame.Companion.readExcel(
237252
rowsCount,
238253
nameRepairStrategy,
239254
firstRowIsHeader,
255+
parseEmptyAsNull
240256
)
241257
}
242258
}
@@ -255,6 +271,8 @@ public fun DataFrame.Companion.readExcel(
255271
* when set to false, it operates as [NameRepairStrategy.MAKE_UNIQUE],
256272
* ensuring unique column names will make the columns be named according to excel columns, like "A", "B", "C" etc.
257273
* for unstructured data.
274+
* @param parseEmptyAsNull when set to true (default), empty strings in cells are parsed as null
275+
* and are ignored when inferring the column’s type; when set to false, empty strings are kept as is.
258276
*/
259277
public fun DataFrame.Companion.readExcel(
260278
wb: Workbook,
@@ -265,11 +283,12 @@ public fun DataFrame.Companion.readExcel(
265283
rowsCount: Int? = null,
266284
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
267285
firstRowIsHeader: Boolean = true,
286+
parseEmptyAsNull: Boolean = true,
268287
): AnyFrame {
269288
val sheet: Sheet = sheetName
270289
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
271290
?: wb.getSheetAt(0)
272-
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy, firstRowIsHeader)
291+
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy, firstRowIsHeader, parseEmptyAsNull)
273292
}
274293

275294
/**
@@ -312,6 +331,7 @@ public fun DataFrame.Companion.readExcel(
312331
rowsCount: Int? = null,
313332
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
314333
firstRowIsHeader: Boolean = true,
334+
parseEmptyAsNull: Boolean = true,
315335
): AnyFrame {
316336
val columnIndexes: Iterable<Int> = when {
317337
columns != null -> getColumnIndices(columns)
@@ -364,12 +384,18 @@ public fun DataFrame.Companion.readExcel(
364384
)
365385
columnNameCounters[nameFromCell] =
366386
columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
367-
val getCellValue: (Cell?) -> Any? = when {
368-
formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? ->
369-
formattingOptions.formatter.formatCellValue(cell)
387+
val getCellValue: (Cell?) -> Any? = { cell ->
388+
if (cell == null) {
389+
null
390+
} else {
391+
val rawValue: Any? = if (formattingOptions != null && index in formattingOptions.columnIndices) {
392+
formattingOptions.formatter.formatCellValue(cell)
393+
} else {
394+
cell.cellValue(sheet.sheetName)
395+
}
396+
if (parseEmptyAsNull && rawValue is String && rawValue.isEmpty()) null
397+
else rawValue
370398
}
371-
372-
else -> { cell -> cell.cellValue(sheet.sheetName) }
373399
}
374400
val values: List<Any?> = valueRowsRange.map {
375401
val row: Row? = sheet.getRow(it)

dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@ import kotlinx.datetime.LocalDateTime
66
import org.apache.poi.ss.usermodel.WorkbookFactory
77
import org.jetbrains.kotlinx.dataframe.DataFrame
88
import org.jetbrains.kotlinx.dataframe.api.concat
9+
import org.jetbrains.kotlinx.dataframe.api.convert
910
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
1011
import org.jetbrains.kotlinx.dataframe.api.toColumn
12+
import org.jetbrains.kotlinx.dataframe.api.toInt
1113
import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
1214
import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
1315
import org.jetbrains.kotlinx.dataframe.size
16+
import org.jetbrains.kotlinx.dataframe.type
1417
import org.junit.Test
1518
import java.net.URL
1619
import java.nio.file.Files
@@ -53,6 +56,7 @@ class XlsxTest {
5356
"Sheet1",
5457
columns = "A:C",
5558
stringColumns = StringColumns("A:C"),
59+
parseEmptyAsNull = false,
5660
)
5761
df shouldBe dataFrameOf("col1", "col2", "C")("1", "", "3")
5862
}
@@ -216,6 +220,7 @@ class XlsxTest {
216220
firstRowIsHeader = false,
217221
skipRows = 2,
218222
rowsCount = 1,
223+
parseEmptyAsNull = false,
219224
)
220225

221226
df shouldBe dataFrameOf(
@@ -224,4 +229,17 @@ class XlsxTest {
224229
"Field 3: ", "", "TEAM 1", "", "", "", "", "Staff Code:", "Staff 1", "",
225230
)
226231
}
232+
233+
@Test
234+
fun `read columns with nulls`() {
235+
val df = DataFrame.readExcel(
236+
testResource("withNulls.xlsx"),
237+
).convert("age").toInt()
238+
df shouldBe dataFrameOf(
239+
"name" to listOf("Alice", null, "Bob"),
240+
"age" to listOf(23, 27, null),
241+
)
242+
df["name"].type shouldBe typeOf<String?>()
243+
df["age"].type shouldBe typeOf<Int?>()
244+
}
227245
}
Binary file not shown.

0 commit comments

Comments
 (0)