Skip to content

Commit bd46a6f

Browse files
authored
feat: support a new whitespace_as_null parameter (#426)
* feat: support a new whitespace_as_null parameter
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
* refactor: adapt to review comments
  Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
---------
Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
1 parent 9f2be08 commit bd46a6f

File tree

15 files changed

+460
-116
lines changed

15 files changed

+460
-116
lines changed

python/fastexcel/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -304,6 +304,7 @@ def load_sheet(
304304
dtypes: DType | DTypeMap | None = None,
305305
eager: Literal[False] = ...,
306306
skip_whitespace_tail_rows: bool = False,
307+
whitespace_as_null: bool = False,
307308
) -> ExcelSheet: ...
308309

309310
@typing.overload
@@ -325,6 +326,7 @@ def load_sheet(
325326
dtypes: DType | DTypeMap | None = None,
326327
eager: Literal[True] = ...,
327328
skip_whitespace_tail_rows: bool = False,
329+
whitespace_as_null: bool = False,
328330
) -> pa.RecordBatch: ...
329331

330332
def load_sheet(
@@ -345,6 +347,7 @@ def load_sheet(
345347
dtypes: DType | DTypeMap | None = None,
346348
eager: bool = False,
347349
skip_whitespace_tail_rows: bool = False,
350+
whitespace_as_null: bool = False,
348351
) -> ExcelSheet | pa.RecordBatch:
349352
"""Loads a sheet by index or name.
350353
@@ -396,6 +399,7 @@ def load_sheet(
396399
Eager loading requires the `pyarrow` extra to be installed.
397400
:param skip_whitespace_tail_rows: Skip rows at the end of the sheet
398401
containing only whitespace and null values.
402+
:param whitespace_as_null: Consider cells containing only whitespace as null values.
399403
"""
400404
sheet_or_rb = self._reader.load_sheet(
401405
idx_or_name=idx_or_name,
@@ -409,6 +413,7 @@ def load_sheet(
409413
dtypes=dtypes,
410414
eager=eager,
411415
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
416+
whitespace_as_null=whitespace_as_null,
412417
)
413418
return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
414419

@@ -451,6 +456,7 @@ def load_table(
451456
dtypes: DType | DTypeMap | None = None,
452457
eager: Literal[False] = ...,
453458
skip_whitespace_tail_rows: bool = False,
459+
whitespace_as_null: bool = False,
454460
) -> ExcelTable: ...
455461

456462
@typing.overload
@@ -472,6 +478,7 @@ def load_table(
472478
dtypes: DType | DTypeMap | None = None,
473479
eager: Literal[True] = ...,
474480
skip_whitespace_tail_rows: bool = False,
481+
whitespace_as_null: bool = False,
475482
) -> pa.RecordBatch: ...
476483

477484
def load_table(
@@ -492,6 +499,7 @@ def load_table(
492499
dtypes: DType | DTypeMap | None = None,
493500
eager: bool = False,
494501
skip_whitespace_tail_rows: bool = False,
502+
whitespace_as_null: bool = False,
495503
) -> ExcelTable | pa.RecordBatch:
496504
"""Loads a table by name.
497505
@@ -538,6 +546,7 @@ def load_table(
538546
Eager loading requires the `pyarrow` extra to be installed.
539547
:param skip_whitespace_tail_rows: Skip rows at the end of the table
540548
containing only whitespace and null values.
549+
:param whitespace_as_null: Consider cells containing only whitespace as null values.
541550
"""
542551
if eager:
543552
return self._reader.load_table(
@@ -552,6 +561,7 @@ def load_table(
552561
dtypes=dtypes,
553562
eager=True,
554563
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
564+
whitespace_as_null=whitespace_as_null,
555565
)
556566
else:
557567
return ExcelTable(
@@ -567,6 +577,7 @@ def load_table(
567577
dtypes=dtypes,
568578
eager=False,
569579
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
580+
whitespace_as_null=whitespace_as_null,
570581
)
571582
)
572583

python/fastexcel/_fastexcel.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ class _ExcelReader:
202202
dtypes: DType | DTypeMap | None = None,
203203
eager: Literal[False] = ...,
204204
skip_whitespace_tail_rows: bool = False,
205+
whitespace_as_null: bool = False,
205206
) -> _ExcelSheet: ...
206207
@typing.overload
207208
def load_sheet(
@@ -222,6 +223,7 @@ class _ExcelReader:
222223
dtypes: DType | DTypeMap | None = None,
223224
eager: Literal[True] = ...,
224225
skip_whitespace_tail_rows: bool = False,
226+
whitespace_as_null: bool = False,
225227
) -> pa.RecordBatch: ...
226228
@typing.overload
227229
def load_sheet(
@@ -242,6 +244,7 @@ class _ExcelReader:
242244
dtypes: DType | DTypeMap | None = None,
243245
eager: bool = False,
244246
skip_whitespace_tail_rows: bool = False,
247+
whitespace_as_null: bool = False,
245248
) -> pa.RecordBatch: ...
246249
@typing.overload
247250
def load_table(
@@ -262,6 +265,7 @@ class _ExcelReader:
262265
dtypes: DType | DTypeMap | None = None,
263266
eager: Literal[False] = ...,
264267
skip_whitespace_tail_rows: bool = False,
268+
whitespace_as_null: bool = False,
265269
) -> _ExcelTable: ...
266270
@typing.overload
267271
def load_table(
@@ -282,6 +286,7 @@ class _ExcelReader:
282286
dtypes: DType | DTypeMap | None = None,
283287
eager: Literal[True] = ...,
284288
skip_whitespace_tail_rows: bool = False,
289+
whitespace_as_null: bool = False,
285290
) -> pa.RecordBatch: ...
286291
@property
287292
def sheet_names(self) -> list[str]: ...

python/tests/test_whitespace.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,89 @@ def test_skip_tail_whitespace_rows() -> None:
7575
pd_assert_frame_equal(
7676
table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
7777
)
78+
79+
80+
def test_skip_tail_rows_and_whitespace_as_null_behavior() -> None:
81+
excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))
82+
83+
# Expected data when converting whitespace to null but not skipping tail rows
84+
expected_with_whitespace_as_null = pl.DataFrame(
85+
{
86+
# All rows should be taken into account but the space in the last row should be
87+
# considered null
88+
"Column One": [1.0, 2.0, 3.0, None, 5.0, None, None, None, None, None],
89+
# All rows should be taken into account but the empty string in 8th row should be
90+
# considered null
91+
"Column Two": ["one", "two", None, "four", "five", None, None, None, None, None],
92+
"Column Three": [
93+
datetime.datetime(2025, 11, 19, 14, 34, 2),
94+
datetime.datetime(2025, 11, 20, 14, 56, 34),
95+
datetime.datetime(2025, 11, 21, 15, 19, 6),
96+
None,
97+
datetime.datetime(2025, 11, 22, 15, 41, 38),
98+
datetime.datetime(2025, 11, 23, 16, 4, 10),
99+
None,
100+
None,
101+
None,
102+
None,
103+
],
104+
}
105+
).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))
106+
107+
# Expected data when converting whitespace to null and skipping tail rows
108+
expected_without_whitespace = pl.DataFrame(
109+
{
110+
"Column One": [1.0, 2.0, 3.0, None, 5.0, None],
111+
"Column Two": ["one", "two", None, "four", "five", None],
112+
"Column Three": [
113+
datetime.datetime(2025, 11, 19, 14, 34, 2),
114+
datetime.datetime(2025, 11, 20, 14, 56, 34),
115+
datetime.datetime(2025, 11, 21, 15, 19, 6),
116+
None,
117+
datetime.datetime(2025, 11, 22, 15, 41, 38),
118+
datetime.datetime(2025, 11, 23, 16, 4, 10),
119+
],
120+
}
121+
).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))
122+
123+
# Test sheet with whitespace_as_null but not skipping tail rows
124+
sheet_with_whitespace_as_null = excel_reader.load_sheet(
125+
"Without Table", whitespace_as_null=True
126+
)
127+
pl_assert_frame_equal(
128+
sheet_with_whitespace_as_null.to_polars(), expected_with_whitespace_as_null
129+
)
130+
131+
# Test table with whitespace_as_null but not skipping tail rows
132+
table_with_whitespace_as_null = excel_reader.load_table(
133+
"Table_with_whitespace", whitespace_as_null=True
134+
)
135+
pl_assert_frame_equal(
136+
table_with_whitespace_as_null.to_polars(), expected_with_whitespace_as_null
137+
)
138+
139+
# Test sheet with both whitespace_as_null and skip_whitespace_tail_rows
140+
sheet_without_whitespace = excel_reader.load_sheet(
141+
"Without Table", whitespace_as_null=True, skip_whitespace_tail_rows=True
142+
)
143+
pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace)
144+
145+
# Test table with both whitespace_as_null and skip_whitespace_tail_rows
146+
table_without_whitespace = excel_reader.load_table(
147+
"Table_with_whitespace", whitespace_as_null=True, skip_whitespace_tail_rows=True
148+
)
149+
pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace)
150+
151+
# Also verify pandas compatibility
152+
pd_assert_frame_equal(
153+
sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
154+
)
155+
pd_assert_frame_equal(
156+
sheet_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas()
157+
)
158+
pd_assert_frame_equal(
159+
table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
160+
)
161+
pd_assert_frame_equal(
162+
table_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas()
163+
)

src/data/mod.rs

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,25 @@ impl ExcelSheetData<'_> {
5454
end_row: usize,
5555
col: usize,
5656
dtype_coercion: &DTypeCoercion,
57+
whitespace_as_null: bool,
5758
) -> FastExcelResult<DType> {
5859
match self {
59-
ExcelSheetData::Owned(data) => {
60-
get_dtype_for_column(data, start_row, end_row, col, dtype_coercion)
61-
}
62-
ExcelSheetData::Ref(data) => {
63-
get_dtype_for_column(data, start_row, end_row, col, dtype_coercion)
64-
}
60+
ExcelSheetData::Owned(data) => get_dtype_for_column(
61+
data,
62+
start_row,
63+
end_row,
64+
col,
65+
dtype_coercion,
66+
whitespace_as_null,
67+
),
68+
ExcelSheetData::Ref(data) => get_dtype_for_column(
69+
data,
70+
start_row,
71+
end_row,
72+
col,
73+
dtype_coercion,
74+
whitespace_as_null,
75+
),
6576
}
6677
}
6778

@@ -297,6 +308,7 @@ impl FastExcelColumn {
297308
data: &Range<CT>,
298309
offset: usize,
299310
limit: usize,
311+
whitespace_as_null: bool,
300312
) -> FastExcelResult<Self> {
301313
let len = limit.checked_sub(offset).ok_or_else(|| {
302314
FastExcelErrorKind::InvalidParameters(format!(
@@ -311,9 +323,13 @@ impl FastExcelColumn {
311323
DType::Float => {
312324
FastExcelSeries::Float(create_float_vec(data, column_info.index, offset, limit))
313325
}
314-
DType::String => {
315-
FastExcelSeries::String(create_string_vec(data, column_info.index, offset, limit))
316-
}
326+
DType::String => FastExcelSeries::String(create_string_vec(
327+
data,
328+
column_info.index,
329+
offset,
330+
limit,
331+
whitespace_as_null,
332+
)),
317333
DType::Bool => {
318334
FastExcelSeries::Bool(create_boolean_vec(data, column_info.index, offset, limit))
319335
}

0 commit comments

Comments
 (0)