Skip to content

Commit 9f2be08

Browse files
authored
feat!: support a new skip_whitespace_tail_rows parameter (#425)
* refactor: pass entire LoadSheetOrTable struct around rather than storing every format option individually Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> * feat: support a new skip_whitespace_tail_rows parameter Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> * chore: lint Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> * rename test files Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> --------- Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
1 parent 268dbc2 commit 9f2be08

File tree

17 files changed

+423
-131
lines changed

17 files changed

+423
-131
lines changed

.clippy.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
disallowed-macros = [
2+
{ path = "std::assert_ne", reason = "use `pretty_assertions::assert_ne` instead" },
3+
{ path = "std::assert_eq", reason = "use `pretty_assertions::assert_eq` instead" },
4+
{ path = "std::assert_matches::assert_matches", reason = "use `pretty_assertions::assert_matches` instead" },
5+
]

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@ lint-python:
4646
lint-rust:
4747
cargo fmt --all -- --check
4848
# Rust
49-
cargo clippy --tests
49+
cargo clippy --tests -- -D warnings
5050
# Python-related code
51-
cargo clippy --features __maturin --tests
51+
cargo clippy --features __maturin,__pyo3-tests --tests -- -D warnings
5252
# Rust+polars
53-
cargo clippy --features polars --tests
53+
cargo clippy --features polars --tests -- -D warnings
5454

5555
.PHONY: lint ## Lint rust and python source files
5656
lint: lint-python lint-rust

python/fastexcel/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ def load_sheet(
303303
| None = None,
304304
dtypes: DType | DTypeMap | None = None,
305305
eager: Literal[False] = ...,
306+
skip_whitespace_tail_rows: bool = False,
306307
) -> ExcelSheet: ...
307308

308309
@typing.overload
@@ -323,6 +324,7 @@ def load_sheet(
323324
| None = None,
324325
dtypes: DType | DTypeMap | None = None,
325326
eager: Literal[True] = ...,
327+
skip_whitespace_tail_rows: bool = False,
326328
) -> pa.RecordBatch: ...
327329

328330
def load_sheet(
@@ -342,6 +344,7 @@ def load_sheet(
342344
| None = None,
343345
dtypes: DType | DTypeMap | None = None,
344346
eager: bool = False,
347+
skip_whitespace_tail_rows: bool = False,
345348
) -> ExcelSheet | pa.RecordBatch:
346349
"""Loads a sheet by index or name.
347350
@@ -391,6 +394,8 @@ def load_sheet(
391394
whereas `True` will load it eagerly via `pyarrow`.
392395
393396
Eager loading requires the `pyarrow` extra to be installed.
397+
:param skip_whitespace_tail_rows: Skip rows at the end of the sheet
398+
containing only whitespace and null values.
394399
"""
395400
sheet_or_rb = self._reader.load_sheet(
396401
idx_or_name=idx_or_name,
@@ -403,6 +408,7 @@ def load_sheet(
403408
use_columns=use_columns,
404409
dtypes=dtypes,
405410
eager=eager,
411+
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
406412
)
407413
return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)
408414

@@ -444,6 +450,7 @@ def load_table(
444450
| None = None,
445451
dtypes: DType | DTypeMap | None = None,
446452
eager: Literal[False] = ...,
453+
skip_whitespace_tail_rows: bool = False,
447454
) -> ExcelTable: ...
448455

449456
@typing.overload
@@ -464,6 +471,7 @@ def load_table(
464471
| None = None,
465472
dtypes: DType | DTypeMap | None = None,
466473
eager: Literal[True] = ...,
474+
skip_whitespace_tail_rows: bool = False,
467475
) -> pa.RecordBatch: ...
468476

469477
def load_table(
@@ -483,6 +491,7 @@ def load_table(
483491
| None = None,
484492
dtypes: DType | DTypeMap | None = None,
485493
eager: bool = False,
494+
skip_whitespace_tail_rows: bool = False,
486495
) -> ExcelTable | pa.RecordBatch:
487496
"""Loads a table by name.
488497
@@ -527,6 +536,8 @@ def load_table(
527536
whereas `True` will load it eagerly via `pyarrow`.
528537
529538
Eager loading requires the `pyarrow` extra to be installed.
539+
:param skip_whitespace_tail_rows: Skip rows at the end of the table
540+
containing only whitespace and null values.
530541
"""
531542
if eager:
532543
return self._reader.load_table(
@@ -540,6 +551,7 @@ def load_table(
540551
use_columns=use_columns,
541552
dtypes=dtypes,
542553
eager=True,
554+
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
543555
)
544556
else:
545557
return ExcelTable(
@@ -554,6 +566,7 @@ def load_table(
554566
use_columns=use_columns,
555567
dtypes=dtypes,
556568
eager=False,
569+
skip_whitespace_tail_rows=skip_whitespace_tail_rows,
557570
)
558571
)
559572

python/fastexcel/_fastexcel.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ class _ExcelReader:
201201
| None = None,
202202
dtypes: DType | DTypeMap | None = None,
203203
eager: Literal[False] = ...,
204+
skip_whitespace_tail_rows: bool = False,
204205
) -> _ExcelSheet: ...
205206
@typing.overload
206207
def load_sheet(
@@ -220,6 +221,7 @@ class _ExcelReader:
220221
| None = None,
221222
dtypes: DType | DTypeMap | None = None,
222223
eager: Literal[True] = ...,
224+
skip_whitespace_tail_rows: bool = False,
223225
) -> pa.RecordBatch: ...
224226
@typing.overload
225227
def load_sheet(
@@ -239,6 +241,7 @@ class _ExcelReader:
239241
| None = None,
240242
dtypes: DType | DTypeMap | None = None,
241243
eager: bool = False,
244+
skip_whitespace_tail_rows: bool = False,
242245
) -> pa.RecordBatch: ...
243246
@typing.overload
244247
def load_table(
@@ -258,6 +261,7 @@ class _ExcelReader:
258261
| None = None,
259262
dtypes: DType | DTypeMap | None = None,
260263
eager: Literal[False] = ...,
264+
skip_whitespace_tail_rows: bool = False,
261265
) -> _ExcelTable: ...
262266
@typing.overload
263267
def load_table(
@@ -277,6 +281,7 @@ class _ExcelReader:
277281
| None = None,
278282
dtypes: DType | DTypeMap | None = None,
279283
eager: Literal[True] = ...,
284+
skip_whitespace_tail_rows: bool = False,
280285
) -> pa.RecordBatch: ...
281286
@property
282287
def sheet_names(self) -> list[str]: ...

python/tests/test_whitespace.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import datetime
2+
3+
import fastexcel
4+
import polars as pl
5+
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
6+
from polars.testing import assert_frame_equal as pl_assert_frame_equal
7+
8+
from .utils import path_for_fixture
9+
10+
11+
def test_skip_tail_whitespace_rows() -> None:
12+
"""Test that skip_whitespace_tail_rows option works correctly."""
13+
excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))
14+
15+
# Expected data when NOT skipping whitespace tail rows
16+
expected_with_whitespace = pl.DataFrame(
17+
{
18+
"Column One": ["1", "2", "3", None, "5", None, None, None, None, " "],
19+
"Column Two": ["one", "two", None, "four", "five", None, None, "", None, None],
20+
"Column Three": [
21+
datetime.datetime(2025, 11, 19, 14, 34, 2),
22+
datetime.datetime(2025, 11, 20, 14, 56, 34),
23+
datetime.datetime(2025, 11, 21, 15, 19, 6),
24+
None,
25+
datetime.datetime(2025, 11, 22, 15, 41, 38),
26+
datetime.datetime(2025, 11, 23, 16, 4, 10),
27+
None,
28+
None,
29+
None,
30+
None,
31+
],
32+
}
33+
).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))
34+
35+
# Expected data when skipping whitespace tail rows
36+
expected_without_whitespace = pl.DataFrame(
37+
{
38+
"Column One": [1.0, 2.0, 3.0, None, 5.0, None],
39+
"Column Two": ["one", "two", None, "four", "five", None],
40+
"Column Three": [
41+
datetime.datetime(2025, 11, 19, 14, 34, 2),
42+
datetime.datetime(2025, 11, 20, 14, 56, 34),
43+
datetime.datetime(2025, 11, 21, 15, 19, 6),
44+
None,
45+
datetime.datetime(2025, 11, 22, 15, 41, 38),
46+
datetime.datetime(2025, 11, 23, 16, 4, 10),
47+
],
48+
}
49+
).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))
50+
51+
# Test sheet without skipping whitespace tail rows
52+
sheet_with_whitespace = excel_reader.load_sheet("Without Table")
53+
pl_assert_frame_equal(sheet_with_whitespace.to_polars(), expected_with_whitespace)
54+
55+
# Test table without skipping whitespace tail rows
56+
table_with_whitespace = excel_reader.load_table("Table_with_whitespace")
57+
pl_assert_frame_equal(table_with_whitespace.to_polars(), expected_with_whitespace)
58+
59+
# Test sheet with skipping whitespace tail rows
60+
sheet_without_whitespace = excel_reader.load_sheet(
61+
"Without Table", skip_whitespace_tail_rows=True
62+
)
63+
pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace)
64+
65+
# Test table with skipping whitespace tail rows
66+
table_without_whitespace = excel_reader.load_table(
67+
"Table_with_whitespace", skip_whitespace_tail_rows=True
68+
)
69+
pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace)
70+
71+
# Also verify pandas compatibility
72+
pd_assert_frame_equal(
73+
sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
74+
)
75+
pd_assert_frame_equal(
76+
table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
77+
)

src/data/mod.rs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,17 @@ impl ExcelSheetData<'_> {
6464
}
6565
}
6666
}
67+
68+
pub(crate) fn height_without_tail_whitespace(&self) -> usize {
69+
match self {
70+
ExcelSheetData::Owned(data) => {
71+
height_without_tail_whitespace(data).unwrap_or_else(|| data.height())
72+
}
73+
ExcelSheetData::Ref(data) => {
74+
height_without_tail_whitespace(data).unwrap_or_else(|| data.height())
75+
}
76+
}
77+
}
6778
}
6879

6980
impl From<Range<CalData>> for ExcelSheetData<'_> {
@@ -78,6 +89,55 @@ impl<'a> From<Range<CalDataRef<'a>>> for ExcelSheetData<'a> {
7889
}
7990
}
8091

92+
trait CellIsWhiteSpace {
93+
fn is_whitespace(&self) -> bool;
94+
}
95+
96+
impl<T> CellIsWhiteSpace for T
97+
where
98+
T: DataType,
99+
{
100+
fn is_whitespace(&self) -> bool {
101+
if self.is_empty() {
102+
true
103+
} else if self.is_string()
104+
&& let Some(s) = self.get_string()
105+
{
106+
s.trim().is_empty()
107+
} else {
108+
false
109+
}
110+
}
111+
}
112+
113+
pub(crate) fn height_without_tail_whitespace<CT: CellType + DataType + std::fmt::Debug>(
114+
data: &Range<CT>,
115+
) -> Option<usize> {
116+
let height = data.height();
117+
let width = data.width();
118+
if height < 1 {
119+
return Some(0);
120+
}
121+
if width < 1 {
122+
return None;
123+
}
124+
(0..width)
125+
.map(|col_idx| {
126+
let mut row_idx = height - 1;
127+
// Start at the bottom of the column and work upwards until we find a non-empty cell
128+
while row_idx > 0
129+
&& data
130+
.get((row_idx, col_idx))
131+
.map(CellIsWhiteSpace::is_whitespace)
132+
.unwrap_or(true)
133+
{
134+
row_idx -= 1;
135+
}
136+
row_idx + 1
137+
})
138+
.max()
139+
}
140+
81141
/// A container for a typed vector of values. Used to represent a column of data in an Excel sheet.
82142
/// These should only be used when you need to work on the raw data. Otherwise, you should use a
83143
/// `FastExcelColumn`.
@@ -138,6 +198,12 @@ macro_rules! impl_series_variant {
138198
}
139199
}
140200

201+
impl From<&[Option<$type>]> for FastExcelSeries {
202+
fn from(arr: &[Option<$type>]) -> Self {
203+
Self::$variant(arr.into_iter().map(ToOwned::to_owned).collect())
204+
}
205+
}
206+
141207
// Not implementing is_empty here, because we have no len information for null Series
142208
impl FastExcelSeries {
143209
pub fn $into_fn(self) -> FastExcelResult<Vec<Option<$type>>> {

src/types/dtype/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ pub(crate) fn excel_float_to_string(x: f64) -> String {
294294
#[cfg(test)]
295295
mod tests {
296296
use calamine::{Cell, Data as CalData};
297+
use pretty_assertions::assert_eq;
297298
use rstest::{fixture, rstest};
298299

299300
use super::*;

0 commit comments

Comments
 (0)