Skip to content

Commit 874c188

Browse files
lukapeschke and Copilot authored
fix: allow using use_columns with load_table when column_names is not specified (#437)
* fix: allow using `use_columns` with `load_table` when `column_names` is not specified closes #436 Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> * Update src/types/exceltable/mod.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 70e42b8 commit 874c188

File tree

5 files changed

+437
-8
lines changed

5 files changed

+437
-8
lines changed

python/tests/test_column_selection.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,3 +703,157 @@ def test_use_columns_dtypes_eager_loading(
703703

704704
assert pd_df.columns.to_list() == use_columns
705705
assert pl_df.columns == use_columns
706+
707+
708+
def test_use_columns_with_table() -> None:
709+
excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
710+
711+
table = excel_reader.load_table("users", use_columns=["User Id", "FirstName"])
712+
713+
expected_available_columns = [
714+
fastexcel.ColumnInfo(
715+
name="User Id",
716+
index=0,
717+
absolute_index=0,
718+
dtype="float",
719+
column_name_from="provided",
720+
dtype_from="guessed",
721+
),
722+
fastexcel.ColumnInfo(
723+
name="FirstName",
724+
index=1,
725+
absolute_index=1,
726+
dtype="string",
727+
column_name_from="provided",
728+
dtype_from="guessed",
729+
),
730+
fastexcel.ColumnInfo(
731+
name="__UNNAMED__2",
732+
index=2,
733+
absolute_index=2,
734+
dtype="string",
735+
column_name_from="generated",
736+
dtype_from="guessed",
737+
),
738+
fastexcel.ColumnInfo(
739+
name="__UNNAMED__3",
740+
index=3,
741+
absolute_index=3,
742+
dtype="datetime",
743+
column_name_from="generated",
744+
dtype_from="guessed",
745+
),
746+
]
747+
748+
expected_selected_columns = [
749+
fastexcel.ColumnInfo(
750+
name="User Id",
751+
index=0,
752+
absolute_index=0,
753+
dtype="float",
754+
column_name_from="provided",
755+
dtype_from="guessed",
756+
),
757+
fastexcel.ColumnInfo(
758+
name="FirstName",
759+
index=1,
760+
absolute_index=1,
761+
dtype="string",
762+
column_name_from="provided",
763+
dtype_from="guessed",
764+
),
765+
]
766+
767+
assert table.available_columns() == expected_available_columns
768+
assert table.selected_columns == expected_selected_columns
769+
770+
expected_pl_df = pl.DataFrame(
771+
{"User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"]}
772+
)
773+
expected_pd_df = pd.DataFrame(
774+
{"User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"]}
775+
)
776+
777+
pl_df = table.to_polars()
778+
pl_assert_frame_equal(pl_df, expected_pl_df)
779+
780+
pd_df = table.to_pandas()
781+
pd_assert_frame_equal(pd_df, expected_pd_df)
782+
783+
784+
def test_use_columns_with_table_and_provided_columns() -> None:
785+
excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
786+
787+
table = excel_reader.load_table(
788+
"users", use_columns=[0, 2], column_names=["user_id", "last_name"]
789+
)
790+
791+
expected_available_columns = [
792+
fastexcel.ColumnInfo(
793+
name="user_id",
794+
index=0,
795+
absolute_index=0,
796+
dtype="float",
797+
column_name_from="provided",
798+
dtype_from="guessed",
799+
),
800+
fastexcel.ColumnInfo(
801+
name="__UNNAMED__1",
802+
index=1,
803+
absolute_index=1,
804+
dtype="string",
805+
column_name_from="generated",
806+
dtype_from="guessed",
807+
),
808+
fastexcel.ColumnInfo(
809+
name="last_name",
810+
index=2,
811+
absolute_index=2,
812+
dtype="string",
813+
column_name_from="provided",
814+
dtype_from="guessed",
815+
),
816+
fastexcel.ColumnInfo(
817+
name="__UNNAMED__3",
818+
index=3,
819+
absolute_index=3,
820+
dtype="datetime",
821+
column_name_from="generated",
822+
dtype_from="guessed",
823+
),
824+
]
825+
826+
expected_selected_columns = [
827+
fastexcel.ColumnInfo(
828+
name="user_id",
829+
index=0,
830+
absolute_index=0,
831+
dtype="float",
832+
column_name_from="provided",
833+
dtype_from="guessed",
834+
),
835+
fastexcel.ColumnInfo(
836+
name="last_name",
837+
index=2,
838+
absolute_index=2,
839+
dtype="string",
840+
column_name_from="provided",
841+
dtype_from="guessed",
842+
),
843+
]
844+
845+
assert table.available_columns() == expected_available_columns
846+
assert table.selected_columns == expected_selected_columns
847+
848+
expected_pl_df = pl.DataFrame(
849+
{"user_id": [1.0, 2.0, 5.0], "last_name": ["Müller", "Meier", "Fricker"]}
850+
)
851+
expected_pd_df = pd.DataFrame(
852+
{"user_id": [1.0, 2.0, 5.0], "last_name": ["Müller", "Meier", "Fricker"]}
853+
)
854+
855+
pl_df = table.to_polars()
856+
pl_assert_frame_equal(pl_df, expected_pl_df)
857+
858+
pd_df = table.to_pandas()
859+
pd_assert_frame_equal(pd_df, expected_pd_df)

src/types/excelsheet/column_info/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ fn column_info_from_header<D: CalamineDataProvider>(
375375
if let SelectedColumns::Selection(column_selection) = selected_columns {
376376
if column_selection.len() != names.len() {
377377
return Err(FastExcelErrorKind::InvalidParameters(
378-
"column_names and use_columns must have the same length".to_string(),
378+
"column_names and use_columns must have the same length when a header is provided".to_string(),
379379
)
380380
.into());
381381
}

src/types/exceltable/mod.rs

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ use polars_core::frame::DataFrame;
88
use pyo3::pyclass;
99

1010
use crate::{
11-
FastExcelColumn, LoadSheetOrTableOptions,
11+
FastExcelColumn, FastExcelErrorKind, IdxOrName, LoadSheetOrTableOptions, SelectedColumns,
1212
data::height_without_tail_whitespace,
13-
error::FastExcelResult,
13+
error::{ErrorContext, FastExcelResult},
1414
types::{
1515
dtype::DTypes,
1616
excelsheet::{
@@ -42,17 +42,66 @@ pub struct ExcelTable {
4242
}
4343

4444
impl ExcelTable {
45+
/// Builds a `Header` for a table. This might update the column selection, if provided
46+
fn build_header_and_update_selection(
47+
table: &Table<Data>,
48+
mut opts: LoadSheetOrTableOptions,
49+
) -> FastExcelResult<(Header, LoadSheetOrTableOptions)> {
50+
Ok(match (&opts.column_names, opts.header_row) {
51+
(None, None) => {
52+
let mut table_columns: Vec<String> = table.columns().into();
53+
// If there is a column selection, we need to convert all elements to column
54+
// indices. This is required because we will be providing the header, and it
55+
// is required to use an index-based selection when custom column names are provided
56+
if let SelectedColumns::Selection(selected_columns) = &opts.selected_columns {
57+
let selected_column_indices = selected_columns
58+
.iter()
59+
.map(|idx_or_name| match idx_or_name {
60+
IdxOrName::Idx(idx) => Ok(*idx),
61+
IdxOrName::Name(name) => table_columns
62+
.iter()
63+
.enumerate()
64+
.find_map(|(idx, col_name)| {
65+
(col_name.as_str() == name.as_str()).then_some(idx)
66+
})
67+
.ok_or_else(|| {
68+
FastExcelErrorKind::ColumnNotFound(name.clone().into()).into()
69+
})
70+
.with_context(|| {
71+
format!("available columns are: {table_columns:?}")
72+
}),
73+
})
74+
.collect::<FastExcelResult<Vec<usize>>>()?;
75+
76+
table_columns = table_columns
77+
.into_iter()
78+
.enumerate()
79+
.filter_map(|(idx, col_name)| {
80+
selected_column_indices.contains(&idx).then_some(col_name)
81+
})
82+
.collect();
83+
84+
opts = opts.selected_columns(SelectedColumns::Selection(
85+
selected_column_indices
86+
.into_iter()
87+
.map(Into::into)
88+
.collect(),
89+
))
90+
}
91+
(Header::With(table_columns), opts)
92+
}
93+
(None, Some(row)) => (Header::At(row), opts),
94+
(Some(column_names), _) => (Header::With(column_names.clone()), opts),
95+
})
96+
}
97+
4598
pub(crate) fn try_new(
4699
table: Table<Data>,
47100
opts: LoadSheetOrTableOptions,
48101
) -> FastExcelResult<Self> {
49102
let pagination = Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, table.data())?;
50103

51-
let header = match (opts.column_names.clone(), opts.header_row) {
52-
(None, None) => Header::With(table.columns().into()),
53-
(None, Some(row)) => Header::At(row),
54-
(Some(column_names), _) => Header::With(column_names),
55-
};
104+
let (header, opts) = Self::build_header_and_update_selection(&table, opts)?;
56105

57106
let available_columns_info =
58107
build_available_columns_info(table.data(), &opts.selected_columns, &header)?;

0 commit comments

Comments
 (0)