Skip to content

Commit f0fdf3f

Browse files
authored
feat: do not require full range for usecols (#368)
1 parent 9d9c6f4 commit f0fdf3f

File tree

3 files changed

+315
-27
lines changed

3 files changed

+315
-27
lines changed

python/fastexcel/__init__.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,12 @@ def load_sheet(
327327
- A list of strings and ints, the column names and/or indices
328328
(starting at 0)
329329
- A string, a comma-separated list of Excel column letters and column
330-
ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
331-
`A,B,C,D,E` and `A,C,E,F`)
330+
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
331+
`A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
332+
(e.g. `"B:"` to select all columns from B onwards) and from-beginning
333+
ranges (e.g. `":C"` to select columns from A to C). These can be
334+
combined for "except" patterns (e.g. `":C,E:"` to select everything
335+
except column D)
332336
- A callable, a function that takes a column and returns a boolean
333337
indicating whether the column should be used
334338
:param dtypes: An optional dtype (for all columns)
@@ -443,8 +447,12 @@ def load_table(
443447
- A list of strings and ints, the column names and/or indices
444448
(starting at 0)
445449
- A string, a comma-separated list of Excel column letters and column
446-
ranges (e.g. `“A:E”` or `“A,C,E:F”`, which would result in
447-
`A,B,C,D,E` and `A,C,E,F`)
450+
ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
451+
`A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
452+
(e.g. `"B:"` to select all columns from B onwards) and from-beginning
453+
ranges (e.g. `":C"` to select columns from A to C). These can be
454+
combined for "except" patterns (e.g. `":C,E:"` to select everything
455+
except column D)
448456
- A callable, a function that takes a column and returns a boolean
449457
indicating whether the column should be used
450458
:param dtypes: An optional dtype (for all columns)

python/tests/test_column_selection.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,138 @@ def test_single_sheet_with_unnamed_columns_and_str_range(
303303
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
304304

305305

306+
def test_single_sheet_with_unnamed_columns_and_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check that the open-ended range ``"B:"`` selects B and every later column."""
    # "B:" should yield columns B, C, D and E, i.e. indices 1, 2, 3 and 4.
    wanted_names = ("__UNNAMED__1", "col3", "__UNNAMED__3", "col5")
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in wanted_names
    }
    all_columns = sheet_with_unnamed_columns_expected_column_info

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="B:"
    )

    assert loaded.selected_columns == all_columns[1:]
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
325+
326+
327+
def test_single_sheet_with_unnamed_columns_and_open_ended_range_from_start(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check that the open-ended range ``"A:"`` selects every column."""
    # Starting the open-ended range at A leaves nothing out.
    expected_data = single_sheet_with_unnamed_columns_expected
    all_columns = sheet_with_unnamed_columns_expected_column_info

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A:"
    )

    assert loaded.selected_columns == all_columns
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
342+
343+
344+
def test_single_sheet_with_unnamed_columns_and_mixed_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check that a single letter can be combined with an open-ended range."""
    # "A,C:" should yield column A plus C onwards, i.e. indices 0, 2, 3 and 4.
    wanted_names = ("col1", "col3", "__UNNAMED__3", "col5")
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in wanted_names
    }
    all_columns = sheet_with_unnamed_columns_expected_column_info
    # Column B (index 1) is the only one left out.
    expected_selection = [all_columns[0], *all_columns[2:]]

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A,C:"
    )

    assert loaded.selected_columns == expected_selection
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
366+
367+
368+
def test_single_sheet_with_unnamed_columns_and_from_beginning_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check that the from-beginning range ``":C"`` selects columns A through C."""
    # ":C" should yield columns A, B and C, i.e. indices 0, 1 and 2.
    wanted_names = ("col1", "__UNNAMED__1", "col3")
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in wanted_names
    }
    all_columns = sheet_with_unnamed_columns_expected_column_info

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=":C"
    )

    assert loaded.selected_columns == all_columns[:3]
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
387+
388+
389+
def test_single_sheet_with_unnamed_columns_and_from_beginning_range_single_column(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check that the from-beginning range ``":A"`` selects only column A."""
    # ":A" starts and ends on the first column, so only index 0 is kept.
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name == "col1"
    }
    all_columns = sheet_with_unnamed_columns_expected_column_info

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=":A"
    )

    assert loaded.selected_columns == [all_columns[0]]
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
406+
407+
408+
def test_single_sheet_with_unnamed_columns_and_complex_mixed_pattern(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Check a pattern mixing single letters with both kinds of partial range."""
    # "A,:B,D,E:" expands to A, then A and B (from ":B"), then D, then E (from "E:").
    # With the duplicate A removed, the expected selection is A, B, D, E
    # (indices 0, 1, 3 and 4).
    wanted_names = ("col1", "__UNNAMED__1", "__UNNAMED__3", "col5")
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in wanted_names
    }
    all_columns = sheet_with_unnamed_columns_expected_column_info
    # Indices of columns A, B, D and E respectively.
    expected_selection = [all_columns[idx] for idx in (0, 1, 3, 4)]

    loaded = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A,:B,D,E:"
    )

    assert loaded.selected_columns == expected_selection
    assert loaded.available_columns() == all_columns
    pd_assert_frame_equal(loaded.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected_data))
436+
437+
306438
def test_single_sheet_invalid_column_indices_negative_integer(
307439
excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
308440
) -> None:

0 commit comments

Comments
 (0)