Skip to content

Commit 13952f8

Browse files
committed
Clean usecol and date processing
1 parent 7664117 commit 13952f8

File tree

3 files changed

+54
-59
lines changed

3 files changed

+54
-59
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ def _clean_mapping(self, mapping):
324324
def _agg_index(self, index) -> Index:
325325
arrays = []
326326
converters = self._clean_mapping(self.converters)
327+
clean_dtypes = self._clean_mapping(self.dtype)
327328

328329
for i, arr in enumerate(index):
329330
if self._should_parse_dates(i):
@@ -352,8 +353,6 @@ def _agg_index(self, index) -> Index:
352353
else:
353354
col_na_values, col_na_fvalues = set(), set()
354355

355-
clean_dtypes = self._clean_mapping(self.dtype)
356-
357356
cast_type = None
358357
index_converter = False
359358
if self.index_names is not None:
@@ -620,35 +619,6 @@ def _check_data_length(
620619
stacklevel=find_stack_level(),
621620
)
622621

623-
@overload
624-
def _evaluate_usecols(
625-
self,
626-
usecols: Callable[[Hashable], object],
627-
names: Iterable[Hashable],
628-
) -> set[int]: ...
629-
630-
@overload
631-
def _evaluate_usecols(
632-
self, usecols: SequenceT, names: Iterable[Hashable]
633-
) -> SequenceT: ...
634-
635-
@final
636-
def _evaluate_usecols(
637-
self,
638-
usecols: Callable[[Hashable], object] | SequenceT,
639-
names: Iterable[Hashable],
640-
) -> SequenceT | set[int]:
641-
"""
642-
Check whether or not the 'usecols' parameter
643-
is a callable. If so, enumerates the 'names'
644-
parameter and returns a set of indices for
645-
each entry in 'names' that evaluates to True.
646-
If not a callable, returns 'usecols'.
647-
"""
648-
if callable(usecols):
649-
return {i for i, name in enumerate(names) if usecols(name)}
650-
return usecols
651-
652622
@final
653623
def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
654624
"""
@@ -976,3 +946,32 @@ def _validate_usecols_arg(usecols):
976946

977947
return usecols, usecols_dtype
978948
return usecols, None
949+
950+
951+
@overload
952+
def evaluate_callable_usecols(
953+
usecols: Callable[[Hashable], object],
954+
names: Iterable[Hashable],
955+
) -> set[int]: ...
956+
957+
958+
@overload
959+
def evaluate_callable_usecols(
960+
usecols: SequenceT, names: Iterable[Hashable]
961+
) -> SequenceT: ...
962+
963+
964+
def evaluate_callable_usecols(
965+
usecols: Callable[[Hashable], object] | SequenceT,
966+
names: Iterable[Hashable],
967+
) -> SequenceT | set[int]:
968+
"""
969+
Check whether or not the 'usecols' parameter
970+
is a callable. If so, enumerates the 'names'
971+
parameter and returns a set of indices for
972+
each entry in 'names' that evaluates to True.
973+
If not a callable, returns 'usecols'.
974+
"""
975+
if callable(usecols):
976+
return {i for i, name in enumerate(names) if usecols(name)}
977+
return usecols

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
ParserBase,
3232
ParserError,
3333
date_converter,
34+
evaluate_callable_usecols,
3435
is_index_col,
3536
validate_parse_dates_presence,
3637
)
@@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
133134
self.orig_names = self.names[:] # type: ignore[has-type]
134135

135136
if self.usecols:
136-
usecols = self._evaluate_usecols(self.usecols, self.orig_names)
137+
usecols = evaluate_callable_usecols(self.usecols, self.orig_names)
137138

138139
# GH 14671
139140
# assert for mypy, orig_names is List or None, None would error in issubset
@@ -256,8 +257,7 @@ def read(
256257
columns, self.col_names
257258
)
258259

259-
if self.usecols is not None:
260-
columns = self._filter_usecols(columns)
260+
columns = _filter_usecols(self.usecols, columns)
261261

262262
col_dict = {k: v for k, v in col_dict.items() if k in columns}
263263

@@ -290,13 +290,21 @@ def read(
290290
else:
291291
values = data.pop(self.index_col[i])
292292

293-
values = self._maybe_parse_dates(values, i)
293+
if self._should_parse_dates(i):
294+
values = date_converter(
295+
values,
296+
col=self.index_names[i]
297+
if self.index_names is not None
298+
else None,
299+
dayfirst=self.dayfirst,
300+
cache_dates=self.cache_dates,
301+
date_format=self.date_format,
302+
)
294303
arrays.append(values)
295304

296305
index = ensure_index_from_sequences(arrays)
297306

298-
if self.usecols is not None:
299-
names = self._filter_usecols(names)
307+
names = _filter_usecols(self.usecols, names)
300308

301309
names = dedup_names(names, is_potential_multi_index(names, self.index_col))
302310

@@ -320,8 +328,7 @@ def read(
320328
names = list(self.orig_names)
321329
names = dedup_names(names, is_potential_multi_index(names, self.index_col))
322330

323-
if self.usecols is not None:
324-
names = self._filter_usecols(names)
331+
names = _filter_usecols(self.usecols, names)
325332

326333
# columns as list
327334
alldata = [x[1] for x in data_tups]
@@ -335,25 +342,13 @@ def read(
335342

336343
return index, column_names, date_data
337344

338-
def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]:
339-
# hackish
340-
usecols = self._evaluate_usecols(self.usecols, names)
341-
if usecols is not None and len(names) != len(usecols):
342-
return [
343-
name for i, name in enumerate(names) if i in usecols or name in usecols
344-
]
345-
return names
346-
347-
def _maybe_parse_dates(self, values, index: int):
348-
if self._should_parse_dates(index):
349-
values = date_converter(
350-
values,
351-
col=self.index_names[index] if self.index_names is not None else None,
352-
dayfirst=self.dayfirst,
353-
cache_dates=self.cache_dates,
354-
date_format=self.date_format,
355-
)
356-
return values
345+
346+
def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]:
347+
# hackish
348+
usecols = evaluate_callable_usecols(usecols, names)
349+
if usecols is not None and len(names) != len(usecols):
350+
return [name for i, name in enumerate(names) if i in usecols or name in usecols]
351+
return names
357352

358353

359354
def _concatenate_chunks(

pandas/io/parsers/python_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
)
6060
from pandas.io.parsers.base_parser import (
6161
ParserBase,
62+
evaluate_callable_usecols,
6263
get_na_values,
6364
parser_defaults,
6465
validate_parse_dates_presence,
@@ -774,7 +775,7 @@ def _handle_usecols(
774775
col_indices: set[int] | list[int]
775776
if self.usecols is not None:
776777
if callable(self.usecols):
777-
col_indices = self._evaluate_usecols(self.usecols, usecols_key)
778+
col_indices = evaluate_callable_usecols(self.usecols, usecols_key)
778779
elif any(isinstance(u, str) for u in self.usecols):
779780
if len(columns) > 1:
780781
raise ValueError(

0 commit comments

Comments
 (0)