Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,9 +377,7 @@ def _extract_multi_indexer_columns(

# clean the index_names
index_names = header.pop(-1)
index_names, _, _ = self._clean_index_names(
index_names, self.index_col, self.unnamed_cols
)
index_names, _, _ = self._clean_index_names(index_names, self.index_col)

# extract the columns
field_count = len(header[0])
Expand Down Expand Up @@ -455,21 +453,24 @@ def _maybe_make_multi_index_columns(
return columns

@final
def _make_index(self, data, alldata, columns, indexnamerow=False):
def _make_index(
self, data, alldata, columns, indexnamerow=False
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
index: Index | None
if not is_index_col(self.index_col) or not self.index_col:
index = None

elif not self._has_complex_date_col:
index = self._get_simple_index(alldata, columns)
index = self._agg_index(index)
simple_index = self._get_simple_index(alldata, columns)
index = self._agg_index(simple_index)
elif self._has_complex_date_col:
if not self._name_processed:
(self.index_names, _, self.index_col) = self._clean_index_names(
list(columns), self.index_col, self.unnamed_cols
list(columns), self.index_col
)
self._name_processed = True
index = self._get_complex_date_index(data, columns)
index = self._agg_index(index, try_parse_dates=False)
date_index = self._get_complex_date_index(data, columns)
index = self._agg_index(date_index, try_parse_dates=False)

# add names for the index
if indexnamerow:
Expand Down Expand Up @@ -1040,7 +1041,7 @@ def _validate_usecols_arg(self, usecols):
return usecols, usecols_dtype
return usecols, None

def _clean_index_names(self, columns, index_col, unnamed_cols):
def _clean_index_names(self, columns, index_col):
if not is_index_col(index_col):
return None, columns, index_col

Expand Down Expand Up @@ -1072,7 +1073,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols):

# Only clean index names that were placeholders.
for i, name in enumerate(index_names):
if isinstance(name, str) and name in unnamed_cols:
if isinstance(name, str) and name in self.unnamed_cols:
index_names[i] = None

return index_names, columns, index_col
Expand Down
19 changes: 11 additions & 8 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def __init__(
self.names, # type: ignore[has-type]
# error: Cannot determine type of 'index_col'
self.index_col, # type: ignore[has-type]
self.unnamed_cols,
)

if self.index_names is None:
Expand Down Expand Up @@ -233,6 +232,8 @@ def read(
Sequence[Hashable] | MultiIndex,
Mapping[Hashable, ArrayLike],
]:
index: Index | MultiIndex | None
column_names: Sequence[Hashable] | MultiIndex
try:
if self.low_memory:
chunks = self._reader.read_low_memory(nrows)
Expand Down Expand Up @@ -297,7 +298,12 @@ def read(
data_tups = sorted(data.items())
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, date_data = self._do_date_conversions(names, data)
column_names, date_data = self._do_date_conversions(names, data)

# maybe create a mi on the columns
column_names = self._maybe_make_multi_index_columns(
column_names, self.col_names
)

else:
# rename dict keys
Expand All @@ -321,12 +327,9 @@ def read(
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, date_data = self._do_date_conversions(names, data)
index, names = self._make_index(date_data, alldata, names)

# maybe create a mi on the columns
conv_names = self._maybe_make_multi_index_columns(names, self.col_names)
index, column_names = self._make_index(date_data, alldata, names)

return index, conv_names, date_data
return index, column_names, date_data

def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# hackish
Expand All @@ -343,7 +346,7 @@ def _get_index_names(self):

if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names, self.index_col) = self._clean_index_names(
names, self.index_col, self.unnamed_cols
names, self.index_col
)

return names, idx_names
Expand Down
37 changes: 26 additions & 11 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DefaultDict,
Hashable,
Iterator,
List,
Literal,
Mapping,
Sequence,
Expand All @@ -38,6 +39,11 @@
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import is_dict_like

from pandas import (
Index,
MultiIndex,
)

from pandas.io.parsers.base_parser import (
ParserBase,
parser_defaults,
Expand Down Expand Up @@ -180,7 +186,7 @@ def __init__(
)
self.num = re.compile(regex)

def _make_reader(self, f) -> None:
def _make_reader(self, f: IO[str]) -> None:
sep = self.delimiter

if sep is None or len(sep) == 1:
Expand All @@ -205,16 +211,17 @@ class MyDialect(csv.Dialect):
else:
# attempt to sniff the delimiter from the first valid line,
# i.e. no comment line and not in skiprows
line = f.readline()
line: str = f.readline()
lines = self._check_comments([[line]])[0]
while self.skipfunc(self.pos) or not lines:
self.pos += 1
line = f.readline()
lines = self._check_comments([[line]])[0]
lines_str = cast(List[str], lines)

# since `line` was a string, lines will be a list containing
# only a single string
line = lines[0]
line = lines_str[0]

self.pos += 1
self.line_pos += 1
Expand Down Expand Up @@ -246,7 +253,11 @@ def _read():
# TextIOWrapper, mmap, None]")
self.data = reader # type: ignore[assignment]

def read(self, rows: int | None = None):
def read(
self, rows: int | None = None
) -> tuple[
Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
]:
try:
content = self._get_lines(rows)
except StopIteration:
Expand Down Expand Up @@ -286,9 +297,11 @@ def read(self, rows: int | None = None):
conv_data = self._convert_data(data)
columns, conv_data = self._do_date_conversions(columns, conv_data)

index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)
index, result_columns = self._make_index(
conv_data, alldata, columns, indexnamerow
)

return index, columns, conv_data
return index, result_columns, conv_data

def _exclude_implicit_index(
self,
Expand Down Expand Up @@ -599,7 +612,7 @@ def _handle_usecols(
self._col_indices = sorted(col_indices)
return columns

def _buffered_line(self):
def _buffered_line(self) -> list[Scalar]:
"""
Return a line from buffer, filling buffer if required.
"""
Expand Down Expand Up @@ -889,7 +902,9 @@ def _clear_buffer(self) -> None:

_implicit_index = False

def _get_index_name(self, columns: list[Hashable]):
def _get_index_name(
self, columns: list[Hashable]
) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]:
"""
Try several cases to get lines:

Expand Down Expand Up @@ -954,8 +969,8 @@ def _get_index_name(self, columns: list[Hashable]):

else:
# Case 2
(index_name, columns_, self.index_col) = self._clean_index_names(
columns, self.index_col, self.unnamed_cols
(index_name, _, self.index_col) = self._clean_index_names(
columns, self.index_col
)

return index_name, orig_names, columns
Expand Down Expand Up @@ -1043,7 +1058,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
]
return zipped_content

def _get_lines(self, rows: int | None = None):
def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
lines = self.buf
new_rows = None

Expand Down