Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,7 @@ def _extract_multi_indexer_columns(

# clean the index_names
index_names = header.pop(-1)
index_names, _, _ = self._clean_index_names(
index_names, self.index_col, self.unnamed_cols
)
index_names, _, _ = self._clean_index_names(index_names, self.index_col)

# extract the columns
field_count = len(header[0])
Expand Down Expand Up @@ -381,21 +379,24 @@ def _maybe_make_multi_index_columns(
return columns

@final
def _make_index(self, data, alldata, columns, indexnamerow=False):
def _make_index(
self, data, alldata, columns, indexnamerow=False
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
index: Index | None
if not is_index_col(self.index_col) or not self.index_col:
index = None

elif not self._has_complex_date_col:
index = self._get_simple_index(alldata, columns)
index = self._agg_index(index)
simple_index = self._get_simple_index(alldata, columns)
index = self._agg_index(simple_index)
elif self._has_complex_date_col:
if not self._name_processed:
(self.index_names, _, self.index_col) = self._clean_index_names(
list(columns), self.index_col, self.unnamed_cols
list(columns), self.index_col
)
self._name_processed = True
index = self._get_complex_date_index(data, columns)
index = self._agg_index(index, try_parse_dates=False)
date_index = self._get_complex_date_index(data, columns)
index = self._agg_index(date_index, try_parse_dates=False)

# add names for the index
if indexnamerow:
Expand Down Expand Up @@ -966,7 +967,7 @@ def _validate_usecols_arg(self, usecols):
return usecols, usecols_dtype
return usecols, None

def _clean_index_names(self, columns, index_col, unnamed_cols):
def _clean_index_names(self, columns, index_col):
if not is_index_col(index_col):
return None, columns, index_col

Expand Down Expand Up @@ -998,7 +999,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols):

# Only clean index names that were placeholders.
for i, name in enumerate(index_names):
if isinstance(name, str) and name in unnamed_cols:
if isinstance(name, str) and name in self.unnamed_cols:
index_names[i] = None

return index_names, columns, index_col
Expand Down
19 changes: 11 additions & 8 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds):
self.names, # type: ignore[has-type]
# error: Cannot determine type of 'index_col'
self.index_col, # type: ignore[has-type]
self.unnamed_cols,
)

if self.index_names is None:
Expand Down Expand Up @@ -220,6 +219,8 @@ def read(
Sequence[Hashable] | MultiIndex,
Mapping[Hashable, ArrayLike],
]:
index: Index | MultiIndex | None
column_names: Sequence[Hashable] | MultiIndex
try:
if self.low_memory:
chunks = self._reader.read_low_memory(nrows)
Expand Down Expand Up @@ -284,7 +285,12 @@ def read(
data_tups = sorted(data.items())
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, date_data = self._do_date_conversions(names, data)
column_names, date_data = self._do_date_conversions(names, data)

# maybe create a mi on the columns
column_names = self._maybe_make_multi_index_columns(
column_names, self.col_names
)

else:
# rename dict keys
Expand All @@ -308,12 +314,9 @@ def read(
data = {k: v for k, (i, v) in zip(names, data_tups)}

names, date_data = self._do_date_conversions(names, data)
index, names = self._make_index(date_data, alldata, names)

# maybe create a mi on the columns
conv_names = self._maybe_make_multi_index_columns(names, self.col_names)
index, column_names = self._make_index(date_data, alldata, names)

return index, conv_names, date_data
return index, column_names, date_data

def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# hackish
Expand All @@ -330,7 +333,7 @@ def _get_index_names(self):

if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names, self.index_col) = self._clean_index_names(
names, self.index_col, self.unnamed_cols
names, self.index_col
)

return names, idx_names
Expand Down
45 changes: 31 additions & 14 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DefaultDict,
Hashable,
Iterator,
List,
Literal,
Mapping,
Sequence,
Expand All @@ -37,6 +38,11 @@
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import is_dict_like

from pandas import (
Index,
MultiIndex,
)

from pandas.io.parsers.base_parser import (
ParserBase,
parser_defaults,
Expand Down Expand Up @@ -167,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
)
self.num = re.compile(regex)

def _make_reader(self, f) -> None:
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
sep = self.delimiter

if sep is None or len(sep) == 1:
Expand Down Expand Up @@ -198,10 +204,11 @@ class MyDialect(csv.Dialect):
self.pos += 1
line = f.readline()
lines = self._check_comments([[line]])[0]
lines_str = cast(List[str], lines)

# since `line` was a string, lines will be a list containing
# only a single string
line = lines[0]
line = lines_str[0]

self.pos += 1
self.line_pos += 1
Expand Down Expand Up @@ -233,7 +240,11 @@ def _read():
# TextIOWrapper, mmap, None]")
self.data = reader # type: ignore[assignment]

def read(self, rows: int | None = None):
def read(
self, rows: int | None = None
) -> tuple[
Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
]:
try:
content = self._get_lines(rows)
except StopIteration:
Expand Down Expand Up @@ -273,9 +284,11 @@ def read(self, rows: int | None = None):
conv_data = self._convert_data(data)
columns, conv_data = self._do_date_conversions(columns, conv_data)

index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)
index, result_columns = self._make_index(
conv_data, alldata, columns, indexnamerow
)

return index, columns, conv_data
return index, result_columns, conv_data

def _exclude_implicit_index(
self,
Expand Down Expand Up @@ -586,7 +599,7 @@ def _handle_usecols(
self._col_indices = sorted(col_indices)
return columns

def _buffered_line(self):
def _buffered_line(self) -> list[Scalar]:
"""
Return a line from buffer, filling buffer if required.
"""
Expand Down Expand Up @@ -876,7 +889,9 @@ def _clear_buffer(self) -> None:

_implicit_index = False

def _get_index_name(self, columns: list[Hashable]):
def _get_index_name(
self, columns: list[Hashable]
) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]:
"""
Try several cases to get lines:

Expand Down Expand Up @@ -941,8 +956,8 @@ def _get_index_name(self, columns: list[Hashable]):

else:
# Case 2
(index_name, columns_, self.index_col) = self._clean_index_names(
columns, self.index_col, self.unnamed_cols
(index_name, _, self.index_col) = self._clean_index_names(
columns, self.index_col
)

return index_name, orig_names, columns
Expand Down Expand Up @@ -1034,7 +1049,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
]
return zipped_content

def _get_lines(self, rows: int | None = None):
def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
lines = self.buf
new_rows = None

Expand Down Expand Up @@ -1131,7 +1146,7 @@ class FixedWidthReader(abc.Iterator):

def __init__(
self,
f: IO[str],
f: IO[str] | ReadCsvBuffer[str],
colspecs: list[tuple[int, int]] | Literal["infer"],
delimiter: str | None,
comment: str | None,
Expand Down Expand Up @@ -1228,14 +1243,16 @@ def detect_colspecs(
return edge_pairs

def __next__(self) -> list[str]:
# Argument 1 to "next" has incompatible type "Union[IO[str],
# ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
if self.buffer is not None:
try:
line = next(self.buffer)
except StopIteration:
self.buffer = None
line = next(self.f)
line = next(self.f) # type: ignore[arg-type]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@twoertwein Does ReadCsvBuffer support `__next__` in addition to `__iter__`?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand: As long as .__iter__() returns an object that has __next__, I don't think the file object itself has to have __next__.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm weird, mypy complains about ReadCsvBuffer[str]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry, you are right! I think only read_fwf calls next explicitly, right? read_csv does not explicitly call next. The following works with read_csv:

from io import StringIO

import pandas as pd


class Test:
    """Minimal file-like object wrapping a StringIO, used to demonstrate
    that ``pd.read_csv`` only needs the duck-typed subset of the file
    protocol (no ``__next__`` required, only ``__iter__``).
    """

    def __init__(self):
        self.buffer = StringIO("a,b,c\n0,1,2")

    @property
    def mode(self):
        # NOTE(review): io.StringIO has no ``mode`` attribute, so this
        # raises AttributeError if actually accessed — confirm whether
        # read_csv ever touches it.
        return self.buffer.mode

    def fileno(self):
        return self.buffer.fileno()

    def seek(self, __offset, __whence=-1):
        # BUG FIX: the original called ``self.seek`` recursively in both
        # branches, recursing forever. Delegate to the underlying buffer,
        # treating -1 as "whence not given".
        if __whence == -1:
            return self.buffer.seek(__offset)
        return self.buffer.seek(__offset, __whence)

    def seekable(self):
        return self.buffer.seekable()

    def tell(self):
        return self.buffer.tell()

    def read(self, __n=None):
        # BUG FIX: the original discarded the data read from the buffer.
        return self.buffer.read(__n)

    def __iter__(self):
        return self.buffer.__iter__()

    def readline(self):
        return self.buffer.readline()

    @property
    def closed(self):
        # BUG FIX: ``StringIO.closed`` is a property (bool), not a
        # callable — calling it raised TypeError.
        return self.buffer.closed


print(pd.read_csv(Test(), engine="python"))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep I think this is only called from read_fwf.

Can we add `__next__` to ReadCsvBuffer without breaking anything (from a correctness standpoint)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be fine with adding it or (probably better) creating ReadFwfBuffer which inherits from ReadCsvBuffer (not sure whether it needs everything that is in ReadCsvBuffer).

Also happy to have the ignores for now and address this in a later PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets go with the ignore for now then until we have typed more. Can probably make a better decision then

else:
line = next(self.f)
line = next(self.f) # type: ignore[arg-type]
# Note: 'colspecs' is a sequence of half-open intervals.
return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]

Expand All @@ -1252,7 +1269,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
self.infer_nrows = kwds.pop("infer_nrows")
PythonParser.__init__(self, f, **kwds)

def _make_reader(self, f: IO[str]) -> None:
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
self.data = FixedWidthReader(
f,
self.colspecs,
Expand Down