diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 0b9d45a2efc59..7927439abb510 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -303,9 +303,7 @@ def _extract_multi_indexer_columns(
 
         # clean the index_names
         index_names = header.pop(-1)
-        index_names, _, _ = self._clean_index_names(
-            index_names, self.index_col, self.unnamed_cols
-        )
+        index_names, _, _ = self._clean_index_names(index_names, self.index_col)
 
         # extract the columns
         field_count = len(header[0])
@@ -381,21 +379,24 @@ def _maybe_make_multi_index_columns(
         return columns
 
     @final
-    def _make_index(self, data, alldata, columns, indexnamerow=False):
+    def _make_index(
+        self, data, alldata, columns, indexnamerow=False
+    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
+        index: Index | None
         if not is_index_col(self.index_col) or not self.index_col:
             index = None
 
         elif not self._has_complex_date_col:
-            index = self._get_simple_index(alldata, columns)
-            index = self._agg_index(index)
+            simple_index = self._get_simple_index(alldata, columns)
+            index = self._agg_index(simple_index)
         elif self._has_complex_date_col:
             if not self._name_processed:
                 (self.index_names, _, self.index_col) = self._clean_index_names(
-                    list(columns), self.index_col, self.unnamed_cols
+                    list(columns), self.index_col
                 )
                 self._name_processed = True
-            index = self._get_complex_date_index(data, columns)
-            index = self._agg_index(index, try_parse_dates=False)
+            date_index = self._get_complex_date_index(data, columns)
+            index = self._agg_index(date_index, try_parse_dates=False)
 
         # add names for the index
         if indexnamerow:
@@ -966,7 +967,7 @@ def _validate_usecols_arg(self, usecols):
             return usecols, usecols_dtype
         return usecols, None
 
-    def _clean_index_names(self, columns, index_col, unnamed_cols):
+    def _clean_index_names(self, columns, index_col):
         if not is_index_col(index_col):
             return None, columns, index_col
 
@@ -998,7 +999,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols):
 
         # Only clean index names that were placeholders.
         for i, name in enumerate(index_names):
-            if isinstance(name, str) and name in unnamed_cols:
+            if isinstance(name, str) and name in self.unnamed_cols:
                 index_names[i] = None
 
         return index_names, columns, index_col
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index fc0f572c79e6b..e8909f542f335 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -172,7 +172,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds):
                 self.names,  # type: ignore[has-type]
                 # error: Cannot determine type of 'index_col'
                 self.index_col,  # type: ignore[has-type]
-                self.unnamed_cols,
             )
 
         if self.index_names is None:
@@ -220,6 +219,8 @@ def read(
         Sequence[Hashable] | MultiIndex,
         Mapping[Hashable, ArrayLike],
     ]:
+        index: Index | MultiIndex | None
+        column_names: Sequence[Hashable] | MultiIndex
         try:
             if self.low_memory:
                 chunks = self._reader.read_low_memory(nrows)
@@ -284,7 +285,12 @@ def read(
                 data_tups = sorted(data.items())
                 data = {k: v for k, (i, v) in zip(names, data_tups)}
 
-                names, date_data = self._do_date_conversions(names, data)
+                column_names, date_data = self._do_date_conversions(names, data)
+
+                # maybe create a mi on the columns
+                column_names = self._maybe_make_multi_index_columns(
+                    column_names, self.col_names
+                )
 
         else:
             # rename dict keys
@@ -308,12 +314,9 @@ def read(
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
             names, date_data = self._do_date_conversions(names, data)
-            index, names = self._make_index(date_data, alldata, names)
-
-            # maybe create a mi on the columns
-            conv_names = self._maybe_make_multi_index_columns(names, self.col_names)
+            index, column_names = self._make_index(date_data, alldata, names)
 
-        return index, conv_names, date_data
+        return index, column_names, date_data
 
     def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
         # hackish
@@ -330,7 +333,7 @@ def _get_index_names(self):
 
         if self._reader.leading_cols == 0 and self.index_col is not None:
             (idx_names, names, self.index_col) = self._clean_index_names(
-                names, self.index_col, self.unnamed_cols
+                names, self.index_col
             )
 
         return names, idx_names
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 52fa3be4ff418..68be818f4f3d4 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -13,6 +13,7 @@
     DefaultDict,
     Hashable,
     Iterator,
+    List,
     Literal,
     Mapping,
     Sequence,
@@ -37,6 +38,11 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like
 
+from pandas import (
+    Index,
+    MultiIndex,
+)
+
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -167,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
             )
             self.num = re.compile(regex)
 
-    def _make_reader(self, f) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         sep = self.delimiter
 
         if sep is None or len(sep) == 1:
@@ -198,10 +204,11 @@ class MyDialect(csv.Dialect):
                     self.pos += 1
                     line = f.readline()
                     lines = self._check_comments([[line]])[0]
+                lines_str = cast(List[str], lines)
 
                 # since `line` was a string, lines will be a list containing
                 # only a single string
-                line = lines[0]
+                line = lines_str[0]
 
             self.pos += 1
             self.line_pos += 1
@@ -233,7 +240,11 @@ def _read():
         # TextIOWrapper, mmap, None]")
         self.data = reader  # type: ignore[assignment]
 
-    def read(self, rows: int | None = None):
+    def read(
+        self, rows: int | None = None
+    ) -> tuple[
+        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
+    ]:
         try:
             content = self._get_lines(rows)
         except StopIteration:
@@ -273,9 +284,11 @@ def read(self, rows: int | None = None):
         conv_data = self._convert_data(data)
         columns, conv_data = self._do_date_conversions(columns, conv_data)
 
-        index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)
+        index, result_columns = self._make_index(
+            conv_data, alldata, columns, indexnamerow
+        )
 
-        return index, columns, conv_data
+        return index, result_columns, conv_data
 
     def _exclude_implicit_index(
         self,
@@ -586,7 +599,7 @@ def _handle_usecols(
             self._col_indices = sorted(col_indices)
         return columns
 
-    def _buffered_line(self):
+    def _buffered_line(self) -> list[Scalar]:
         """
         Return a line from buffer, filling buffer if required.
         """
@@ -876,7 +889,9 @@ def _clear_buffer(self) -> None:
 
     _implicit_index = False
 
-    def _get_index_name(self, columns: list[Hashable]):
+    def _get_index_name(
+        self, columns: list[Hashable]
+    ) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]:
         """
         Try several cases to get lines:
 
@@ -941,8 +956,8 @@ def _get_index_name(self, columns: list[Hashable]):
 
         else:
             # Case 2
-            (index_name, columns_, self.index_col) = self._clean_index_names(
-                columns, self.index_col, self.unnamed_cols
+            (index_name, _, self.index_col) = self._clean_index_names(
+                columns, self.index_col
             )
 
         return index_name, orig_names, columns
@@ -1034,7 +1049,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
         ]
         return zipped_content
 
-    def _get_lines(self, rows: int | None = None):
+    def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
         lines = self.buf
         new_rows = None
 
@@ -1131,7 +1146,7 @@ class FixedWidthReader(abc.Iterator):
 
     def __init__(
         self,
-        f: IO[str],
+        f: IO[str] | ReadCsvBuffer[str],
         colspecs: list[tuple[int, int]] | Literal["infer"],
         delimiter: str | None,
         comment: str | None,
@@ -1228,14 +1243,16 @@ def detect_colspecs(
         return edge_pairs
 
     def __next__(self) -> list[str]:
+        # Argument 1 to "next" has incompatible type "Union[IO[str],
+        # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
        if self.buffer is not None:
             try:
                 line = next(self.buffer)
             except StopIteration:
                 self.buffer = None
-                line = next(self.f)
+                line = next(self.f)  # type: ignore[arg-type]
         else:
-            line = next(self.f)
+            line = next(self.f)  # type: ignore[arg-type]
 
         # Note: 'colspecs' is a sequence of half-open intervals.
         return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
@@ -1252,7 +1269,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
         self.infer_nrows = kwds.pop("infer_nrows")
         PythonParser.__init__(self, f, **kwds)
 
-    def _make_reader(self, f: IO[str]) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         self.data = FixedWidthReader(
             f,
             self.colspecs,
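
The central API change above drops the `unnamed_cols` pass-through parameter from `_clean_index_names` and reads `self.unnamed_cols` instead, since every call site (`_extract_multi_indexer_columns`, `_make_index`, `CParserWrapper.__init__`, `_get_index_names`, `PythonParser._get_index_name`) already had the value available on `self`. A minimal sketch of that pattern, using a hypothetical `ToyParser` with simplified bodies, not the actual pandas implementation:

from __future__ import annotations


class ToyParser:
    def __init__(self, unnamed_cols: set[str]) -> None:
        # State that the old signature threaded through as a parameter.
        self.unnamed_cols = unnamed_cols

    # Before: _clean_index_names(self, columns, index_col, unnamed_cols)
    # After:  the trailing parameter is gone; the method reads self.unnamed_cols.
    def _clean_index_names(
        self, columns: list[str], index_col: list[int]
    ) -> tuple[list[str | None], list[str], list[int]]:
        index_names: list[str | None] = [columns[i] for i in index_col]
        remaining = [c for i, c in enumerate(columns) if i not in index_col]
        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None
        return index_names, remaining, index_col


parser = ToyParser(unnamed_cols={"Unnamed: 0"})
names, cols, idx = parser._clean_index_names(["Unnamed: 0", "a", "b"], [0])
assert names == [None] and cols == ["a", "b"]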
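The typing changes follow a second pattern: declare a variable's union type before the branches that assign it (`index: Index | None` in `_make_index`; `index` and `column_names` in `CParserWrapper.read`), and give intermediates with a narrower type their own names (`simple_index`, `date_index`, `result_columns`) so each name keeps a single type. A rough, hypothetical illustration of the shape (not pandas code); declaring the union up front keeps every branch checked against one declared type rather than whatever the checker infers from the first assignment:

from __future__ import annotations

from typing import Hashable, Sequence


def make_index(
    rows: list[tuple[Hashable, ...]], use_first_col: bool
) -> tuple[list[Hashable] | None, list[Sequence[Hashable]]]:
    # Pre-declared unions: each branch below assigns a different member.
    index: list[Hashable] | None
    data: list[Sequence[Hashable]]
    if not use_first_col:
        index = None
        data = list(rows)
    else:
        index = [r[0] for r in rows]
        data = [r[1:] for r in rows]
    return index, data


print(make_index([("a", 1), ("b", 2)], use_first_col=True))
# (['a', 'b'], [(1,), (2,)])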