diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e7473aabdff87..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -28,27 +28,19 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, ) from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -58,12 +50,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -86,7 +75,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -127,7 +115,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -145,12 +132,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -181,58 +162,12 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. - - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -404,9 +339,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -420,7 +358,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: @@ -451,90 +389,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: return index - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." - ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result - @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: Sequence[Hashable] @@ -580,6 +434,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -705,80 +560,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - values_str = [str(val) for val in values] - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values_str, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -799,16 +580,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -903,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. - raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1042,40 +782,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -1118,43 +855,7 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. @@ -1191,3 +892,99 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. + raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..b59a778624c49 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,7 +30,9 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -160,7 +162,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' @@ -344,9 +346,12 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( + values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..05fe963e9b2b7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -27,20 +29,39 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -53,13 +74,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -157,7 +178,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -370,6 +390,165 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values_str, + dtype=cast_type, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: