|
3 | 3 | from typing import TYPE_CHECKING
|
4 | 4 | import warnings
|
5 | 5 |
|
| 6 | +import numpy as np |
| 7 | + |
6 | 8 | from pandas._libs import lib
|
7 | 9 | from pandas.compat._optional import import_optional_dependency
|
8 | 10 | from pandas.errors import (
|
|
12 | 14 | from pandas.util._exceptions import find_stack_level
|
13 | 15 |
|
14 | 16 | from pandas.core.dtypes.common import pandas_dtype
|
| 17 | +from pandas.core.dtypes.dtypes import ( |
| 18 | + BaseMaskedDtype, |
| 19 | +) |
15 | 20 | from pandas.core.dtypes.inference import is_integer
|
16 | 21 |
|
| 22 | +from pandas.core.arrays.string_ import StringDtype |
| 23 | + |
17 | 24 | from pandas.io._util import arrow_table_to_pandas
|
18 | 25 | from pandas.io.parsers.base_parser import ParserBase
|
19 | 26 |
|
@@ -140,20 +147,7 @@ def handle_warning(invalid_row) -> str:
|
140 | 147 | "encoding": self.encoding,
|
141 | 148 | }
|
142 | 149 |
|
143 |
| - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: |
144 |
| - """ |
145 |
| - Processes data read in based on kwargs. |
146 |
| -
|
147 |
| - Parameters |
148 |
| - ---------- |
149 |
| - frame: DataFrame |
150 |
| - The DataFrame to process. |
151 |
| -
|
152 |
| - Returns |
153 |
| - ------- |
154 |
| - DataFrame |
155 |
| - The processed DataFrame. |
156 |
| - """ |
| 150 | + def _finalize_column_names(self, frame: DataFrame) -> DataFrame: |
157 | 151 | num_cols = len(frame.columns)
|
158 | 152 | multi_index_named = True
|
159 | 153 | if self.header is None:
|
@@ -196,6 +190,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
|
196 | 190 | if self.header is None and not multi_index_named:
|
197 | 191 | frame.index.names = [None] * len(frame.index.names)
|
198 | 192 |
|
| 193 | + return frame |
| 194 | + |
| 195 | + def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: |
| 196 | + """ |
| 197 | + Processes data read in based on kwargs. |
| 198 | +
|
| 199 | + Parameters |
| 200 | + ---------- |
| 201 | + frame: DataFrame |
| 202 | + The DataFrame to process. |
| 203 | +
|
| 204 | + Returns |
| 205 | + ------- |
| 206 | + DataFrame |
| 207 | + The processed DataFrame. |
| 208 | + """ |
| 209 | + |
199 | 210 | if self.dtype is not None:
|
200 | 211 | # Ignore non-existent columns from dtype mapping
|
201 | 212 | # like other parsers do
|
@@ -282,14 +293,47 @@ def read(self) -> DataFrame:
|
282 | 293 |
|
283 | 294 | table = table.cast(new_schema)
|
284 | 295 |
|
| 296 | + workaround = False |
| 297 | + pass_backend = dtype_backend |
| 298 | + if self.dtype is not None and dtype_backend != "pyarrow": |
| 299 | + # We pass dtype_backend="numpy_nullable" and subsequently cast |
| 300 | + # to avoid lossy conversion e.g. GH#56136 |
| 301 | + workaround = True |
| 302 | + pass_backend = "numpy_nullable" |
| 303 | + |
285 | 304 | with warnings.catch_warnings():
|
286 | 305 | warnings.filterwarnings(
|
287 | 306 | "ignore",
|
288 | 307 | "make_block is deprecated",
|
289 | 308 | DeprecationWarning,
|
290 | 309 | )
|
291 | 310 | frame = arrow_table_to_pandas(
|
292 |
| - table, dtype_backend=dtype_backend, null_to_int64=True |
| 311 | + table, dtype_backend=pass_backend, null_to_int64=True |
293 | 312 | )
|
294 | 313 |
|
| 314 | + frame = self._finalize_column_names(frame) |
| 315 | + |
| 316 | + if workaround and dtype_backend != "numpy_nullable": |
| 317 | + old_dtype = self.dtype |
| 318 | + if not isinstance(old_dtype, dict): |
| 319 | + # e.g. test_categorical_dtype_utf16 |
| 320 | + old_dtype = dict.fromkeys(frame.columns, old_dtype) |
| 321 | + |
| 322 | + # _finalize_pandas_output will call astype, but we need to make |
| 323 | + # sure all keys are populated appropriately. |
| 324 | + new_dtype = {} |
| 325 | + for key in frame.columns: |
| 326 | + ser = frame[key] |
| 327 | + if isinstance(ser.dtype, BaseMaskedDtype): |
| 328 | + new_dtype[key] = ser.dtype.numpy_dtype |
| 329 | + elif isinstance(ser.dtype, StringDtype): |
| 330 | + # We cast here in case the user passed "category" in |
| 331 | + # order to get the correct dtype.categories.dtype |
| 332 | + # e.g. test_categorical_dtype_utf16 |
| 333 | + new_dtype[key] = StringDtype(na_value=np.nan) |
| 334 | + frame[key] = frame[key].astype(new_dtype[key]) |
| 335 | + |
| 336 | + new_dtype.update(old_dtype) |
| 337 | + self.dtype = new_dtype |
| 338 | + |
295 | 339 | return self._finalize_pandas_output(frame)
|
0 commit comments