Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
26 changes: 26 additions & 0 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
)
from pandas.core.dtypes.inference import is_integer

from pandas.core.arrays.arrow.array import to_pyarrow_type

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

Expand Down Expand Up @@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
f"f{n}" for n in self.convert_options["include_columns"]
]

if self.dtype is not None:
if isinstance(self.dtype, dict):
column_types = {}
for col, col_dtype in self.dtype.items():
source_dtype = pandas_dtype(col_dtype)

try:
target_dtype = to_pyarrow_type(source_dtype.type)
if target_dtype:
column_types[col] = target_dtype

except TypeError:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's an example where this happens?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. I seem to remember it failed some test. I can look into it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the try/except block to test the theory, but I'm getting some failures. Not sure if it's the test suite or the change itself. I was getting some recent failures in the test suite anyway... they just don't seem related.

If the test suite will pass, I'm fine leaving it out. I think there was some historical reason for including it, during some of my earlier attempts at making this work.

# TODO: Unsupported dtypes silently ignored - may cause
# unexpected behavior when pyarrow applies default inference
# instead of user's dtype
pass

if column_types:
self.convert_options["column_types"] = column_types
else:
# TODO: Global dtypes not supported - may cause inconsistent behavior
# between engines, especially for leading zero preservation
pass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if they pass a singleton, can we do something like

convert_options["column_types"] = defaultdict(user_passed_dtype)

?

Copy link
Author

@dxdc dxdc Oct 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good thought, but it doesn't work. One of the first things I tried actually :) I documented a larger analysis on pyarrow here: apache/arrow#47502

You can see the relevant portion of pyarrow code here. Everything is mapped back to C++, and if the column name is not found, it uses the default (inferred) option.

https://github.com/apache/arrow/blob/eb9d5194a306f8145f8600b176f3bd391ee4397c/cpp/src/arrow/csv/reader.cc#L675-L682


self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
Expand Down
86 changes: 86 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
).index
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
tm.assert_index_equal(result, expected)


def test_leading_zeros_preserved_with_dtype_str(all_parsers):
# GH#61618: ensure string dtype preservation across engines
parser = all_parsers
engine_name = getattr(parser, "engine", "unknown")

# Skip pyarrow engine as it has its own xfail test
if engine_name == "pyarrow":
pytest.skip("pyarrow engine tested separately with xfail")

data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

result = parser.read_csv(
StringIO(data),
dtype=str,
)

assert result.shape == (4, 4)
assert list(result.columns) == ["col1", "col2", "col3", "col4"]
assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"


@pytest.mark.xfail(
    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)", strict=False
)
def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
    # GH#57666: the pyarrow engine drops leading zeros even when dtype=str
    # is requested; kept as a non-strict xfail until the engine is fixed.
    parser = pyarrow_parser_only

    data = (
        "col1,col2,col3,col4\n"
        "AB,000388907,abc,0150\n"
        "CD,101044572,def,0150\n"
        "EF,000023607,ghi,0205\n"
        "GH,100102040,jkl,0205"
    )

    result = parser.read_csv(
        StringIO(data),
        dtype=str,
    )

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    checks = [
        (0, "col2", "000388907"),
        (2, "col2", "000023607"),
        (0, "col4", "0150"),
        (2, "col4", "0205"),
    ]
    for row, col, expected in checks:
        assert result.loc[row, col] == expected, f"lost zeros in {col} row {row}"


def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
# GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
# GH#61618: further discussion on ensuring string dtype preservation across engines

parser = all_parsers

data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

result = parser.read_csv(
StringIO(data),
dtype={"col2": str, "col3": int, "col4": str},
)

assert result.shape == (4, 4)
assert list(result.columns) == ["col1", "col2", "col3", "col4"]

assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"

assert result.loc[0, "col3"] == 199
assert result.loc[1, "col3"] == 200
assert result.loc[2, "col3"] == 201
assert result.loc[3, "col3"] == 202
Loading