pandas-dev · dxdc · Sep 2, 2025 · Sep 25, 2025 · Sep 27, 2025 · Oct 1, 2025
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1124,6 +1124,7 @@ I/O
 - Bug in :meth:`read_csv` with ``c`` and ``python`` engines reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
+- Bug in :meth:`read_csv` with ``engine="pyarrow"`` not preserving leading zeros in columns mapped to ``str`` by a dictionary ``dtype`` (:issue:`57666`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -19,6 +19,8 @@
 )
 from pandas.core.dtypes.inference import is_integer
 
+from pandas.core.arrays.arrow.array import to_pyarrow_type
+
 from pandas.io._util import arrow_table_to_pandas
 from pandas.io.parsers.base_parser import ParserBase
 
@@ -139,6 +141,25 @@ def handle_warning(invalid_row) -> str:
                 f"f{n}" for n in self.convert_options["include_columns"]
             ]
 
+        if isinstance(self.dtype, dict):
+            # Forward per-column dtypes as pyarrow "column_types" so those columns
+            # are not left to pyarrow's type inference (GH#57666).
+            # TODO: Unsupported dtypes silently ignored - may cause unexpected
+            # behavior when pyarrow applies default inference instead of user's
+            # dtype
+            # TODO: Global dtypes not supported - may cause inconsistent behavior
+            # between engines, especially for leading zero preservation
+            column_types = {}
+            for col, col_dtype in self.dtype.items():
+                source_dtype = pandas_dtype(col_dtype)
+                target_dtype = to_pyarrow_type(source_dtype.type)
+                # Compare against None instead of relying on the object's truthiness
+                if target_dtype is not None:
+                    column_types[col] = target_dtype
+
+            if column_types:
+                self.convert_options["column_types"] = column_types
+
         self.read_options = {
             "autogenerate_column_names": self.header is None,
             "skip_rows": self.header

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -638,3 +638,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
     ).index
     expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
     tm.assert_index_equal(result, expected)
+
+
+def test_leading_zeros_preserved_with_dtype_str(all_parsers):
+    # GH#61618: with dtype=str, numeric-looking values must keep leading zeros
+    parser = all_parsers
+    engine_name = getattr(parser, "engine", "unknown")
+
+    # pyarrow is skipped here; its xfail variant of this test covers it (GH#57666)
+    if engine_name == "pyarrow":
+        pytest.skip("pyarrow engine tested separately with xfail")
+
+    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+@pytest.mark.xfail(
+    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)", strict=False
+)
+def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
+    # GH#57666: pyarrow engine strips leading zeros when a global dtype=str is passed
+    # Known issue; xfail is non-strict, so an unexpected pass (XPASS) will not fail
+    parser = pyarrow_parser_only
+
+    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
+    # GH#57666: a dict dtype mapping columns to str must keep their leading zeros,
+    # while col3 (mapped to int) must still compare equal to ints; see also GH#61618
+
+    parser = all_parsers
+
+    data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"col2": str, "col3": int, "col4": str},
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+    assert result.loc[0, "col3"] == 199
+    assert result.loc[1, "col3"] == 200
+    assert result.loc[2, "col3"] == 201
+    assert result.loc[3, "col3"] == 202