wip: preserve leading zeros when reading CSV files with CSVHandler.read

fealho · fealho · commit 2666bd41fc3a · 2026-01-26T08:46:28.000-08:00
diff --git a/sdv/io/local/local.py b/sdv/io/local/local.py
@@ -61,7 +61,46 @@ class CSVHandler(BaseLocalHandler):
     def __init__(self):
         pass
 
-    def read(self, folder_name, file_names=None, read_csv_parameters=None):
+    def _keep_leading_zeros(self, file_path, table_data, read_csv_parameters):
+        """Reload numeric columns as strings when they contain leading zeros.
+
+        Numeric columns are read a second time with ``dtype=str``. A column is
+        replaced when any raw value starts with ``0`` followed by another digit.
+
+        Args:
+            file_path (Path):
+                Path to the CSV file being read.
+            table_data (pandas.DataFrame):
+                DataFrame produced by ``read_csv`` call. It is updated in place.
+            read_csv_parameters (dict):
+                Parameters used for the initial read, reused for the follow-up read.
+
+        Returns:
+            pandas.DataFrame:
+                The same DataFrame, with leading-zero numeric columns kept as strings.
+        """
+        candidate_columns = [
+            column
+            for column in table_data.columns
+            if pd.api.types.is_numeric_dtype(table_data[column])
+            and not pd.api.types.is_bool_dtype(table_data[column])
+        ]
+        if not candidate_columns:
+            return table_data
+
+        # ``parse_dates`` may name columns missing from ``usecols``, which makes
+        # ``read_csv`` raise; dates are irrelevant here since everything is a string.
+        overrides = {'dtype': str, 'usecols': candidate_columns, 'parse_dates': False}
+        string_data = pd.read_csv(file_path, **{**read_csv_parameters, **overrides})
+
+        for column in candidate_columns:
+            raw_values = string_data[column]
+            if raw_values.dropna().astype(str).str.match(r'^0\d+').any():
+                table_data[column] = raw_values
+
+        return table_data
+
+    def read(self, folder_name, file_names=None, read_csv_parameters=None, keep_leading_zeros=True):
         """Read data from CSV files and return it along with metadata.
 
         Args:
@@ -75,6 +114,10 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
                 The keys are any of the parameter names of the pandas.read_csv function
                 and the values are your inputs. Defaults to
                 `{'parse_dates': False, 'low_memory': False, 'on_bad_lines': 'warn'}`
+            keep_leading_zeros (bool):
+                Whether to keep leading zeros by detecting numeric columns that have
+                string values with leading zeros and loading those columns as strings.
+                Defaults to ``True``.
 
         Returns:
             dict:
@@ -123,7 +166,12 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
 
         for file_path in file_paths:
             table_name = file_path.stem  # Remove file extension to get table name
-            data[table_name] = pd.read_csv(file_path, **read_csv_parameters)
+            table_data = pd.read_csv(file_path, **read_csv_parameters)
+
+            if keep_leading_zeros:
+                table_data = self._keep_leading_zeros(file_path, table_data, read_csv_parameters)
+
+            data[table_name] = table_data
 
         return data
 
diff --git a/tests/unit/io/local/test_local.py b/tests/unit/io/local/test_local.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from unittest.mock import Mock, call, patch
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -93,7 +94,7 @@ def test_read(self, mock_read_csv, mock_glob):
         handler = CSVHandler()
 
         # Run
-        data = handler.read('/path/to/data')
+        data = handler.read('/path/to/data', keep_leading_zeros=False)
 
         # Assert
         assert len(data) == 2
@@ -107,6 +108,74 @@ def test_read(self, mock_read_csv, mock_glob):
             data['child'], pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
         )
 
+    def test_read_keep_leading_zeros_default(self, tmpdir):
+        """Test that ``read`` keeps zero-padded values as strings when not told otherwise."""
+        # Setup
+        csv_path = Path(tmpdir) / 'users.csv'
+        expected_users = pd.DataFrame(
+            {'zip_code': ['02116', '10110'], 'age': [30, 25]},
+        )
+        expected_users.to_csv(csv_path, index=False)
+
+        instance = CSVHandler()
+
+        # Run
+        tables = instance.read(tmpdir, file_names=['users.csv'])
+        loaded_users = tables['users']
+
+        # Assert
+        pd.testing.assert_frame_equal(loaded_users, expected_users)
+
+    def test_read_keep_leading_zeros_multiple_files_mixed_types(self, tmpdir):
+        """Test leading zeros are kept per column across multiple mixed-dtype files."""
+        # Setup
+        folder = Path(tmpdir)
+        tables = {
+            'users': pd.DataFrame({
+                'user_id': [1, 2, None],
+                'zip_code': ['00123', '98765', np.nan],
+                'age': [30, 25, None],
+                'is_active': [True, False, None],
+                'joined_at': ['2024-01-01', '2024-01-02', None],
+            }),
+            'orders': pd.DataFrame({
+                'order_id': [10, 20, np.nan],
+                'tracking_code': ['000045', '123450', None],
+                'amount': [10.5, 20.0, None],
+                'discount_rate': [0.0001, 0.0015, np.nan],
+                'notes': ['first', 'second', np.nan],
+            }),
+        }
+        for table_name, table in tables.items():
+            table.to_csv(folder / f'{table_name}.csv', index=False)
+
+        instance = CSVHandler()
+
+        # Run
+        out = instance.read(tmpdir, file_names=['users.csv', 'orders.csv'])
+
+        # Assert
+        for table_name, table in tables.items():
+            pd.testing.assert_frame_equal(out[table_name], table.where(table.notna(), np.nan))
+
+    def test_read_keep_leading_zeros_false(self, tmpdir):
+        """Test that zero-padded values are parsed as numbers when the option is disabled."""
+        # Setup
+        csv_path = Path(tmpdir) / 'users.csv'
+        source = pd.DataFrame(
+            {'zip_code': ['02116', '10110'], 'age': [30, 25]},
+        )
+        source.to_csv(csv_path, index=False)
+        expected_users = pd.DataFrame({'zip_code': [2116, 10110], 'age': [30, 25]})
+
+        instance = CSVHandler()
+
+        # Run
+        tables = instance.read(tmpdir, file_names=['users.csv'], keep_leading_zeros=False)
+
+        # Assert
+        pd.testing.assert_frame_equal(tables['users'], expected_users)
+
     def test_read_files(self, tmpdir):
         """Test the read method of CSVHandler class with given ``file_names``."""
         # Setup