wip: preserve leading zeros when CSVHandler.read loads CSV files

fealho · fealho · commit a48a4afc44f7 · 2026-01-23T19:03:12.000-08:00
diff --git a/sdv/io/local/local.py b/sdv/io/local/local.py
@@ -61,7 +61,49 @@ class CSVHandler(BaseLocalHandler):
     def __init__(self):
         pass
 
-    def read(self, folder_name, file_names=None, read_csv_parameters=None):
+    def _preserve_leading_zeros(self, file_path, table_data, read_csv_parameters):
+        """Reload numeric columns as strings when they contain leading zeros.
+
+        The file is read a second time with every column as ``str`` and otherwise the same
+        row and column options, so both reads yield the same rows in the same order.
+
+        Args:
+            file_path (Path or str):
+                Path to the CSV file being read.
+            table_data (pandas.DataFrame):
+                DataFrame produced by the initial ``read_csv`` call. Modified in place.
+            read_csv_parameters (dict or None):
+                Parameters used for the initial read; reused for the follow-up read.
+
+        Returns:
+            pandas.DataFrame:
+                ``table_data`` with any leading-zero numeric columns replaced by strings.
+        """
+        candidate_columns = [
+            column
+            for column, dtype in table_data.dtypes.items()
+            if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
+        ]
+        if not candidate_columns:
+            return table_data
+
+        # ``usecols`` is deliberately not narrowed to the candidates: doing so breaks a
+        # user-supplied ``index_col`` and can change which rows pandas treats as bad lines.
+        leading_zero_parameters = dict(read_csv_parameters or {})
+        for key in ('dtype', 'converters', 'parse_dates', 'date_parser'):
+            leading_zero_parameters.pop(key, None)
+        string_data = pd.read_csv(file_path, dtype=str, **leading_zero_parameters)
+        if len(string_data) != len(table_data):
+            return table_data  # Reads disagree on row count; leave the data untouched.
+
+        for column in candidate_columns:
+            values = string_data.get(column)
+            if values is not None and values.dropna().astype(str).str.match(r'^0\d+').any():
+                table_data[column] = values.array  # By position; index labels may differ.
+
+        return table_data
+
+    def read(self, folder_name, file_names=None, read_csv_parameters=None, keep_leading_zeros=True):
         """Read data from CSV files and return it along with metadata.
 
         Args:
@@ -75,6 +117,10 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
                 The keys are any of the parameter names of the pandas.read_csv function
                 and the values are your inputs. Defaults to
                 `{'parse_dates': False, 'low_memory': False, 'on_bad_lines': 'warn'}`
+            keep_leading_zeros (bool):
+                Whether to preserve leading zeros by detecting numeric columns that have
+                string values with leading zeros and loading those columns as strings.
+                Defaults to ``True``.
 
         Returns:
             dict:
@@ -123,7 +169,14 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
 
         for file_path in file_paths:
             table_name = file_path.stem  # Remove file extension to get table name
-            data[table_name] = pd.read_csv(file_path, **read_csv_parameters)
+            table_data = pd.read_csv(file_path, **read_csv_parameters)
+
+            if keep_leading_zeros:
+                table_data = self._preserve_leading_zeros(
+                    file_path, table_data, read_csv_parameters
+                )
+
+            data[table_name] = table_data
 
         return data
 
diff --git a/tests/unit/io/local/test_local.py b/tests/unit/io/local/test_local.py
@@ -93,7 +93,7 @@ def test_read(self, mock_read_csv, mock_glob):
         handler = CSVHandler()
 
         # Run
-        data = handler.read('/path/to/data')
+        data = handler.read('/path/to/data', keep_leading_zeros=False)
 
         # Assert
         assert len(data) == 2
@@ -107,6 +107,42 @@ def test_read(self, mock_read_csv, mock_glob):
             data['child'], pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
         )
 
+    def test_read_keep_leading_zeros_default(self, tmpdir):
+        """Check that ``read`` keeps zero-padded values as strings when no flag is passed."""
+        # Setup
+        csv_path = Path(tmpdir) / 'users.csv'
+        source = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
+        source.to_csv(csv_path, index=False)
+
+        instance = CSVHandler()
+
+        # Run
+        tables = instance.read(tmpdir, file_names=['users.csv'])
+        users = tables['users']
+
+        # Assert
+        # The zero-padded column comes back as text; the plain integer column is unchanged.
+        expected_users = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
+        pd.testing.assert_frame_equal(users, expected_users)
+
+    def test_read_keep_leading_zeros_false(self, tmpdir):
+        """Check that ``read`` falls back to numeric parsing when the flag is disabled."""
+        # Setup
+        csv_path = Path(tmpdir) / 'users.csv'
+        source = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
+        source.to_csv(csv_path, index=False)
+
+        instance = CSVHandler()
+
+        # Run
+        tables = instance.read(tmpdir, file_names=['users.csv'], keep_leading_zeros=False)
+        users = tables['users']
+
+        # Assert
+        # With the flag off, the zero-padded values are expected back as plain integers.
+        expected_users = pd.DataFrame({'zip_code': [2116, 10110], 'age': [30, 25]})
+        pd.testing.assert_frame_equal(users, expected_users)
+
     def test_read_files(self, tmpdir):
         """Test the read method of CSVHandler class with given ``file_names``."""
         # Setup