Skip to content

Commit 2666bd4

Browse files
committed
wip
1 parent 6f05c1f commit 2666bd4

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

sdv/io/local/local.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,46 @@ class CSVHandler(BaseLocalHandler):
6161
def __init__(self):
    """Create the handler; the constructor performs no setup."""
    return None
6363

64-
def read(self, folder_name, file_names=None, read_csv_parameters=None):
64+
def _keep_leading_zeros(self, file_path, table_data, read_csv_parameters):
    """Reload numeric columns as strings when they contain leading zeros.

    Every non-boolean numeric column of ``table_data`` is read a second time
    from ``file_path`` with ``dtype=str``. A column is replaced by its string
    version when at least one non-null raw value starts with ``0`` followed
    by another digit (for example ``02116``). Values such as ``0`` or ``0.5``
    do not trigger the replacement.

    Args:
        file_path (Path):
            Path to the CSV file being read.
        table_data (pandas.DataFrame):
            DataFrame produced by ``read_csv`` call. It is modified in place.
        read_csv_parameters (dict):
            Parameters used for the initial read that will be reused for the
            follow-up read. ``dtype``, ``usecols``, ``parse_dates`` and
            ``index_col`` are overridden for the follow-up read only; the
            dictionary itself is not modified. ``None`` is treated as an
            empty dictionary.

    Returns:
        pandas.DataFrame:
            The updated DataFrame with any leading-zero numeric columns
            preserved as strings.
    """
    candidate_columns = [
        column
        for column in table_data.columns
        if pd.api.types.is_numeric_dtype(table_data[column])
        and not pd.api.types.is_bool_dtype(table_data[column])
    ]
    if not candidate_columns:
        return table_data

    leading_zero_parameters = dict(read_csv_parameters or {})
    leading_zero_parameters['dtype'] = str
    leading_zero_parameters['usecols'] = candidate_columns

    # Only the candidate columns are loaded, so options that point at other
    # columns must not be forwarded: a date column or a named index column
    # would be missing from ``usecols`` and a positional index would refer
    # to the wrong column.
    leading_zero_parameters['parse_dates'] = False
    index_col = leading_zero_parameters.get('index_col')
    if index_col is not None and index_col is not False:
        # ``index_col=False`` is kept because it changes how rows are parsed.
        del leading_zero_parameters['index_col']

    string_data = pd.read_csv(file_path, **leading_zero_parameters)
    if len(string_data) != len(table_data):
        # The two reads disagree on the rows, so matching them by position
        # is not trustworthy; leave the numeric data untouched.
        return table_data

    # Both reads see the same rows in the same order. Sharing the index makes
    # the column assignment below positional instead of label based.
    string_data.index = table_data.index

    for column in candidate_columns:
        series = string_data[column].dropna().astype(str)
        has_leading_zeros = series.str.match(r'^0\d+').any()
        if has_leading_zeros:
            table_data[column] = string_data[column]

    return table_data
102+
103+
def read(self, folder_name, file_names=None, read_csv_parameters=None, keep_leading_zeros=True):
65104
"""Read data from CSV files and return it along with metadata.
66105
67106
Args:
@@ -75,6 +114,10 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
75114
The keys are any of the parameter names of the pandas.read_csv function
76115
and the values are your inputs. Defaults to
77116
`{'parse_dates': False, 'low_memory': False, 'on_bad_lines': 'warn'}`
117+
keep_leading_zeros (bool):
118+
Whether to keep leading zeros by detecting numeric columns that have
119+
string values with leading zeros and loading those columns as strings.
120+
Defaults to ``True``.
78121
79122
Returns:
80123
dict:
@@ -123,7 +166,12 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
123166

124167
for file_path in file_paths:
125168
table_name = file_path.stem # Remove file extension to get table name
126-
data[table_name] = pd.read_csv(file_path, **read_csv_parameters)
169+
table_data = pd.read_csv(file_path, **read_csv_parameters)
170+
171+
if keep_leading_zeros:
172+
table_data = self._keep_leading_zeros(file_path, table_data, read_csv_parameters)
173+
174+
data[table_name] = table_data
127175

128176
return data
129177

tests/unit/io/local/test_local.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
from unittest.mock import Mock, call, patch
77

8+
import numpy as np
89
import pandas as pd
910
import pytest
1011

@@ -93,7 +94,7 @@ def test_read(self, mock_read_csv, mock_glob):
9394
handler = CSVHandler()
9495

9596
# Run
96-
data = handler.read('/path/to/data')
97+
data = handler.read('/path/to/data', keep_leading_zeros=False)
9798

9899
# Assert
99100
assert len(data) == 2
@@ -107,6 +108,74 @@ def test_read(self, mock_read_csv, mock_glob):
107108
data['child'], pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
108109
)
109110

111+
def test_read_keep_leading_zeros_default(self, tmpdir):
    """Check that ``read`` keeps leading zeros when the flag is left unset."""
    # Setup
    expected = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
    csv_path = Path(tmpdir) / 'users.csv'
    expected.to_csv(csv_path, index=False)
    handler = CSVHandler()

    # Run
    result = handler.read(tmpdir, file_names=['users.csv'])

    # Assert
    pd.testing.assert_frame_equal(result['users'], expected)
128+
129+
def test_read_keep_leading_zeros_multiple_files_mixed_types(self, tmpdir):
    """Check leading-zero handling across two files with mixed dtypes and nulls."""
    # Setup
    tables = {
        'users': pd.DataFrame({
            'user_id': [1, 2, None],
            'zip_code': ['00123', '98765', np.nan],
            'age': [30, 25, None],
            'is_active': [True, False, None],
            'joined_at': ['2024-01-01', '2024-01-02', None],
        }),
        'orders': pd.DataFrame({
            'order_id': [10, 20, np.nan],
            'tracking_code': ['000045', '123450', None],
            'amount': [10.5, 20.0, None],
            'discount_rate': [0.0001, 0.0015, np.nan],
            'notes': ['first', 'second', np.nan],
        }),
    }
    folder = Path(tmpdir)
    for table_name, frame in tables.items():
        frame.to_csv(folder / f'{table_name}.csv', index=False)

    handler = CSVHandler()

    # Run
    out = handler.read(tmpdir, file_names=['users.csv', 'orders.csv'])

    # Assert
    for table_name, frame in tables.items():
        # Normalize every missing value to NaN before comparing.
        expected = frame.where(frame.notna(), np.nan)
        pd.testing.assert_frame_equal(out[table_name], expected)
160+
161+
def test_read_keep_leading_zeros_false(self, tmpdir):
    """Check that ``keep_leading_zeros=False`` leaves the columns numeric."""
    # Setup
    source = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
    source.to_csv(Path(tmpdir) / 'users.csv', index=False)
    handler = CSVHandler()

    # Run
    data = handler.read(tmpdir, file_names=['users.csv'], keep_leading_zeros=False)

    # Assert
    expected = pd.DataFrame({'zip_code': [2116, 10110], 'age': [30, 25]})
    pd.testing.assert_frame_equal(data['users'], expected)
178+
110179
def test_read_files(self, tmpdir):
111180
"""Test the read method of CSVHandler class with given ``file_names``."""
112181
# Setup

0 commit comments

Comments
 (0)