Skip to content

Commit a48a4af

Browse files
committed
wip
1 parent 6f05c1f commit a48a4af

File tree

2 files changed

+92
-3
lines changed

2 files changed

+92
-3
lines changed

sdv/io/local/local.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,49 @@ class CSVHandler(BaseLocalHandler):
6161
def __init__(self):
    """Create the handler; no initialization work is performed."""
6363

64-
def read(self, folder_name, file_names=None, read_csv_parameters=None):
64+
def _preserve_leading_zeros(self, file_path, table_data, read_csv_parameters):
    """Reload numeric columns as strings when they contain leading zeros.

    ``pandas.read_csv`` parses a raw value such as ``02116`` as the number
    ``2116``. This helper re-reads every numeric (non-boolean) column of
    ``table_data`` as text and, for each column where at least one raw value
    starts with ``0`` followed by another digit, replaces the parsed column
    with its textual form.

    Args:
        file_path (Path or str):
            Path to the CSV file being read.
        table_data (pandas.DataFrame):
            DataFrame produced by the initial ``read_csv`` call. Affected
            columns are replaced in place.
        read_csv_parameters (dict or None):
            Parameters used for the initial read that will be reused for the
            follow-up read. ``None`` is treated as an empty dict. The dict
            passed in is never modified.

    Returns:
        pandas.DataFrame:
            The updated DataFrame with any leading-zero numeric columns
            preserved as strings.
    """
    candidate_columns = [
        column
        for column in table_data.columns
        if pd.api.types.is_numeric_dtype(table_data[column])
        and not pd.api.types.is_bool_dtype(table_data[column])
    ]
    if not candidate_columns:
        return table_data

    # Work on a copy so the caller's parameters are reused untouched for the
    # remaining files; tolerate ``None`` when the helper is called directly.
    leading_zero_parameters = dict(read_csv_parameters or {})

    # Anything that converts values away from their raw text must not apply
    # to the follow-up read.
    for key in ('dtype', 'converters', 'parse_dates', 'date_parser'):
        leading_zero_parameters.pop(key, None)

    # The follow-up read only loads the candidate columns. An explicit index
    # column is never among them (it lives in ``table_data.index``), so keeping
    # ``index_col`` would make ``read_csv`` fail. ``None``/``False`` request no
    # index column and are left as they are.
    index_col = leading_zero_parameters.get('index_col')
    if index_col is not None and index_col is not False:
        del leading_zero_parameters['index_col']

    leading_zero_parameters['dtype'] = str
    leading_zero_parameters['usecols'] = candidate_columns
    string_data = pd.read_csv(file_path, **leading_zero_parameters)

    # Rows are matched by position below; if the two reads disagree on the
    # number of rows there is no safe way to line them up, so leave the
    # parsed data as it is.
    if len(string_data) != len(table_data):
        return table_data

    for column in candidate_columns:
        series = string_data[column].dropna().astype(str)
        has_leading_zeros = series.str.match(r'^0\d+').any()
        if has_leading_zeros:
            # Re-label with the original index so the assignment is positional
            # instead of aligning two unrelated indexes.
            table_data[column] = string_data[column].set_axis(table_data.index)

    return table_data
105+
106+
def read(self, folder_name, file_names=None, read_csv_parameters=None, keep_leading_zeros=True):
65107
"""Read data from CSV files and return it along with metadata.
66108
67109
Args:
@@ -75,6 +117,10 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
75117
The keys are any of the parameter names of the pandas.read_csv function
76118
and the values are your inputs. Defaults to
77119
`{'parse_dates': False, 'low_memory': False, 'on_bad_lines': 'warn'}`
120+
keep_leading_zeros (bool):
121+
Whether to preserve leading zeros by detecting numeric columns that have
122+
string values with leading zeros and loading those columns as strings.
123+
Defaults to ``True``.
78124
79125
Returns:
80126
dict:
@@ -123,7 +169,14 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
123169

124170
for file_path in file_paths:
125171
table_name = file_path.stem # Remove file extension to get table name
126-
data[table_name] = pd.read_csv(file_path, **read_csv_parameters)
172+
table_data = pd.read_csv(file_path, **read_csv_parameters)
173+
174+
if keep_leading_zeros:
175+
table_data = self._preserve_leading_zeros(
176+
file_path, table_data, read_csv_parameters
177+
)
178+
179+
data[table_name] = table_data
127180

128181
return data
129182

tests/unit/io/local/test_local.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def test_read(self, mock_read_csv, mock_glob):
9393
handler = CSVHandler()
9494

9595
# Run
96-
data = handler.read('/path/to/data')
96+
data = handler.read('/path/to/data', keep_leading_zeros=False)
9797

9898
# Assert
9999
assert len(data) == 2
@@ -107,6 +107,42 @@ def test_read(self, mock_read_csv, mock_glob):
107107
data['child'], pd.DataFrame({'col3': [4, 5, 6], 'col4': ['d', 'e', 'f']})
108108
)
109109

110+
def test_read_keep_leading_zeros_default(self, tmpdir):
    """Check that a plain ``read`` call keeps leading zeros."""
    # Setup
    csv_path = Path(tmpdir) / 'users.csv'
    users = pd.DataFrame({
        'zip_code': ['02116', '10110'],
        'age': [30, 25],
    })
    users.to_csv(csv_path, index=False)
    csv_handler = CSVHandler()

    # Run
    tables = csv_handler.read(tmpdir, file_names=['users.csv'])

    # Assert
    expected_users = pd.DataFrame({'zip_code': ['02116', '10110'], 'age': [30, 25]})
    pd.testing.assert_frame_equal(tables['users'], expected_users)
127+
128+
def test_read_keep_leading_zeros_false(self, tmpdir):
    """Check that ``keep_leading_zeros=False`` leaves the columns numeric."""
    # Setup
    csv_path = Path(tmpdir) / 'users.csv'
    users = pd.DataFrame({
        'zip_code': ['02116', '10110'],
        'age': [30, 25],
    })
    users.to_csv(csv_path, index=False)
    csv_handler = CSVHandler()

    # Run
    tables = csv_handler.read(
        tmpdir,
        file_names=['users.csv'],
        keep_leading_zeros=False,
    )

    # Assert
    expected_users = pd.DataFrame({'zip_code': [2116, 10110], 'age': [30, 25]})
    pd.testing.assert_frame_equal(tables['users'], expected_users)
145+
110146
def test_read_files(self, tmpdir):
111147
"""Test the read method of CSVHandler class with given ``file_names``."""
112148
# Setup

0 commit comments

Comments
 (0)