@@ -61,7 +61,49 @@ class CSVHandler(BaseLocalHandler):
def __init__(self):
    """Create the handler; no instance state is initialised here."""
6363
64- def read (self , folder_name , file_names = None , read_csv_parameters = None ):
def _preserve_leading_zeros(self, file_path, table_data, read_csv_parameters):
    """Reload numeric columns as strings when they contain leading zeros.

    The CSV is read a second time, restricted to the numeric (non-boolean)
    columns of ``table_data`` and with every value parsed as a string. Any
    column whose raw text contains a value starting with ``0`` followed by at
    least one more digit replaces the numeric column in ``table_data``.

    Args:
        file_path (Path or str):
            Path to the CSV file being read.
        table_data (pandas.DataFrame):
            DataFrame produced by the initial ``read_csv`` call. It is
            updated in place.
        read_csv_parameters (dict or None):
            Parameters used for the initial read. A copy is reused for the
            follow-up read with ``dtype``, ``converters``, ``parse_dates``,
            ``date_parser`` and any ``index_col`` selection removed, and with
            ``usecols`` replaced by the numeric columns. The dict passed in
            is not modified. ``None`` is treated as an empty dict.

    Returns:
        pandas.DataFrame:
            The updated DataFrame with any leading-zero numeric columns
            preserved as strings.
    """
    import warnings

    candidate_columns = [
        column
        for column in table_data.columns
        if pd.api.types.is_numeric_dtype(table_data[column])
        and not pd.api.types.is_bool_dtype(table_data[column])
    ]
    if not candidate_columns:
        return table_data

    leading_zero_parameters = dict(read_csv_parameters or {})
    for key in ('dtype', 'converters', 'parse_dates', 'date_parser'):
        leading_zero_parameters.pop(key, None)

    # The index column is never one of the candidate data columns, so keeping
    # ``index_col`` while narrowing ``usecols`` would make the follow-up read
    # fail or use the wrong column as the index. ``index_col=False`` is a
    # parsing switch rather than a column selection, so it is kept. Identity
    # checks are used because ``0 == False`` in Python.
    index_col = leading_zero_parameters.get('index_col')
    if index_col is not None and index_col is not False:
        del leading_zero_parameters['index_col']

    leading_zero_parameters['dtype'] = str
    leading_zero_parameters['usecols'] = candidate_columns
    string_data = pd.read_csv(file_path, **leading_zero_parameters)

    # Values are written back by position below, which is only meaningful
    # when both reads produced the same rows. If the row counts differ (for
    # example because malformed lines were handled differently once the
    # columns were restricted), leave the data untouched rather than
    # attaching strings to the wrong rows.
    if len(string_data) != len(table_data):
        warnings.warn(
            f"Could not preserve leading zeros for '{file_path}': the string "
            're-read returned a different number of rows than the initial read.'
        )
        return table_data

    for column in candidate_columns:
        raw_values = string_data[column]
        has_leading_zeros = raw_values.dropna().astype(str).str.match(r'^0\d+').any()
        if has_leading_zeros:
            # Positional assignment: ``table_data`` may carry a custom index
            # that the follow-up frame (default RangeIndex) does not share.
            table_data[column] = raw_values.to_numpy()

    return table_data
105+
106+ def read (self , folder_name , file_names = None , read_csv_parameters = None , keep_leading_zeros = True ):
65107 """Read data from CSV files and return it along with metadata.
66108
67109 Args:
@@ -75,6 +117,10 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
75117 The keys are any of the parameter names of the pandas.read_csv function
76118 and the values are your inputs. Defaults to
77119 `{'parse_dates': False, 'low_memory': False, 'on_bad_lines': 'warn'}`
120+ keep_leading_zeros (bool):
121+ Whether to preserve leading zeros by detecting numeric columns that have
122+ string values with leading zeros and loading those columns as strings.
123+ Defaults to ``True``.
78124
79125 Returns:
80126 dict:
@@ -123,7 +169,14 @@ def read(self, folder_name, file_names=None, read_csv_parameters=None):
123169
124170 for file_path in file_paths :
125171 table_name = file_path .stem # Remove file extension to get table name
126- data [table_name ] = pd .read_csv (file_path , ** read_csv_parameters )
172+ table_data = pd .read_csv (file_path , ** read_csv_parameters )
173+
174+ if keep_leading_zeros :
175+ table_data = self ._preserve_leading_zeros (
176+ file_path , table_data , read_csv_parameters
177+ )
178+
179+ data [table_name ] = table_data
127180
128181 return data
129182
0 commit comments