55from pathlib import Path
66from unittest .mock import Mock , call , patch
77
8+ import numpy as np
89import pandas as pd
910import pytest
1011
@@ -93,7 +94,7 @@ def test_read(self, mock_read_csv, mock_glob):
9394 handler = CSVHandler ()
9495
9596 # Run
96- data = handler .read ('/path/to/data' )
97+ data = handler .read ('/path/to/data' , keep_leading_zeros = False )
9798
9899 # Assert
99100 assert len (data ) == 2
@@ -107,6 +108,74 @@ def test_read(self, mock_read_csv, mock_glob):
107108 data ['child' ], pd .DataFrame ({'col3' : [4 , 5 , 6 ], 'col4' : ['d' , 'e' , 'f' ]})
108109 )
109110
111+ def test_read_keep_leading_zeros_default (self , tmpdir ):
112+ """Test that leading zeros are preserved by default."""
113+ # Setup
114+ file_path = Path (tmpdir )
115+ data = pd .DataFrame ({
116+ 'zip_code' : ['02116' , '10110' ],
117+ 'age' : [30 , 25 ],
118+ })
119+ data .to_csv (file_path / 'users.csv' , index = False )
120+
121+ handler = CSVHandler ()
122+
123+ # Run
124+ out = handler .read (tmpdir , file_names = ['users.csv' ])
125+
126+ # Assert
127+ pd .testing .assert_frame_equal (out ['users' ], data )
128+
129+ def test_read_keep_leading_zeros_multiple_files_mixed_types (self , tmpdir ):
130+ """Test leading zeros with multiple files and mixed dtypes."""
131+ # Setup
132+ file_path = Path (tmpdir )
133+ users = pd .DataFrame ({
134+ 'user_id' : [1 , 2 , None ],
135+ 'zip_code' : ['00123' , '98765' , np .nan ],
136+ 'age' : [30 , 25 , None ],
137+ 'is_active' : [True , False , None ],
138+ 'joined_at' : ['2024-01-01' , '2024-01-02' , None ],
139+ })
140+ orders = pd .DataFrame ({
141+ 'order_id' : [10 , 20 , np .nan ],
142+ 'tracking_code' : ['000045' , '123450' , None ],
143+ 'amount' : [10.5 , 20.0 , None ],
144+ 'discount_rate' : [0.0001 , 0.0015 , np .nan ],
145+ 'notes' : ['first' , 'second' , np .nan ],
146+ })
147+ users .to_csv (file_path / 'users.csv' , index = False )
148+ orders .to_csv (file_path / 'orders.csv' , index = False )
149+
150+ handler = CSVHandler ()
151+
152+ # Run
153+ out = handler .read (tmpdir , file_names = ['users.csv' , 'orders.csv' ])
154+
155+ # Assert
156+ users_expected = users .where (users .notna (), np .nan )
157+ orders_expected = orders .where (orders .notna (), np .nan )
158+ pd .testing .assert_frame_equal (out ['users' ], users_expected )
159+ pd .testing .assert_frame_equal (out ['orders' ], orders_expected )
160+
161+ def test_read_keep_leading_zeros_false (self , tmpdir ):
162+ """Test that leading zeros can be ignored when requested."""
163+ # Setup
164+ file_path = Path (tmpdir )
165+ pd .DataFrame ({
166+ 'zip_code' : ['02116' , '10110' ],
167+ 'age' : [30 , 25 ],
168+ }).to_csv (file_path / 'users.csv' , index = False )
169+
170+ handler = CSVHandler ()
171+
172+ # Run
173+ data = handler .read (tmpdir , file_names = ['users.csv' ], keep_leading_zeros = False )
174+
175+ # Assert
176+ expected = pd .DataFrame ({'zip_code' : [2116 , 10110 ], 'age' : [30 , 25 ]})
177+ pd .testing .assert_frame_equal (data ['users' ], expected )
178+
110179 def test_read_files (self , tmpdir ):
111180 """Test the read method of CSVHandler class with given ``file_names``."""
112181 # Setup
0 commit comments