4444
4545class TestDSVReader (unittest .TestCase ):
4646 def test_csv_reader_local (self ):
47+ """Test basic CSV reading with standard format."""
4748 csv_content = "id,name,age\n 1,Alice,25\n 2,Bob,30\n "
4849
4950 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".csv" , delete = False ) as tmp :
@@ -69,6 +70,7 @@ def test_csv_reader_local(self):
6970 os .unlink (tmp_path )
7071
7172 def test_tsv_reader_local (self ):
73+ """Test basic TSV reading with tab separator."""
7274 tsv_content = "id\t name\t age\n 1\t Alice\t 25\n 2\t Bob\t 30\n "
7375
7476 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".tsv" , delete = False ) as tmp :
@@ -93,6 +95,7 @@ def test_tsv_reader_local(self):
9395 os .unlink (tmp_path )
9496
9597 def test_csv_with_gzip_compression (self ):
98+ """Test CSV reading with gzip compression."""
9699 csv_content = "id,name\n 1,Test\n "
97100
98101 with tempfile .NamedTemporaryFile (suffix = ".csv.gz" , delete = False ) as tmp :
@@ -142,6 +145,7 @@ def test_malformed_quoted_csv(self):
142145 os .unlink (tmp_path )
143146
144147 def test_custom_separator (self ):
148+ """Test CSV reading with custom separator."""
145149 custom_csv = "id;name;age\n 1;Alice;25\n "
146150
147151 with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".csv" , delete = False ) as tmp :
@@ -167,6 +171,7 @@ def test_custom_separator(self):
167171
168172 @patch ("pandas.read_csv" )
169173 def test_gcs_csv_reading (self , mock_read_csv ):
174+ """Test GCS CSV reading with mocked pandas."""
170175 mock_df = pd .DataFrame ({"id" : [1 ], "name" : ["Test" ]})
171176
172177 def mock_read_csv_impl (* args , ** kwargs ):
@@ -196,6 +201,7 @@ def __exit__(self, *args):
196201 @patch ("pandas.read_csv" )
197202 @patch ("metadata.readers.dataframe.dsv.return_s3_storage_options" )
198203 def test_s3_csv_reading (self , mock_storage_opts , mock_read_csv ):
204+ """Test S3 CSV reading with mocked pandas."""
199205 mock_storage_opts .return_value = {}
200206 mock_df = pd .DataFrame ({"id" : [1 ], "name" : ["Test" ]})
201207
@@ -229,6 +235,7 @@ def __exit__(self, *args):
229235 @patch ("pandas.read_csv" )
230236 @patch ("metadata.readers.dataframe.dsv.return_azure_storage_options" )
231237 def test_azure_csv_reading (self , mock_storage_opts , mock_read_csv ):
238+ """Test Azure CSV reading with mocked pandas."""
232239 mock_storage_opts .return_value = {"connection_string" : "test" }
233240 mock_df = pd .DataFrame ({"id" : [1 ], "name" : ["Test" ]})
234241
@@ -259,6 +266,141 @@ def __exit__(self, *args):
259266 chunks = list (dataframes )
260267 self .assertEqual (len (chunks ), 1 )
261268
def test_csv_standard_with_special_characters(self):
    """Read a standard CSV containing quoted commas and empty fields.

    The fixture has a header and three data rows:

    * row 1 is fully populated and has commas inside a quoted address;
    * row 2 has an empty address and a comma inside the quoted notes;
    * row 3 only carries an ``id``; the remaining fields are empty
      (the last one is an explicitly quoted empty string).

    Empty fields are expected to come back as missing values, which is
    checked with ``pd.isna``.
    """
    import os

    # Each record starts directly at its first field: no whitespace follows
    # the newline escapes, and the content ends with a single newline, so
    # the test does not rely on the parser trimming leading blanks or
    # skipping a whitespace-only trailing line.
    csv_content = (
        "id,name,address,notes\n"
        '1,"John Doe","123 Main St, City, State","Active customer"\n'
        '2,"Jane Smith",,"VIP status, priority"\n'
        '3,,,""\n'
    )

    # delete=False keeps the file on disk after the handle is closed so the
    # reader can open it by path; it is removed in the finally block.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        tmp.write(csv_content)
        tmp_path = tmp.name

    try:
        config = LocalConfig()
        reader = CSVDataFrameReader(config, None)

        result = reader._read(key=tmp_path, bucket_name="")

        chunks = list(result.dataframes())
        self.assertEqual(len(chunks), 1)

        df = chunks[0]
        self.assertEqual(df.shape, (3, 4))

        # Row 1: standard values with commas in quoted field
        first = df.iloc[0]
        self.assertEqual(first["id"], 1)
        self.assertEqual(first["name"], "John Doe")
        self.assertEqual(first["address"], "123 Main St, City, State")
        self.assertEqual(first["notes"], "Active customer")

        # Row 2: empty address, comma in notes
        second = df.iloc[1]
        self.assertEqual(second["id"], 2)
        self.assertEqual(second["name"], "Jane Smith")
        self.assertTrue(pd.isna(second["address"]))
        self.assertEqual(second["notes"], "VIP status, priority")

        # Row 3: mostly empty
        third = df.iloc[2]
        self.assertEqual(third["id"], 3)
        self.assertTrue(pd.isna(third["name"]))
        self.assertTrue(pd.isna(third["address"]))
        self.assertTrue(pd.isna(third["notes"]))
    finally:
        os.unlink(tmp_path)
308+
def test_csv_complex_escaping_backslash_and_double_quote(self):
    """Read a CSV that mixes backslash-escaped and doubled quotes.

    The fixture uses both quoting conventions in one file, sometimes in
    the same field: a quote preceded by a backslash, and a quote written
    twice. The assertions expect both forms to decode to a single literal
    quote, and a doubled backslash to decode to a single backslash.
    """
    import os

    # The escapes must sit directly against the character they escape: a
    # space between the backslash and the quote would escape the space
    # instead and let the bare quote terminate the field early.
    csv_content = (
        "product,quantity,description,metadata\n"
        '"Part A",5,"Interlocked Flexible Metal Conduit, Galvanized, 50mm dia. (2\\"), Normal","Stock: ""In warehouse"""\n'
        '"Component B",10,"Value with \\"quote\\" and, comma","Status: ""Active"" and \\"Ready\\""\n'
        '"Item C",3,"Windows path: C:\\\\Users\\\\data.txt","Mix of ""both"" styles"\n'
    )

    # delete=False keeps the file on disk after the handle is closed so the
    # reader can open it by path; it is removed in the finally block.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        tmp.write(csv_content)
        tmp_path = tmp.name

    try:
        config = LocalConfig()
        reader = CSVDataFrameReader(config, None)

        result = reader._read(key=tmp_path, bucket_name="")

        chunks = list(result.dataframes())
        self.assertEqual(len(chunks), 1)

        df = chunks[0]
        self.assertEqual(df.shape, (3, 4))

        # Row 1: backslash-escaped quote in description, double-quote in metadata
        first = df.iloc[0]
        self.assertEqual(first["product"], "Part A")
        self.assertEqual(first["quantity"], 5)
        self.assertEqual(
            first["description"],
            'Interlocked Flexible Metal Conduit, Galvanized, 50mm dia. (2"), Normal',
        )
        self.assertEqual(first["metadata"], 'Stock: "In warehouse"')

        # Row 2: both backslash and double-quote escaping in same fields
        second = df.iloc[1]
        self.assertEqual(second["product"], "Component B")
        self.assertEqual(second["quantity"], 10)
        self.assertEqual(second["description"], 'Value with "quote" and, comma')
        self.assertEqual(second["metadata"], 'Status: "Active" and "Ready"')

        # Row 3: Windows path with backslashes, double-quote in metadata
        third = df.iloc[2]
        self.assertEqual(third["product"], "Item C")
        self.assertEqual(third["quantity"], 3)
        self.assertEqual(third["description"], "Windows path: C:\\Users\\data.txt")
        self.assertEqual(third["metadata"], 'Mix of "both" styles')
    finally:
        os.unlink(tmp_path)
362+
def test_csv_edge_cases_with_newlines_and_mixed_quotes(self):
    """Read a CSV with newlines inside quoted fields and mixed quote escapes.

    Row 1 carries a quoted field that spans three physical lines and ends
    with a backslash-escaped quoted word. Row 2 combines doubled quotes
    and backslash-escaped quotes in one field, next to a field with an
    embedded comma. The assertions expect two logical rows and expect the
    embedded newlines to be preserved as ``\\n``.
    """
    import os

    # No whitespace follows the newline or backslash escapes: the embedded
    # line breaks are part of the expected value, and a space after a
    # backslash would stop it from escaping the quote that follows.
    csv_content = (
        "id,text,value\n"
        '1,"Multi-line text:\nLine 1\nLine 2 with \\"quote\\"","Simple"\n'
        '2,"Text with ""double"" and \\"backslash\\" quotes","Complex, with comma"\n'
    )

    # newline="" disables newline translation on write, so the line breaks
    # inside the quoted field stay as "\n" on every platform and match the
    # expected value below. delete=False keeps the file available by path
    # after the handle is closed; it is removed in the finally block.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline=""
    ) as tmp:
        tmp.write(csv_content)
        tmp_path = tmp.name

    try:
        config = LocalConfig()
        reader = CSVDataFrameReader(config, None)

        result = reader._read(key=tmp_path, bucket_name="")

        chunks = list(result.dataframes())
        self.assertEqual(len(chunks), 1)

        df = chunks[0]
        self.assertEqual(df.shape, (2, 3))

        # Row 1: multi-line text with backslash-escaped quotes
        first = df.iloc[0]
        self.assertEqual(first["id"], 1)
        self.assertEqual(
            first["text"],
            'Multi-line text:\nLine 1\nLine 2 with "quote"',
        )
        self.assertEqual(first["value"], "Simple")

        # Row 2: both types of escaping in same field
        second = df.iloc[1]
        self.assertEqual(second["id"], 2)
        self.assertEqual(
            second["text"], 'Text with "double" and "backslash" quotes'
        )
        self.assertEqual(second["value"], "Complex, with comma")
    finally:
        os.unlink(tmp_path)
403+
262404
# Script entry point: hand control to unittest when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    unittest.main()
0 commit comments