Skip to content

Commit b79fbeb

Browse files
fix: handle the \" -> " case by using a backslash (\) as the escapechar (#25778)
1 parent: abcadb8 · commit: b79fbeb

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed

ingestion/src/metadata/readers/dataframe/dsv.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,8 @@ def chunk_generator():
140140
storage_options=storage_options,
141141
compression=compression,
142142
encoding_errors="ignore",
143+
escapechar="\\",
144+
engine="python",
143145
) as reader:
144146
for chunks in reader:
145147
chunks = self._fix_malformed_quoted_chunk(

ingestion/tests/unit/readers/test_dsv_reader.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
class TestDSVReader(unittest.TestCase):
4646
def test_csv_reader_local(self):
47+
"""Test basic CSV reading with standard format."""
4748
csv_content = "id,name,age\n1,Alice,25\n2,Bob,30\n"
4849

4950
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
@@ -69,6 +70,7 @@ def test_csv_reader_local(self):
6970
os.unlink(tmp_path)
7071

7172
def test_tsv_reader_local(self):
73+
"""Test basic TSV reading with tab separator."""
7274
tsv_content = "id\tname\tage\n1\tAlice\t25\n2\tBob\t30\n"
7375

7476
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as tmp:
@@ -93,6 +95,7 @@ def test_tsv_reader_local(self):
9395
os.unlink(tmp_path)
9496

9597
def test_csv_with_gzip_compression(self):
98+
"""Test CSV reading with gzip compression."""
9699
csv_content = "id,name\n1,Test\n"
97100

98101
with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as tmp:
@@ -142,6 +145,7 @@ def test_malformed_quoted_csv(self):
142145
os.unlink(tmp_path)
143146

144147
def test_custom_separator(self):
148+
"""Test CSV reading with custom separator."""
145149
custom_csv = "id;name;age\n1;Alice;25\n"
146150

147151
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
@@ -167,6 +171,7 @@ def test_custom_separator(self):
167171

168172
@patch("pandas.read_csv")
169173
def test_gcs_csv_reading(self, mock_read_csv):
174+
"""Test GCS CSV reading with mocked pandas."""
170175
mock_df = pd.DataFrame({"id": [1], "name": ["Test"]})
171176

172177
def mock_read_csv_impl(*args, **kwargs):
@@ -196,6 +201,7 @@ def __exit__(self, *args):
196201
@patch("pandas.read_csv")
197202
@patch("metadata.readers.dataframe.dsv.return_s3_storage_options")
198203
def test_s3_csv_reading(self, mock_storage_opts, mock_read_csv):
204+
"""Test S3 CSV reading with mocked pandas."""
199205
mock_storage_opts.return_value = {}
200206
mock_df = pd.DataFrame({"id": [1], "name": ["Test"]})
201207

@@ -229,6 +235,7 @@ def __exit__(self, *args):
229235
@patch("pandas.read_csv")
230236
@patch("metadata.readers.dataframe.dsv.return_azure_storage_options")
231237
def test_azure_csv_reading(self, mock_storage_opts, mock_read_csv):
238+
"""Test Azure CSV reading with mocked pandas."""
232239
mock_storage_opts.return_value = {"connection_string": "test"}
233240
mock_df = pd.DataFrame({"id": [1], "name": ["Test"]})
234241

@@ -259,6 +266,141 @@ def __exit__(self, *args):
259266
chunks = list(dataframes)
260267
self.assertEqual(len(chunks), 1)
261268

269+
def test_csv_standard_with_special_characters(self):
270+
"""Test standard CSV with commas in quoted fields, empty values, and special characters."""
271+
csv_content = 'id,name,address,notes\n1,"John Doe","123 Main St, City, State","Active customer"\n2,"Jane Smith",,"VIP status, priority"\n3,,,""\n'
272+
273+
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
274+
tmp.write(csv_content)
275+
tmp_path = tmp.name
276+
277+
try:
278+
config = LocalConfig()
279+
reader = CSVDataFrameReader(config, None)
280+
281+
result = reader._read(key=tmp_path, bucket_name="")
282+
283+
chunks = list(result.dataframes())
284+
self.assertEqual(len(chunks), 1)
285+
self.assertEqual(chunks[0].shape, (3, 4))
286+
287+
# Row 1: standard values with commas in quoted field
288+
self.assertEqual(chunks[0].iloc[0]["id"], 1)
289+
self.assertEqual(chunks[0].iloc[0]["name"], "John Doe")
290+
self.assertEqual(chunks[0].iloc[0]["address"], "123 Main St, City, State")
291+
self.assertEqual(chunks[0].iloc[0]["notes"], "Active customer")
292+
293+
# Row 2: empty address, comma in notes
294+
self.assertEqual(chunks[0].iloc[1]["id"], 2)
295+
self.assertEqual(chunks[0].iloc[1]["name"], "Jane Smith")
296+
self.assertTrue(pd.isna(chunks[0].iloc[1]["address"]))
297+
self.assertEqual(chunks[0].iloc[1]["notes"], "VIP status, priority")
298+
299+
# Row 3: mostly empty
300+
self.assertEqual(chunks[0].iloc[2]["id"], 3)
301+
self.assertTrue(pd.isna(chunks[0].iloc[2]["name"]))
302+
self.assertTrue(pd.isna(chunks[0].iloc[2]["address"]))
303+
self.assertTrue(pd.isna(chunks[0].iloc[2]["notes"]))
304+
finally:
305+
import os
306+
307+
os.unlink(tmp_path)
308+
309+
def test_csv_complex_escaping_backslash_and_double_quote(self):
310+
"""Test complex CSV with both backslash escaping (\") and double-quote escaping ("") in same file."""
311+
csv_content = (
312+
"product,quantity,description,metadata\n"
313+
'"Part A",5,"Interlocked Flexible Metal Conduit, Galvanized, 50mm dia. (2\\"), Normal","Stock: ""In warehouse"""\n'
314+
'"Component B",10,"Value with \\"quote\\" and, comma","Status: ""Active"" and \\"Ready\\""\n'
315+
'"Item C",3,"Windows path: C:\\\\Users\\\\data.txt","Mix of ""both"" styles"\n'
316+
)
317+
318+
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
319+
tmp.write(csv_content)
320+
tmp_path = tmp.name
321+
322+
try:
323+
config = LocalConfig()
324+
reader = CSVDataFrameReader(config, None)
325+
326+
result = reader._read(key=tmp_path, bucket_name="")
327+
328+
chunks = list(result.dataframes())
329+
self.assertEqual(len(chunks), 1)
330+
self.assertEqual(chunks[0].shape, (3, 4))
331+
332+
# Row 1: backslash-escaped quote in description, double-quote in metadata
333+
self.assertEqual(chunks[0].iloc[0]["product"], "Part A")
334+
self.assertEqual(chunks[0].iloc[0]["quantity"], 5)
335+
self.assertEqual(
336+
chunks[0].iloc[0]["description"],
337+
'Interlocked Flexible Metal Conduit, Galvanized, 50mm dia. (2"), Normal',
338+
)
339+
self.assertEqual(chunks[0].iloc[0]["metadata"], 'Stock: "In warehouse"')
340+
341+
# Row 2: both backslash and double-quote escaping in same fields
342+
self.assertEqual(chunks[0].iloc[1]["product"], "Component B")
343+
self.assertEqual(chunks[0].iloc[1]["quantity"], 10)
344+
self.assertEqual(
345+
chunks[0].iloc[1]["description"], 'Value with "quote" and, comma'
346+
)
347+
self.assertEqual(
348+
chunks[0].iloc[1]["metadata"], 'Status: "Active" and "Ready"'
349+
)
350+
351+
# Row 3: Windows path with backslashes, double-quote in metadata
352+
self.assertEqual(chunks[0].iloc[2]["product"], "Item C")
353+
self.assertEqual(chunks[0].iloc[2]["quantity"], 3)
354+
self.assertEqual(
355+
chunks[0].iloc[2]["description"], "Windows path: C:\\Users\\data.txt"
356+
)
357+
self.assertEqual(chunks[0].iloc[2]["metadata"], 'Mix of "both" styles')
358+
finally:
359+
import os
360+
361+
os.unlink(tmp_path)
362+
363+
def test_csv_edge_cases_with_newlines_and_mixed_quotes(self):
364+
"""Test edge cases with newlines in quoted fields and complex mixed escaping."""
365+
csv_content = (
366+
"id,text,value\n"
367+
'1,"Multi-line text:\nLine 1\nLine 2 with \\"quote\\"","Simple"\n'
368+
'2,"Text with ""double"" and \\"backslash\\" quotes","Complex, with comma"\n'
369+
)
370+
371+
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
372+
tmp.write(csv_content)
373+
tmp_path = tmp.name
374+
375+
try:
376+
config = LocalConfig()
377+
reader = CSVDataFrameReader(config, None)
378+
379+
result = reader._read(key=tmp_path, bucket_name="")
380+
381+
chunks = list(result.dataframes())
382+
self.assertEqual(len(chunks), 1)
383+
self.assertEqual(chunks[0].shape, (2, 3))
384+
385+
# Row 1: multi-line text with backslash-escaped quotes
386+
self.assertEqual(chunks[0].iloc[0]["id"], 1)
387+
self.assertEqual(
388+
chunks[0].iloc[0]["text"],
389+
'Multi-line text:\nLine 1\nLine 2 with "quote"',
390+
)
391+
self.assertEqual(chunks[0].iloc[0]["value"], "Simple")
392+
393+
# Row 2: both types of escaping in same field
394+
self.assertEqual(chunks[0].iloc[1]["id"], 2)
395+
self.assertEqual(
396+
chunks[0].iloc[1]["text"], 'Text with "double" and "backslash" quotes'
397+
)
398+
self.assertEqual(chunks[0].iloc[1]["value"], "Complex, with comma")
399+
finally:
400+
import os
401+
402+
os.unlink(tmp_path)
403+
262404

263405
if __name__ == "__main__":
264406
unittest.main()

0 commit comments

Comments (0)