From 1b90e5434298bde28a07d73714e44c8a3dad4d09 Mon Sep 17 00:00:00 2001
From: Andy Terrel
Date: Mon, 19 May 2025 16:39:10 -0400
Subject: [PATCH 1/4] Adds failing test

---
 Lib/test/test_csv.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 9aace57633b0c6..81a167a908d930 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -1342,6 +1342,14 @@ def test_has_header_strings(self):
         self.assertFalse(sniffer.has_header(self.sample10))
         self.assertFalse(sniffer.has_header(self.sample11))
 
+    def test_issue129374_writing_quoted_strings(self):
+        s = '42,"hello","world",-1\n'
+        d = csv.Sniffer().sniff(s)
+        x = list(csv.reader([s], d))
+        f = StringIO()
+        csv.writer(f,d).writerows(x)
+        self.assertEqual(f.getvalue(), '42,"hello", "world",-1\r\n')
+
     def test_has_header(self):
         sniffer = csv.Sniffer()
         self.assertIs(sniffer.has_header(self.sample1), False)

From 8cb069f0e747354dd61db4fe7660d4bd73402d90 Mon Sep 17 00:00:00 2001
From: Andy Terrel
Date: Tue, 20 May 2025 14:46:11 -0400
Subject: [PATCH 2/4] Adds csv sniffer ability to guess a quoting policy

---
 Lib/csv.py           | 82 +++++++++++++++++++++++++++++++++++++++++++-
 Lib/test/test_csv.py | 23 ++++++++++++
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index 0a627ba7a512fa..1ca893b384fca5 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -265,7 +265,7 @@ class dialect(Dialect):
         # _csv.reader won't accept a quotechar of ''
         dialect.quotechar = quotechar or '"'
         dialect.skipinitialspace = skipinitialspace
-
+        dialect.quoting = self._guess_quoting(sample, dialect)
         return dialect
 
 
@@ -448,6 +448,86 @@ def _guess_delimiter(self, data, delimiters):
                                 data[0].count("%c " % delim))
         return (delim, skipinitialspace)
 
+    def _guess_quoting(self, data, dialect):
+        """
+        Guess the quoting policy used in *data*.
+
+        The first five non-blank rows are split into raw fields with the
+        delimiter and quotechar of *dialect*.  The result is the strictest
+        QUOTE_* constant consistent with every sampled field: QUOTE_NONE
+        when nothing is quoted, QUOTE_MINIMAL when quoted fields are mixed
+        with unquoted text.
+        """
+        # Line breaks inside quoted fields are not recognised here; at worst
+        # that makes the guess less strict.
+        lines = data.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+        rows = [line for line in lines if line.strip()][:5]
+        # Policies not yet ruled out by an unquoted field.
+        candidates = {QUOTE_ALL, QUOTE_NOTNULL, QUOTE_NONNUMERIC,
+                      QUOTE_STRINGS}
+        seen_quoted = False
+        for row in rows:
+            for field in self._split_raw_fields(row, dialect):
+                if dialect.skipinitialspace:
+                    field = field.lstrip(' ')
+                if field.startswith(dialect.quotechar):
+                    seen_quoted = True
+                elif not field:
+                    # None is written as an empty unquoted field.
+                    candidates -= {QUOTE_ALL, QUOTE_NONNUMERIC}
+                else:
+                    try:
+                        float(field)
+                    except ValueError:
+                        # Unquoted text rules out every strict policy.
+                        candidates.clear()
+                    else:
+                        candidates -= {QUOTE_ALL, QUOTE_NOTNULL}
+        if not seen_quoted:
+            return QUOTE_NONE
+        return self._determine_quote_priority(candidates or [QUOTE_MINIMAL])
+
+    def _split_raw_fields(self, line, dialect):
+        """
+        Split *line* on the delimiter, leaving quote characters in place.
+
+        A delimiter inside a quoted field does not split it.  A field is
+        quoted only if the quotechar is its first character, optionally
+        preceded by spaces.
+        """
+        fields = []
+        start = 0
+        quoted = in_quotes = False
+        for pos, char in enumerate(line):
+            if char == dialect.quotechar:
+                if quoted or not line[start:pos].strip(' '):
+                    quoted = True
+                    in_quotes = not in_quotes
+            elif char == dialect.delimiter and not in_quotes:
+                fields.append(line[start:pos])
+                start = pos + 1
+                quoted = False
+        fields.append(line[start:])
+        return fields
+
+    def _determine_quote_priority(self, quote_policies):
+        """
+        Return the strictest quoting policy found in *quote_policies*.
+
+        Unknown values rank like QUOTE_NONE, which is also returned for an
+        empty iterable.
+        """
+        priority_order = {
+            QUOTE_ALL: 50,
+            QUOTE_NOTNULL: 40,
+            QUOTE_NONNUMERIC: 30,
+            QUOTE_STRINGS: 20,
+            QUOTE_MINIMAL: 10,
+            QUOTE_NONE: 0,
+        }
+        strictest = max(quote_policies, default=QUOTE_NONE,
+                        key=lambda policy: priority_order.get(policy, 0))
+        return strictest if strictest in priority_order else QUOTE_NONE
 
     def has_header(self, sample):
         # Creates a dictionary of types of data in each column. If any
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 81a167a908d930..071245a4ac56a4 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -1423,6 +1423,29 @@ def test_doublequote(self):
         dialect = sniffer.sniff(self.sample9)
         self.assertTrue(dialect.doublequote)
 
+    def test_quote_detection(self):
+        sniffer = csv.Sniffer()
+        dialect = sniffer.sniff(self.sample1)
+        self.assertEqual(dialect.quoting, csv.QUOTE_NONE)
+        breakpoint()
+        dialect = sniffer.sniff(self.sample2)
+        self.assertEqual(dialect.quoting, csv.QUOTE_ALL)
+
+    def test_guess_quoting_edge_cases(self):
+        guess = csv.Sniffer()._guess_quoting
+        cases = [
+            ('a,b\n1,2\n', csv.QUOTE_NONE),
+            ('"a","","b"\n', csv.QUOTE_ALL),
+            ('"a",,"b"\n', csv.QUOTE_NOTNULL),
+            ('"a, b",1.5\n', csv.QUOTE_NONNUMERIC),
+            ('"a",,2\n', csv.QUOTE_STRINGS),
+            ('name,age\n"bob",3\n', csv.QUOTE_MINIMAL),
+        ]
+        for sample, expected in cases:
+            with self.subTest(sample=sample):
+                self.assertEqual(guess(sample, csv.excel), expected)
+
+
 class NUL:
     def write(s, *args):
         pass

From bf12351f2eb3d42c65e2a701944d628cc032b6b8 Mon Sep 17 00:00:00 2001
From: Andy Terrel
Date: Wed, 21 May 2025 08:45:56 -0400
Subject: [PATCH 3/4] Fix test to reflect number output

---
 Lib/test/test_csv.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 071245a4ac56a4..68f73208a0c0b8 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -1348,7 +1348,7 @@ def test_issue129374_writing_quoted_strings(self):
         x = list(csv.reader([s], d))
         f = StringIO()
         csv.writer(f,d).writerows(x)
-        self.assertEqual(f.getvalue(), '42,"hello", "world",-1\r\n')
+        self.assertEqual(f.getvalue(), '42.0,"hello","world",-1.0\r\n')
 
     def test_has_header(self):
         sniffer = csv.Sniffer()
@@ -1427,7 +1427,6 @@ def test_quote_detection(self):
         sniffer = csv.Sniffer()
         dialect = sniffer.sniff(self.sample1)
         self.assertEqual(dialect.quoting, csv.QUOTE_NONE)
-        breakpoint()
         dialect = sniffer.sniff(self.sample2)
         self.assertEqual(dialect.quoting, csv.QUOTE_ALL)
 

From db4b4453ab5876ff10d4d86b9231b99b71694081 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Wed, 21 May 2025 12:57:30 +0000
Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst

diff --git a/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst b/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst
new file mode 100644
index 00000000000000..1274425e95c1d9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst
@@ -0,0 +1 @@
+:class:`csv.Sniffer` now guesses the quoting policy of the sample and sets it as the ``quoting`` attribute of the returned dialect.