diff --git a/Lib/csv.py b/Lib/csv.py
index 0a627ba7a512fa..1ca893b384fca5 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -265,5 +265,6 @@ class dialect(Dialect):
         # _csv.reader won't accept a quotechar of ''
         dialect.quotechar = quotechar or '"'
         dialect.skipinitialspace = skipinitialspace
+        dialect.quoting = self._guess_quoting(sample, dialect)
 
         return dialect
@@ -448,6 +449,76 @@ def _guess_delimiter(self, data, delimiters):
                             data[0].count("%c " % delim))
         return (delim, skipinitialspace)
 
+    def _guess_quoting(self, data, dialect):
+        """
+        Guess the quoting policy that was used to write the sample.
+
+        Each of the first five rows is classified by which fields are
+        quoted and by what the unquoted fields hold, and the highest
+        ranking classification (see _determine_quote_priority()) wins.
+        Rows are split naively on the delimiter, so this is a heuristic.
+        """
+        lines = data.split(dialect.lineterminator)
+        if len(lines) == 1:
+            lines = data.split("\n")
+        policies = []
+        for line in lines[:5]:
+            num_quoted = 0
+            num_numeric = 0
+            num_empty = 0
+            num_text = 0
+            for elem in line.split(dialect.delimiter):
+                if dialect.skipinitialspace:
+                    elem = elem.lstrip(" ")
+                if not elem:
+                    # The writer emits None as an empty, unquoted field.
+                    num_empty += 1
+                elif elem[0] == dialect.quotechar:
+                    # What is inside the quotes does not matter: a quoted
+                    # field is consistent with every policy but QUOTE_NONE.
+                    num_quoted += 1
+                else:
+                    try:
+                        float(elem)
+                    except ValueError:
+                        num_text += 1
+                    else:
+                        num_numeric += 1
+            if not num_quoted:
+                # No quotes at all: the row is no evidence for quoting.
+                policy = QUOTE_NONE
+            elif num_text:
+                policy = QUOTE_MINIMAL
+            elif num_numeric and num_empty:
+                policy = QUOTE_STRINGS
+            elif num_numeric:
+                policy = QUOTE_NONNUMERIC
+            elif num_empty:
+                policy = QUOTE_NOTNULL
+            else:
+                policy = QUOTE_ALL
+            policies.append(policy)
+        return self._determine_quote_priority(policies)
+
+    def _determine_quote_priority(self, quote_policies):
+        """
+        Return the highest ranking policy found in quote_policies.
+
+        Values that are not QUOTE_* constants rank like QUOTE_NONE,
+        which is also the result for an empty iterable.
+        """
+        priority_order = {
+            QUOTE_ALL: 50,
+            QUOTE_NOTNULL: 40,
+            QUOTE_NONNUMERIC: 30,
+            QUOTE_STRINGS: 20,
+            QUOTE_MINIMAL: 10,
+            QUOTE_NONE: 0,
+        }
+        reverse_priority_order = {v: k for k, v in priority_order.items()}
+        best = max((priority_order.get(policy, 0)
+                    for policy in quote_policies), default=0)
+        return reverse_priority_order[best]
 
     def has_header(self, sample):
         # Creates a dictionary of types of data in each column. If any
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 9aace57633b0c6..68f73208a0c0b8 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -1342,6 +1342,14 @@ def test_has_header_strings(self):
         self.assertFalse(sniffer.has_header(self.sample10))
         self.assertFalse(sniffer.has_header(self.sample11))
 
+    def test_issue129374_writing_quoted_strings(self):
+        s = '42,"hello","world",-1\n'
+        d = csv.Sniffer().sniff(s)
+        x = list(csv.reader([s], d))
+        f = StringIO()
+        csv.writer(f, d).writerows(x)
+        self.assertEqual(f.getvalue(), '42.0,"hello","world",-1.0\r\n')
+
     def test_has_header(self):
         sniffer = csv.Sniffer()
         self.assertIs(sniffer.has_header(self.sample1), False)
@@ -1415,6 +1423,23 @@ def test_doublequote(self):
         dialect = sniffer.sniff(self.sample9)
         self.assertTrue(dialect.doublequote)
 
+    def test_quote_detection(self):
+        sniffer = csv.Sniffer()
+        dialect = sniffer.sniff(self.sample1)
+        self.assertEqual(dialect.quoting, csv.QUOTE_NONE)
+        dialect = sniffer.sniff(self.sample2)
+        self.assertEqual(dialect.quoting, csv.QUOTE_ALL)
+
+    def test_quote_detection_empty_fields(self):
+        guess = csv.Sniffer()._guess_quoting
+        # An empty quoted field must not break the detection.
+        self.assertEqual(guess('"a","",1\r\n', csv.excel),
+                         csv.QUOTE_NONNUMERIC)
+        # None is written as an empty, unquoted field.
+        self.assertEqual(guess('"a",,"b"\r\n', csv.excel),
+                         csv.QUOTE_NOTNULL)
+
+
 class NUL:
     def write(s, *args):
         pass
diff --git a/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst b/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst
new file mode 100644
index 00000000000000..1274425e95c1d9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-21-12-57-29.gh-issue-129374.nVYHmg.rst
@@ -0,0 +1 @@
+:meth:`csv.Sniffer.sniff` now also detects the quoting policy of the sample.