diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 43c40e4d0f3154..45dd085b47a0c3 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -423,11 +423,11 @@ zlib Optimizations ============= -module_name ------------ - -* TODO +csv +--- +* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster. + (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) Deprecated diff --git a/Lib/csv.py b/Lib/csv.py index 0a627ba7a512fa..e60103c5f33f5a 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -364,31 +364,33 @@ def _guess_delimiter(self, data, delimiters): try and evaluate the smallest portion of the data possible, evaluating additional chunks as necessary. """ + from collections import Counter, defaultdict data = list(filter(None, data.split('\n'))) - ascii = [chr(c) for c in range(127)] # 7-bit ASCII - # build frequency tables chunkLength = min(10, len(data)) iteration = 0 - charFrequency = {} + num_lines = 0 + # {char -> {count_per_line -> num_lines_with_that_count}} + char_frequency = defaultdict(Counter) modes = {} delims = {} start, end = 0, chunkLength while start < len(data): iteration += 1 for line in data[start:end]: - for char in ascii: - metaFrequency = charFrequency.get(char, {}) - # must count even if frequency is 0 - freq = line.count(char) - # value is the mode - metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 - charFrequency[char] = metaFrequency - - for char in charFrequency.keys(): - items = list(charFrequency[char].items()) + num_lines += 1 + for char, count in Counter(line).items(): + if char.isascii(): + char_frequency[char][count] += 1 + + for char, counts in char_frequency.items(): + items = list(counts.items()) + missed_lines = num_lines - sum(counts.values()) + if missed_lines: + # Store the number of lines 'char' was missing from. 
+ items.append((0, missed_lines)) if len(items) == 1 and items[0][0] == 0: continue # get the mode of the frequencies diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 60feab225a107c..15d10a311730d3 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1437,6 +1437,56 @@ def test_doublequote(self): dialect = sniffer.sniff(self.sample9) self.assertTrue(dialect.doublequote) + def test_guess_delimiter_crlf_not_chosen(self): + # Ensure that we pick the real delimiter ("|") over "\r" in a tie. + sniffer = csv.Sniffer() + sample = "a|b\r\nc|d\r\ne|f\r\n" + self.assertEqual(sniffer.sniff(sample).delimiter, "|") + self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r") + + def test_zero_mode_tie_order_independence(self): + sniffer = csv.Sniffer() + # ":" appears in half the rows (1, 0, 1, 0) - a tie between + # 0 and 1 per line. + # "," appears once in every row (true delimiter). + # + # Whether the zero-frequency bucket is appended or inserted, the tie + # yields an adjusted score of 0, so ":" should not be promoted and + # "," must be selected. 
+ sample = ( + "a,b:c\n" + "d,e\n" + "f,g:c\n" + "h,i\n" + ) + dialect = sniffer.sniff(sample) + self.assertEqual(dialect.delimiter, ",") + + def test_zero_mode_tie_order_comma_first(self): + sniffer = csv.Sniffer() + pattern = ( + "a,b\n" + "c:d\n" + "e,f\n" + "g:h\n" + ) + sample = pattern * 10 + with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"): + sniffer.sniff(sample) + + def test_zero_mode_tie_order_colon_first(self): + sniffer = csv.Sniffer() + pattern = ( + "a:b\n" + "c,d\n" + "e:f\n" + "g,h\n" + ) + sample = pattern * 10 + with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"): + sniffer.sniff(sample) + + class NUL: def write(s, *args): pass diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst new file mode 100644 index 00000000000000..855070ed6f4511 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -0,0 +1 @@ +Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.