Skip to content

Commit 6be6f8f

Browse files
maurycy and picnixz authored
gh-137627: Make csv.Sniffer.sniff() delimiter detection 1.6x faster (#137628)
Co-authored-by: Bénédikt Tran <[email protected]>
1 parent aa9d0a6 commit 6be6f8f

File tree

4 files changed

+70
-17
lines changed

4 files changed

+70
-17
lines changed

Doc/whatsnew/3.15.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -652,11 +652,11 @@ zlib
652652
Optimizations
653653
=============
654654

655-
module_name
656-
-----------
657-
658-
* TODO
655+
csv
656+
---
659657

658+
* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster.
659+
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
660660

661661

662662
Removed

Lib/csv.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
362362
try and evaluate the smallest portion of the data possible, evaluating
363363
additional chunks as necessary.
364364
"""
365+
from collections import Counter, defaultdict
365366

366367
data = list(filter(None, data.split('\n')))
367368

368-
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
369-
370369
# build frequency tables
371370
chunkLength = min(10, len(data))
372371
iteration = 0
373-
charFrequency = {}
372+
num_lines = 0
373+
# {char -> {count_per_line -> num_lines_with_that_count}}
374+
char_frequency = defaultdict(Counter)
374375
modes = {}
375376
delims = {}
376377
start, end = 0, chunkLength
377378
while start < len(data):
378379
iteration += 1
379380
for line in data[start:end]:
380-
for char in ascii:
381-
metaFrequency = charFrequency.get(char, {})
382-
# must count even if frequency is 0
383-
freq = line.count(char)
384-
# value is the mode
385-
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
386-
charFrequency[char] = metaFrequency
387-
388-
for char in charFrequency.keys():
389-
items = list(charFrequency[char].items())
381+
num_lines += 1
382+
for char, count in Counter(line).items():
383+
if char.isascii():
384+
char_frequency[char][count] += 1
385+
386+
for char, counts in char_frequency.items():
387+
items = list(counts.items())
388+
missed_lines = num_lines - sum(counts.values())
389+
if missed_lines:
390+
# Store the number of lines 'char' was missing from.
391+
items.append((0, missed_lines))
390392
if len(items) == 1 and items[0][0] == 0:
391393
continue
392394
# get the mode of the frequencies

Lib/test/test_csv.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,6 +1437,56 @@ def test_doublequote(self):
14371437
dialect = sniffer.sniff(self.sample9)
14381438
self.assertTrue(dialect.doublequote)
14391439

1440+
def test_guess_delimiter_crlf_not_chosen(self):
1441+
# Ensure that we pick the real delimiter ("|") over "\r" in a tie.
1442+
sniffer = csv.Sniffer()
1443+
sample = "a|b\r\nc|d\r\ne|f\r\n"
1444+
self.assertEqual(sniffer.sniff(sample).delimiter, "|")
1445+
self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r")
1446+
1447+
def test_zero_mode_tie_order_independence(self):
1448+
sniffer = csv.Sniffer()
1449+
# ":" appears in half the rows (1, 0, 1, 0) - a tie between
1450+
# 0 and 1 per line.
1451+
# "," appears once every row (true delimiter).
1452+
#
1453+
# Even if the zero-frequency bucket is appended vs. inserted, the tie
1454+
# yields an adjusted score of 0, so ":" should not be promoted and
1455+
# "," must be selected.
1456+
sample = (
1457+
"a,b:c\n"
1458+
"d,e\n"
1459+
"f,g:c\n"
1460+
"h,i\n"
1461+
)
1462+
dialect = sniffer.sniff(sample)
1463+
self.assertEqual(dialect.delimiter, ",")
1464+
1465+
def test_zero_mode_tie_order_comma_first(self):
1466+
sniffer = csv.Sniffer()
1467+
pattern = (
1468+
"a,b\n"
1469+
"c:d\n"
1470+
"e,f\n"
1471+
"g:h\n"
1472+
)
1473+
sample = pattern * 10
1474+
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
1475+
sniffer.sniff(sample)
1476+
1477+
def test_zero_mode_tie_order_colon_first(self):
1478+
sniffer = csv.Sniffer()
1479+
pattern = (
1480+
"a:b\n"
1481+
"c,d\n"
1482+
"e:f\n"
1483+
"g,h\n"
1484+
)
1485+
sample = pattern * 10
1486+
with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
1487+
sniffer.sniff(sample)
1488+
1489+
14401490
class NUL:
14411491
def write(s, *args):
14421492
pass
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.

0 commit comments

Comments
 (0)