Skip to content

Commit 80be530

Browse files
committed
Do not iterate over all ASCII characters for every line
1 parent b36d23f commit 80be530

File tree

1 file changed

+16
-10
lines changed

1 file changed

+16
-10
lines changed

Lib/csv.py

Lines changed: 16 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ class excel:
7171
QUOTE_STRINGS, QUOTE_NOTNULL
7272
from _csv import Dialect as _Dialect
7373

74+
from collections import defaultdict, Counter
7475
from io import StringIO
7576

7677
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
@@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters):
367368

368369
data = list(filter(None, data.split('\n')))
369370

370-
ascii = [chr(c) for c in range(127)] # 7-bit ASCII
371+
ascii = {chr(c) for c in range(127)} # 7-bit ASCII
371372

372373
# build frequency tables
373374
chunkLength = min(10, len(data))
374375
iteration = 0
375-
charFrequency = {}
376+
# {char -> {count_per_line -> num_lines_with_that_count}}
377+
charFrequency = defaultdict(Counter)
376378
modes = {}
377379
delims = {}
378380
start, end = 0, chunkLength
379381
while start < len(data):
380382
iteration += 1
381-
for line in data[start:end]:
382-
for char in ascii:
383-
metaFrequency = charFrequency.get(char, {})
384-
# must count even if frequency is 0
385-
freq = line.count(char)
386-
# value is the mode
387-
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
388-
charFrequency[char] = metaFrequency
383+
chunk = data[start:end]
384+
candidate_chars = set("".join(chunk))
385+
candidate_chars.intersection_update(ascii)
386+
for line in chunk:
387+
for char in candidate_chars:
388+
count = line.count(char)
389+
charFrequency[char][count] += 1
390+
391+
missing_chars = ascii.difference(candidate_chars)
392+
chunk_len = len(chunk)
393+
for char in missing_chars:
394+
charFrequency[char][0] += chunk_len
389395

390396
for char in charFrequency.keys():
391397
items = list(charFrequency[char].items())

0 commit comments

Comments (0)