@@ -71,6 +71,7 @@ class excel:
71
71
QUOTE_STRINGS , QUOTE_NOTNULL
72
72
from _csv import Dialect as _Dialect
73
73
74
+ from collections import defaultdict , Counter
74
75
from io import StringIO
75
76
76
77
__all__ = ["QUOTE_MINIMAL" , "QUOTE_ALL" , "QUOTE_NONNUMERIC" , "QUOTE_NONE" ,
@@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters):
367
368
368
369
data = list (filter (None , data .split ('\n ' )))
369
370
370
- ascii = [ chr (c ) for c in range (127 )] # 7-bit ASCII
371
+ ascii = { chr (c ) for c in range (127 )} # 7-bit ASCII
371
372
372
373
# build frequency tables
373
374
chunkLength = min (10 , len (data ))
374
375
iteration = 0
375
- charFrequency = {}
376
+ # {char -> {count_per_line -> num_lines_with_that_count}}
377
+ charFrequency = defaultdict (Counter )
376
378
modes = {}
377
379
delims = {}
378
380
start , end = 0 , chunkLength
379
381
while start < len (data ):
380
382
iteration += 1
381
- for line in data [start :end ]:
382
- for char in ascii :
383
- metaFrequency = charFrequency .get (char , {})
384
- # must count even if frequency is 0
385
- freq = line .count (char )
386
- # value is the mode
387
- metaFrequency [freq ] = metaFrequency .get (freq , 0 ) + 1
388
- charFrequency [char ] = metaFrequency
383
+ chunk = data [start :end ]
384
+ candidate_chars = set ("" .join (chunk ))
385
+ candidate_chars .intersection_update (ascii )
386
+ for line in chunk :
387
+ for char in candidate_chars :
388
+ count = line .count (char )
389
+ charFrequency [char ][count ] += 1
390
+
391
+ missing_chars = ascii .difference (candidate_chars )
392
+ chunk_len = len (chunk )
393
+ for char in missing_chars :
394
+ charFrequency [char ][0 ] += chunk_len
389
395
390
396
for char in charFrequency .keys ():
391
397
items = list (charFrequency [char ].items ())
0 commit comments