@@ -84,8 +84,6 @@ class excel:
8484__version__ = "1.0"
8585
8686
87- _ASCII_CHARS = frozenset (map (chr , range (127 ))) # 7-bit ASCII
88-
8987class Dialect :
9088 """Describe a CSV dialect.
9189
@@ -373,29 +371,28 @@ def _guess_delimiter(self, data, delimiters):
373371 # build frequency tables
374372 chunkLength = min (10 , len (data ))
375373 iteration = 0
374+ seen = 0
376375 # {char -> {count_per_line -> num_lines_with_that_count}}
377376 charFrequency = defaultdict (Counter )
378377 modes = {}
379378 delims = {}
380379 start , end = 0 , chunkLength
381380 while start < len (data ):
382381 iteration += 1
383- chunk = data [start :end ]
384- candidate_chars = set ().union (* chunk )
385- candidate_chars &= _ASCII_CHARS
386- for line in chunk :
387- for char in candidate_chars :
388- count = line .count (char )
389- charFrequency [char ][count ] += 1
390-
391- # must count even if frequency is 0
392- missing_chars = _ASCII_CHARS - candidate_chars
393- chunk_len = len (chunk )
394- for char in missing_chars :
395- charFrequency [char ][0 ] += chunk_len
396-
397- for char in charFrequency .keys ():
398- items = list (charFrequency [char ].items ())
382+ for line in data [start :end ]:
383+ seen += 1
384+ charCounts = Counter (line )
385+ for char , count in charCounts .items ():
386+ if ord (char ) < 127 :
387+ charFrequency [char ][count ] += 1
388+
389+ for char , counts in charFrequency .items ():
390+ presentCount = sum (counts .values ())
391+ zeroCount = seen - presentCount
392+ if zeroCount > 0 :
393+ items = list (counts .items ()) + [(0 , zeroCount )]
394+ else :
395+ items = list (counts .items ())
399396 if len (items ) == 1 and items [0 ][0 ] == 0 :
400397 continue
401398 # get the mode of the frequencies
0 commit comments