Skip to content

Commit 8cb069f

Browse files
committed
Adds csv sniffer ability to guess a quoting policy
1 parent 1b90e54 commit 8cb069f

File tree

2 files changed

+70
-1
lines changed

2 files changed

+70
-1
lines changed

Lib/csv.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ class dialect(Dialect):
265265
# _csv.reader won't accept a quotechar of ''
266266
dialect.quotechar = quotechar or '"'
267267
dialect.skipinitialspace = skipinitialspace
268-
268+
dialect.quoting = self._guess_quoting(sample, dialect)
269269
return dialect
270270

271271

@@ -448,6 +448,66 @@ def _guess_delimiter(self, data, delimiters):
448448
data[0].count("%c " % delim))
449449
return (delim, skipinitialspace)
450450

451+
def _guess_quoting(self, data, dialect):
    """
    Guess which quoting policy produced the sample in *data*.

    Each of the first five lines is split on ``dialect.delimiter``.  A
    field counts as quoted when it starts with ``dialect.quotechar``;
    unquoted fields are further sorted into numeric fields (anything
    ``float()`` accepts) and literal ``None`` fields.  The strictest
    policy consistent with those counts is recorded for the line, and
    the per-line results are combined by ``_determine_quote_priority``.

    The split is purely textual, so a delimiter embedded inside a quoted
    field is still treated as a field boundary.
    """
    lines = data.split(dialect.lineterminator)
    if len(lines) == 1:
        # The sample does not contain the dialect's line terminator;
        # fall back to bare newlines.
        lines = data.split("\n")

    policies = []
    for line in lines[:5]:
        elems = line.split(dialect.delimiter)
        num_quoted_elems = 0
        num_float_elems = 0
        num_none_elems = 0
        for elem in elems:
            if not elem:
                # Empty fields are counted in len(elems) only.
                continue
            if elem[0] == dialect.quotechar:
                # A quoted field tells us nothing about which values
                # the writer left bare, so it is never inspected as a
                # number or as None.  This also avoids indexing into
                # an empty string for a field such as "".
                num_quoted_elems += 1
                continue
            if elem == "None":
                num_none_elems += 1
                continue
            try:
                float(elem)
            except ValueError:
                continue
            num_float_elems += 1

        num_elems = len(elems)
        if num_quoted_elems == 0:
            # Without a single quoted field the line gives no evidence
            # of quoting.  Reporting QUOTE_NONNUMERIC here would make a
            # reader try to convert unquoted text to float.
            max_quote_policy = QUOTE_NONE
        elif num_quoted_elems == num_elems:
            max_quote_policy = QUOTE_ALL
        elif num_quoted_elems == num_elems - num_none_elems:
            max_quote_policy = QUOTE_NOTNULL
        elif num_quoted_elems == num_elems - num_float_elems:
            max_quote_policy = QUOTE_NONNUMERIC
        elif num_quoted_elems == num_elems - num_none_elems - num_float_elems:
            max_quote_policy = QUOTE_STRINGS
        else:
            max_quote_policy = QUOTE_MINIMAL
        policies.append(max_quote_policy)
    return self._determine_quote_priority(policies)
498+
499+
def _determine_quote_priority(self, quote_policies):
500+
priority_order = {
501+
QUOTE_ALL: 50,
502+
QUOTE_NOTNULL: 40,
503+
QUOTE_NONNUMERIC: 30,
504+
QUOTE_STRINGS: 20,
505+
QUOTE_MINIMAL: 10,
506+
QUOTE_NONE: 0
507+
}
508+
reverse_priority_order = {v: k for k, v in priority_order.items()}
509+
max_priority = map(lambda e:priority_order.get(e, 0), quote_policies)
510+
return reverse_priority_order[max(max_priority)]
451511

452512
def has_header(self, sample):
453513
# Creates a dictionary of types of data in each column. If any

Lib/test/test_csv.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,6 +1423,15 @@ def test_doublequote(self):
14231423
dialect = sniffer.sniff(self.sample9)
14241424
self.assertTrue(dialect.doublequote)
14251425

1426+
def test_quote_detection(self):
    sniffer = csv.Sniffer()
    # The sniffer should report no quoting for sample1 ...
    dialect = sniffer.sniff(self.sample1)
    self.assertEqual(dialect.quoting, csv.QUOTE_NONE)
    # ... and full quoting for sample2.
    dialect = sniffer.sniff(self.sample2)
    self.assertEqual(dialect.quoting, csv.QUOTE_ALL)
1433+
1434+
14261435
class NUL:
14271436
def write(s, *args):
14281437
pass

0 commit comments

Comments
 (0)