diff --git a/Lib/csv.py b/Lib/csv.py
index cd202659873811..2575c03f90d0cc 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -233,7 +233,7 @@ def writerows(self, rowdicts):
 
 class Sniffer:
     '''
-    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
+    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar, lineterminator)
     Returns a Dialect object.
     '''
     def __init__(self):
@@ -246,18 +246,19 @@ def sniff(self, sample, delimiters=None):
         Returns a dialect (or None) corresponding to the sample
         """
 
+        lineterminator = self._guess_lineterminator(sample)
         quotechar, doublequote, delimiter, skipinitialspace = \
                    self._guess_quote_and_delimiter(sample, delimiters)
         if not delimiter:
             delimiter, skipinitialspace = self._guess_delimiter(sample,
-                                                                delimiters)
+                                                                delimiters,
+                                                                lineterminator)
 
         if not delimiter:
             raise Error("Could not determine delimiter")
 
         class dialect(Dialect):
             _name = "sniffed"
-            lineterminator = '\r\n'
             quoting = QUOTE_MINIMAL
             # escapechar = ''
 
@@ -266,6 +267,7 @@ class dialect(Dialect):
         # _csv.reader won't accept a quotechar of ''
         dialect.quotechar = quotechar or '"'
         dialect.skipinitialspace = skipinitialspace
+        dialect.lineterminator = lineterminator
 
         return dialect
 
@@ -346,7 +348,7 @@ def _guess_quote_and_delimiter(self, data, delimiters):
         return (quotechar, doublequote, delim, skipinitialspace)
 
 
-    def _guess_delimiter(self, data, delimiters):
+    def _guess_delimiter(self, data, delimiters, lineterminator='\n'):
         """
         The delimiter /should/ occur the same number of times on
         each row. However, due to malformed data, it may not. We don't want
@@ -365,7 +367,7 @@ def _guess_delimiter(self, data, delimiters):
         additional chunks as necessary.
         """
 
-        data = list(filter(None, data.split('\n')))
+        data = list(filter(None, data.split(lineterminator)))
 
         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
 
@@ -511,3 +513,19 @@ def has_header(self, sample):
                     hasHeader -= 1
 
         return hasHeader > 0
+
+
+    def _guess_lineterminator(self, sample):
+        """
+        Returns the line terminator found in the sample (CRLF, LF or CR,
+        tested in that order), or os.linesep if the sample contains none.
+        """
+        # CRLF must be tested first: a CRLF sample also contains LF and CR
+        terminators = ('\r\n', '\n', '\r')
+
+        for terminator in terminators:
+            if terminator in sample:
+                return terminator
+
+        from os import linesep
+        return linesep
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index ce5c03659f1979..7ab37b6bbec0cb 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -1414,6 +1414,21 @@ def test_doublequote(self):
         dialect = sniffer.sniff(self.sample9)
         self.assertTrue(dialect.doublequote)
 
+    def test_guess_lineterminator(self):
+        from os import linesep
+        sniffer = csv.Sniffer()
+        dialect = sniffer.sniff('Date;Value\r\n2010-01-01;10')
+        self.assertEqual(dialect.lineterminator, '\r\n')
+        dialect = sniffer.sniff('Date;Value\n2010-01-01;10')
+        self.assertEqual(dialect.lineterminator, '\n')
+        dialect = sniffer.sniff('Date;Value\r2010-01-01;10')
+        self.assertEqual(dialect.lineterminator, '\r')
+        dialect = sniffer.sniff('Date;Value\v2010-01-01;10')
+        self.assertEqual(dialect.lineterminator, linesep)
+        dialect = sniffer.sniff('Date;Value')
+        self.assertEqual(dialect.lineterminator, linesep)
+
+
 class NUL:
     def write(s, *args):
         pass
diff --git a/Misc/NEWS.d/next/Library/2018-02-12-05-10-28.bpo-30825.t0RXql.rst b/Misc/NEWS.d/next/Library/2018-02-12-05-10-28.bpo-30825.t0RXql.rst
new file mode 100644
index 00000000000000..f59df8a8df41e7
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-02-12-05-10-28.bpo-30825.t0RXql.rst
@@ -0,0 +1,2 @@
+:class:`csv.Sniffer` now detects the line terminator of the sample instead of
+defaulting to ``'\r\n'``.