-
-
Notifications
You must be signed in to change notification settings - Fork 33.2k
bpo-30825: guess lineterminator in csv.Sniffer #2529
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
|
|
||
| from collections import OrderedDict | ||
| from io import StringIO | ||
| import os | ||
|
|
||
| __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", | ||
| "Error", "Dialect", "__doc__", "excel", "excel_tab", | ||
|
|
@@ -165,7 +166,7 @@ def writerows(self, rowdicts): | |
|
|
||
| class Sniffer: | ||
| ''' | ||
| "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) | ||
| "Sniffs" the format of a CSV file (i.e. delimiter, quotechar, lineterminator) | ||
| Returns a Dialect object. | ||
| ''' | ||
| def __init__(self): | ||
|
|
@@ -178,18 +179,19 @@ def sniff(self, sample, delimiters=None): | |
| Returns a dialect (or None) corresponding to the sample | ||
| """ | ||
|
|
||
| lineterminator = self._guess_lineterminator(sample) | ||
| quotechar, doublequote, delimiter, skipinitialspace = \ | ||
| self._guess_quote_and_delimiter(sample, delimiters) | ||
| if not delimiter: | ||
| delimiter, skipinitialspace = self._guess_delimiter(sample, | ||
| delimiters) | ||
| delimiters, | ||
| lineterminator) | ||
|
|
||
| if not delimiter: | ||
| raise Error("Could not determine delimiter") | ||
|
|
||
| class dialect(Dialect): | ||
| _name = "sniffed" | ||
| lineterminator = '\r\n' | ||
| quoting = QUOTE_MINIMAL | ||
| # escapechar = '' | ||
|
|
||
|
|
@@ -198,6 +200,7 @@ class dialect(Dialect): | |
| # _csv.reader won't accept a quotechar of '' | ||
| dialect.quotechar = quotechar or '"' | ||
| dialect.skipinitialspace = skipinitialspace | ||
| dialect.lineterminator = lineterminator | ||
|
|
||
| return dialect | ||
|
|
||
|
|
@@ -278,7 +281,7 @@ def _guess_quote_and_delimiter(self, data, delimiters): | |
| return (quotechar, doublequote, delim, skipinitialspace) | ||
|
|
||
|
|
||
| def _guess_delimiter(self, data, delimiters): | ||
| def _guess_delimiter(self, data, delimiters, lineterminator): | ||
| """ | ||
| The delimiter /should/ occur the same number of times on | ||
| each row. However, due to malformed data, it may not. We don't want | ||
|
|
@@ -297,7 +300,7 @@ def _guess_delimiter(self, data, delimiters): | |
| additional chunks as necessary. | ||
| """ | ||
|
|
||
| data = list(filter(None, data.split('\n'))) | ||
| data = list(filter(None, data.split(lineterminator))) | ||
|
|
||
| ascii = [chr(c) for c in range(127)] # 7-bit ASCII | ||
|
|
||
|
|
@@ -447,3 +450,14 @@ def has_header(self, sample): | |
| hasHeader -= 1 | ||
|
|
||
| return hasHeader > 0 | ||
|
|
||
|
|
||
| def _guess_lineterminator(self, sample): | ||
| # Guess the line terminator by presence: return the first candidate that | ||
| # occurs anywhere in sample. '\r\n' is tested before '\n' and '\r' so that | ||
| # a CRLF sample is not reported as one of its two component characters. | ||
| terminators = ('\r\n', '\n', '\r') | ||
|
|
||
| for terminator in terminators: | ||
| if terminator in sample: | ||
| return terminator | ||
|
|
||
| # None of the candidates occurs in sample: fall back to os.linesep. | ||
| return os.linesep | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In case the user submits a sample with an unusual line terminator or no line terminator at all. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| import csv | ||
| import gc | ||
| import pickle | ||
| import os | ||
| from test import support | ||
| from itertools import permutations | ||
| from textwrap import dedent | ||
|
|
@@ -1018,6 +1019,20 @@ def test_doublequote(self): | |
| dialect = sniffer.sniff(self.sample9) | ||
| self.assertTrue(dialect.doublequote) | ||
|
|
||
| def test_guess_lineterminator(self): | ||
| sniffer = csv.Sniffer() | ||
| dialect = sniffer.sniff(r"Date;Value\r\n2010-01-01;10") | ||
| self.assertEqual(dialect.lineterminator, '\r\n') | ||
| dialect = sniffer.sniff(r"Date;Value\n2010-01-01;10") | ||
| self.assertEqual(dialect.lineterminator, '\n') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This assumes that the line terminator for multiline triple-quoted string literals is '\n'. We require that '\r\n' in code files be converted to '\n' before merging, but the test should not assume '\n' for all developer and user systems where the test might be run. So Since the sample strings need not be long, they can all be literals defined here. Make all 3 the same except for the terminator. This will make the test both crystal clear and independent of system and optional local settings. |
||
| dialect = sniffer.sniff(r"Date;Value\r2010-01-01;10") | ||
| self.assertEqual(dialect.lineterminator, '\r') | ||
| dialect = sniffer.sniff(r"Date;Value\v2010-01-01;10") | ||
| self.assertEqual(dialect.lineterminator, os.linesep) | ||
| dialect = sniffer.sniff(r"Date;Value") | ||
| self.assertEqual(dialect.lineterminator, os.linesep) | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add tests to cover the last line of _guess_lineterminator (the `return os.linesep` fallback). |
||
|
|
||
| class NUL: | ||
| def write(s, *args): | ||
| pass | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| csv.Sniffer now detects the line terminator of the sample instead of always defaulting to '\r\n'. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the core of the patch. The sequence looks right.