diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index ecbd6e9b3b288..a58e3499ac38f 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -29,7 +29,7 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - +- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 092cba093421a..3fe5e5e826ebd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1509,10 +1509,11 @@ def read(self, nrows=None): if self._first_chunk: self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) - index, columns, col_dict = _get_empty_meta( names, self.index_col, self.index_names, dtype=self.kwds.get('dtype')) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1979,8 +1980,11 @@ def read(self, rows=None): if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) - return _get_empty_meta(names, self.index_col, - self.index_names) + index, columns, col_dict = _get_empty_meta( + names, self.index_col, self.index_names) + columns = self._maybe_make_multi_index_columns( + columns, self.col_names) + return index, columns, col_dict # handle new style for names in index count_empty_content_vals = count_empty_vals(content[0]) @@ -2083,6 +2087,12 @@ def _infer_columns(self): # We have an empty file, so check # if columns are provided. 
That will # serve as the 'line' for parsing + if have_mi_columns and hr > 0: + if clear_buffer: + self._clear_buffer() + columns.append([None] * len(columns[-1])) + return columns, num_original_columns + if not self.names: raise EmptyDataError( "No columns to parse from file") diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4cb00c48976a4..6eb73876c11dd 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self): expected = self.read_csv(StringIO(data), index_col=[1, 0]) tm.assert_frame_equal(df, expected, check_names=False) + def test_multi_index_blank_df(self): + # GH 14515 + data = """a,b +""" + df = self.read_csv(StringIO(data), header=[0]) + expected = DataFrame(columns=['a', 'b']) + tm.assert_frame_equal(df, expected) + round_trip = self.read_csv(StringIO( + expected.to_csv(index=False)), header=[0]) + tm.assert_frame_equal(round_trip, expected) + + data_multiline = """a,b +c,d +""" + df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) + cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + expected2 = DataFrame(columns=cols) + tm.assert_frame_equal(df2, expected2) + round_trip = self.read_csv(StringIO( + expected2.to_csv(index=False)), header=[0, 1]) + tm.assert_frame_equal(round_trip, expected2) + def test_no_unnamed_index(self): data = """ id c0 c1 c2 0 1 0 a b diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 9fb99637731be..6b43dfbabc4a0 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -717,7 +717,9 @@ cdef class TextReader: start = self.parser.line_start[0] # e.g., if header=3 and file only has 2 lines - elif self.parser.lines < hr + 1: + elif (self.parser.lines < hr + 1 + and not isinstance(self.orig_header, list)) or ( + self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): msg = "[%s], len of %d," % ( @@ -940,7 +942,7 @@ cdef class TextReader: raise_parser_error('Error tokenizing 
data', self.parser) footer = self.skipfooter - if self.parser_start == self.parser.lines: + if self.parser_start >= self.parser.lines: raise StopIteration self._end_clock('Tokenization') diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4d6a5bb32038d..1eb3454519ce3 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -587,7 +587,7 @@ def _make_frame(names=None): df = _make_frame(True) df.to_csv(path, tupleize_cols=False) - for i in [5, 6, 7]: + for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with assertRaisesRegexp(ParserError, msg): read_csv(path, tupleize_cols=False,