From 9d6b61f2498f616c1e515804ce12d89ca02604d3 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 25 Nov 2016 10:09:00 -0500 Subject: [PATCH 1/6] Fix 14734 --- pandas/io/sas/sas7bdat.py | 10 ++++++++++ pandas/io/tests/sas/test_sas7bdat.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 2a82fd7a53222..91f417abc0502 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -225,6 +225,12 @@ def _get_properties(self): self.os_name = self.os_name.decode( self.encoding or self.default_encoding) + def __next__(self): + da = self.read(nrows=self.chunksize or 1) + if da is None: + raise StopIteration + return da + # Read a single float of the given width (4 or 8). def _read_float(self, offset, width): if width not in (4, 8): @@ -591,6 +597,10 @@ def read(self, nrows=None): if self._current_row_in_file_index >= self.row_count: return None + m = self.row_count - self._current_row_in_file_index + if nrows > m: + nrows = m + nd = (self.column_types == b'd').sum() ns = (self.column_types == b's').sum() diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 06eb9774679b1..0ae8e69999987 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -65,6 +65,32 @@ def test_from_iterator(self): df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) + def test_iterator_loop(self): + for j in 0, 1: + for k in self.test_ix[j]: + for chunksize in 3, 5, 10, 11: + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + with open(fname, 'rb') as f: + byts = f.read() + buf = io.BytesIO(byts) + rdr = pd.read_sas(buf, format="sas7bdat", + chunksize=chunksize, encoding='utf-8') + y = 0 + for x in rdr: + y += x.shape[0] + assert(y == rdr.row_count) + + def test_iterator_read_too_much(self): + # github #14734 + k = self.test_ix[0][0] + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + with open(fname, 'rb') as f: + byts = f.read() + buf = io.BytesIO(byts) + rdr = pd.read_sas(buf, format="sas7bdat", + iterator=True, encoding='utf-8') + rdr.read(rdr.row_count + 20) + def test_encoding_options(): dirpath = tm.get_data_path() From e8327e0ee5a827334c42a9b71fae625e98fad0ab Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 25 Nov 2016 14:23:40 -0500 Subject: [PATCH 2/6] Added to whatsnew --- doc/source/whatsnew/v0.20.0.txt | 3 +++ pandas/io/tests/sas/test_sas7bdat.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 65b62601c7022..5d2c5f014391d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -84,3 +84,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and +``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when +reading a SAS file incrementally. diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 0ae8e69999987..4cbc385ea1168 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -66,6 +66,7 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[2:5, :]) def test_iterator_loop(self): + # github #13654 for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: @@ -78,7 +79,7 @@ def test_iterator_loop(self): y = 0 for x in rdr: y += x.shape[0] - assert(y == rdr.row_count) + self.assertTrue(y == rdr.row_count) def test_iterator_read_too_much(self): # github #14734 From 4504df5c6e63cb79356b4bd2b4a0918aca11644a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 25 Nov 2016 17:03:12 -0500 Subject: [PATCH 3/6] Add iterator tests for xport --- pandas/io/tests/sas/test_xport.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py index d0627a80f9604..fe2f7cb4bf4be 100644 --- a/pandas/io/tests/sas/test_xport.py +++ b/pandas/io/tests/sas/test_xport.py @@ -35,6 +35,13 @@ def test1_basic(self): # Read full file data = read_sas(self.file01, format="xport") tm.assert_frame_equal(data, data_csv) + num_rows = data.shape[0] + + # Test reading beyond end of file + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(num_rows + 100) + self.assertTrue(data.shape[0] == num_rows) + reader.close() # Test incremental read with `read` method. reader = read_sas(self.file01, format="xport", iterator=True) @@ -48,6 +55,14 @@ def test1_basic(self): reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) + # Test read in loop + m = 0 + reader = read_sas(self.file01, format="xport", chunksize=100) + for x in reader: + m += x.shape[0] + reader.close() + self.assertTrue(m == num_rows) + # Read full file with `read_sas` method data = read_sas(self.file01) tm.assert_frame_equal(data, data_csv) From a7b7da88052f472e8c2c592d885014a0e1b75230 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 25 Nov 2016 17:15:45 -0500 Subject: [PATCH 4/6] Moved whatsnew to 19.2 --- doc/source/whatsnew/v0.19.2.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index d9aa92270669d..a5fca8f268d9c 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -31,6 +31,7 @@ Bug Fixes - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) +- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 5d2c5f014391d..03e0cae6cc83f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -83,7 +83,3 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - -- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and -``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when -reading a SAS file incrementally. From 8c1e17e24ffbd36383c3c8bacd7d7eff7b822a21 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 25 Nov 2016 17:27:06 -0500 Subject: [PATCH 5/6] Bypass ioreader --- pandas/io/tests/sas/test_sas7bdat.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 4cbc385ea1168..530e0b3701e32 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -44,10 +44,7 @@ def test_from_buffer(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - with open(fname, 'rb') as f: - byts = f.read() - buf = io.BytesIO(byts) - df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8') + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0, check_exact=False) def test_from_iterator(self): @@ -55,11 +52,7 @@ def test_from_iterator(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - with open(fname, 'rb') as f: - byts = f.read() - buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas(fname, iterator=True, encoding='utf-8') df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) @@ -71,11 +64,7 @@ def test_iterator_loop(self): for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - with open(fname, 'rb') as f: - byts = f.read() - buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", - chunksize=chunksize, encoding='utf-8') + rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8') y = 0 for x in rdr: y += x.shape[0] @@ -90,7 +79,10 @@ def test_iterator_read_too_much(self): buf = io.BytesIO(byts) rdr = pd.read_sas(buf, format="sas7bdat", iterator=True, encoding='utf-8') - rdr.read(rdr.row_count + 20) + d1 = rdr.read(rdr.row_count + 20) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + d2 = rdr.read(rdr.row_count + 20) + tm.assert_frame_equal(d1, d2) def test_encoding_options(): From 28d4038c3330332da2d3631214d26aa93c98d636 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Sat, 26 Nov 2016 10:01:00 -0500 Subject: [PATCH 6/6] Minor change to tests --- pandas/io/tests/sas/test_sas7bdat.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 530e0b3701e32..e20ea48247119 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -44,7 +44,12 @@ def test_from_buffer(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - df = pd.read_sas(fname, encoding="utf-8") + with open(fname, 'rb') as f: + byts = f.read() + buf = io.BytesIO(byts) + rdr = pd.read_sas(buf, format="sas7bdat", + iterator=True, encoding='utf-8') + df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) def test_from_iterator(self): @@ -74,10 +79,7 @@ def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - with open(fname, 'rb') as f: - byts = f.read() - buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", + rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding='utf-8') d1 = rdr.read(rdr.row_count + 20) rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")