Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Bug Fixes
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
- Bug in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally (:issue:`14734`, :issue:`13654`)



Expand Down
1 change: 0 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,3 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

10 changes: 10 additions & 0 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,12 @@ def _get_properties(self):
self.os_name = self.os_name.decode(
self.encoding or self.default_encoding)

def __next__(self):
    """Iterator protocol: return the next chunk of the SAS file.

    Reads ``chunksize`` rows per step (a single row when no chunksize
    was configured) and raises ``StopIteration`` once ``read`` reports
    the file is exhausted by returning ``None``.
    """
    step = self.chunksize if self.chunksize else 1
    chunk = self.read(nrows=step)
    if chunk is None:
        raise StopIteration
    return chunk

# Read a single float of the given width (4 or 8).
def _read_float(self, offset, width):
if width not in (4, 8):
Expand Down Expand Up @@ -591,6 +597,10 @@ def read(self, nrows=None):
if self._current_row_in_file_index >= self.row_count:
return None

m = self.row_count - self._current_row_in_file_index
if nrows > m:
nrows = m

nd = (self.column_types == b'd').sum()
ns = (self.column_types == b's').sum()

Expand Down
37 changes: 28 additions & 9 deletions pandas/io/tests/sas/test_sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,46 @@ def test_from_buffer(self):
df0 = self.data[j]
for k in self.test_ix[j]:
fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
with open(fname, 'rb') as f:
byts = f.read()
buf = io.BytesIO(byts)
df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
df = pd.read_sas(fname, encoding="utf-8")
tm.assert_frame_equal(df, df0, check_exact=False)

def test_from_iterator(self):
    # Incremental reads through the iterator interface must line up
    # with consecutive slices of the eagerly-loaded reference frames.
    for j in 0, 1:
        expected = self.data[j]
        for k in self.test_ix[j]:
            path = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            reader = pd.read_sas(path, iterator=True, encoding='utf-8')
            chunk = reader.read(2)
            tm.assert_frame_equal(chunk, expected.iloc[0:2, :])
            chunk = reader.read(3)
            tm.assert_frame_equal(chunk, expected.iloc[2:5, :])

def test_iterator_loop(self):
    # github #13654
    # Iterating a chunked reader must yield every row exactly once, for
    # chunk sizes that both do and do not divide the row count evenly.
    for j in 0, 1:
        for k in self.test_ix[j]:
            # fname depends only on k, so build it outside the
            # chunksize loop.
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            for chunksize in 3, 5, 10, 11:
                # BUG FIX: the original passed chunksize=10, ignoring
                # the loop variable and testing only one chunk size.
                rdr = pd.read_sas(fname, chunksize=chunksize,
                                  encoding='utf-8')
                y = 0
                for x in rdr:
                    y += x.shape[0]
                # assertEqual gives an informative failure message,
                # unlike assertTrue(y == ...).
                self.assertEqual(y, rdr.row_count)

def test_iterator_read_too_much(self):
    # github #14734
    # Requesting more rows than the file contains must return the full
    # frame, and behave identically for buffer- and filename-based
    # readers.
    k = self.test_ix[0][0]
    fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)

    with open(fname, 'rb') as f:
        buf = io.BytesIO(f.read())
    rdr = pd.read_sas(buf, format="sas7bdat", iterator=True,
                      encoding='utf-8')
    from_buffer = rdr.read(rdr.row_count + 20)

    rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
    from_file = rdr.read(rdr.row_count + 20)

    tm.assert_frame_equal(from_buffer, from_file)


def test_encoding_options():
dirpath = tm.get_data_path()
Expand Down
15 changes: 15 additions & 0 deletions pandas/io/tests/sas/test_xport.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ def test1_basic(self):
# Read full file
data = read_sas(self.file01, format="xport")
tm.assert_frame_equal(data, data_csv)
num_rows = data.shape[0]

# Test reading beyond end of file
reader = read_sas(self.file01, format="xport", iterator=True)
data = reader.read(num_rows + 100)
self.assertTrue(data.shape[0] == num_rows)
reader.close()

# Test incremental read with `read` method.
reader = read_sas(self.file01, format="xport", iterator=True)
Expand All @@ -48,6 +55,14 @@ def test1_basic(self):
reader.close()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])

# Test read in loop
m = 0
reader = read_sas(self.file01, format="xport", chunksize=100)
for x in reader:
m += x.shape[0]
reader.close()
self.assertTrue(m == num_rows)

# Read full file with `read_sas` method
data = read_sas(self.file01)
tm.assert_frame_equal(data, data_csv)
Expand Down