1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.2.txt
@@ -67,6 +67,7 @@ Bug Fixes


- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`)



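For context, the fixed behavior can be sketched as follows (an illustrative repro, not part of the PR; the column contents and row counts are made up, chosen so the input exceeds the C parser's 262144-byte read chunk):

from io import StringIO

import pandas as pd

# Ten-column rows, enough of them to span more than one 256 KB chunk,
# followed by fifteen-column rows that only a parser reading past
# nrows would ever see.
narrow = ','.join(['x' * 25] * 10) + '\n'
wide = ','.join(['y' * 25] * 15) + '\n'
data = 'h0,h1,h2,h3,h4,h5,h6,h7,h8,h9\n' + narrow * 1050 + wide * 2

# Before this fix, the C engine could keep tokenizing into the
# fifteen-column rows and raise a parse error; afterwards it stops
# cleanly at nrows.
df = pd.read_csv(StringIO(data), nrows=1010, engine='c')
assert len(df) == 1010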
17 changes: 17 additions & 0 deletions pandas/io/tests/parser/c_parser_only.py
@@ -371,3 +371,20 @@ def test_internal_null_byte(self):

        result = self.read_csv(StringIO(data), names=names)
        tm.assert_frame_equal(result, expected)

    def test_read_nrows_large(self):
        # gh-7626: read only nrows of data for large inputs (> 262144 bytes)
        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
                                   for i in range(10)]) + '\n'
        data_narrow = '\t'.join(['somedatasomedatasomedata1'
                                 for i in range(10)]) + '\n'
        header_wide = '\t'.join(['COL_HEADER_' + str(i)
                                 for i in range(15)]) + '\n'
        data_wide = '\t'.join(['somedatasomedatasomedata2'
                               for i in range(15)]) + '\n'
        test_input = (header_narrow + data_narrow * 1050 +
                      header_wide + data_wide * 2)

        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)

        self.assertTrue(df.size == 1010 * 10)
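The magic numbers above are worth unpacking (an illustrative calculation, not stated in the PR): each narrow row is ten 25-character fields plus nine tabs and a newline, i.e. 260 bytes, so the 1050 narrow rows alone take about 273,000 bytes and spill past the C parser's 262,144-byte (256 KB) read chunk. The two trailing 15-column rows act as a tripwire: a parser that tokenizes past nrows=1010 hits them and fails on the extra fields, while a correct one returns exactly 1010 rows of 10 columns, which is what the df.size assertion checks.

# Sanity check mirroring the test's construction:
row = '\t'.join(['somedatasomedatasomedata1' for i in range(10)]) + '\n'
assert len(row) == 260            # 10 * 25 chars + 9 tabs + 1 newline
assert len(row) * 1050 > 262144   # the narrow block alone exceeds one chunk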
17 changes: 17 additions & 0 deletions pandas/io/tests/parser/common.py
@@ -427,6 +427,23 @@ def test_read_nrows(self):
        with tm.assertRaisesRegexp(ValueError, msg):
            self.read_csv(StringIO(self.data1), nrows='foo')

    def test_read_nrows_large(self):
        # GH-7626: read only nrows of data for large inputs (> 262144 bytes)
        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
                                   for i in range(10)]) + '\n'
        data_narrow = '\t'.join(['somedatasomedatasomedata1'
                                 for i in range(10)]) + '\n'
        header_wide = '\t'.join(['COL_HEADER_' + str(i)
                                 for i in range(15)]) + '\n'
        data_wide = '\t'.join(['somedatasomedatasomedata2'
                               for i in range(15)]) + '\n'
        test_input = (header_narrow + data_narrow * 1050 +
                      header_wide + data_wide * 2)

        df = self.read_csv(StringIO(test_input), sep="\t", nrows=1010)

        self.assertTrue(df.size == 1010 * 10)

[Review comment from a Contributor, on test_read_nrows_large]: this is duplicating the above test in c_parser_only

    def test_read_chunksize(self):
        reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
        df = self.read_csv(StringIO(self.data1), index_col=0)
8 changes: 3 additions & 5 deletions pandas/src/parser/tokenizer.c
@@ -726,16 +726,14 @@ int skip_this_line(parser_t *self, int64_t rownum) {
    }
}

-int tokenize_bytes(parser_t *self, size_t line_limit)
+int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines)
{
-    int i, slen, start_lines;
+    int i, slen;
    long maxstreamsize;
    char c;
    char *stream;
    char *buf = self->data + self->datapos;

-    start_lines = self->lines;
-
    if (make_stream_space(self, self->datalen - self->datapos) < 0) {
        self->error_msg = "out of memory";
        return -1;
@@ -1384,7 +1382,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
        TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n",
               self->datalen - self->datapos, self->datalen, self->datapos));

-        status = tokenize_bytes(self, nrows);
+        status = tokenize_bytes(self, nrows, start_lines);

        if (status < 0) {
            // XXX
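To summarize the fix: tokenize_bytes enforces the row limit with a check of the form self->lines == start_lines + line_limit, and _tokenize_helper invokes it once per buffered chunk. When start_lines was recaptured from self->lines at the top of every tokenize_bytes call, that check only bounded the lines added within the current chunk, so any input spanning more than one 262144-byte chunk could run past nrows; passing the caller's start_lines through makes the bound cumulative across chunks. A simplified Python model of the control flow (hypothetical names, only a sketch of the C logic above):

CHUNK_LINES = 3  # stand-in for however many rows fit in one 256 KB chunk

def tokenize_bytes(state, line_limit, start_lines):
    # One call processes at most one chunk's worth of rows.
    for _ in range(CHUNK_LINES):
        if state['pending'] == 0:
            break
        state['lines'] += 1
        state['pending'] -= 1
        # The stop check that is supposed to enforce nrows:
        if line_limit > 0 and state['lines'] == start_lines + line_limit:
            break

def tokenize_rows(total_rows, nrows, fixed):
    state = {'lines': 0, 'pending': total_rows}
    start_lines = state['lines']  # captured once, before the chunk loop
    while state['pending'] and state['lines'] - start_lines < nrows:
        if fixed:
            tokenize_bytes(state, nrows, start_lines)
        else:
            # Old behavior: start_lines recomputed inside every call, so
            # the stop check was relative to the current chunk only.
            tokenize_bytes(state, nrows, state['lines'])
    return state['lines']

print(tokenize_rows(10, nrows=4, fixed=False))  # 6: overshoots to a chunk boundary
print(tokenize_rows(10, nrows=4, fixed=True))   # 4: stops exactly at nrows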