Skip to content

Commit 26a2e6c

Browse files
l09rinseberg
and authored
BUG: fix max_rows and chunked string/datetime reading in loadtxt (numpy#26762)
* fixed bug at line 1058 in file numpy/lib/_npyio_impl.py; in function _read(), called by loadtxt() method, when files are read in chunks to reduce memory overhead, max_rows lines were always loaded every time, also in the case max_rows>_loadtxt_chunksize, in which case it loaded chunks with the wrong size. A test has been added in numpy/lib/tests/test_loadtxt.py, to check for the array size loaded for different max_rows, less and greater than _loadtxt_chunksize. * changed numpy/lib/tests/test_loadtxt.py; added further tests in functions at lines test_maxrows_exceeding_chunksize() and test_parametric_unit_discovery() to check if loadtxt() method loads correctly files as a whole and in chunks. It seems that the function _load_from_filelike() works well with file-like streams, but not with file objects. * changed value of filelike variable in file numpy/lib/_npyio_impl.py at line 1045; file was converted to iterable, but not accounted for, then _load_from_filelike() was not able to read the stream properly until the end. * I forgot to add the new version of test_loadtxt.py with the updated test functions for reading files in chunks... * within file numpy/lib/tests/test_loadtxt.py I reduced the size of the arrays within function test_maxrows_exceeding_chunksize() * add max_rows=10 in the call of loadtxt() within function test_field_growing_cases() to avoid memory allocation issues when the line grows too much. * Update numpy/lib/tests/test_loadtxt.py --------- Co-authored-by: Sebastian Berg <[email protected]>
1 parent 387830a commit 26a2e6c

File tree

2 files changed

+36
-5
lines changed

2 files changed

+36
-5
lines changed

numpy/lib/_npyio_impl.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
10401040
# Due to chunking, certain error reports are less clear, currently.
10411041
if filelike:
10421042
data = iter(data) # cannot chunk when reading from file
1043+
filelike = False
10431044

10441045
c_byte_converters = False
10451046
if read_dtype_via_object_chunks == "S":
@@ -1055,7 +1056,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
10551056
next_arr = _load_from_filelike(
10561057
data, delimiter=delimiter, comment=comment, quote=quote,
10571058
imaginary_unit=imaginary_unit,
1058-
usecols=usecols, skiplines=skiplines, max_rows=max_rows,
1059+
usecols=usecols, skiplines=skiplines, max_rows=chunk_size,
10591060
converters=converters, dtype=dtype,
10601061
encoding=encoding, filelike=filelike,
10611062
byte_converters=byte_converters,

numpy/lib/tests/test_loadtxt.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -970,24 +970,33 @@ def test_parametric_unit_discovery(
970970
"""Check that the correct unit (e.g. month, day, second) is discovered from
971971
the data when a user specifies a unitless datetime."""
972972
# Unit should be "D" (days) due to last entry
973-
data = [generic_data] * 50000 + [long_datum]
973+
data = [generic_data] * nrows + [long_datum]
974974
expected = np.array(data, dtype=expected_dtype)
975+
assert len(data) == nrows+1
976+
assert len(data) == len(expected)
975977

976978
# file-like path
977979
txt = StringIO("\n".join(data))
978980
a = np.loadtxt(txt, dtype=unitless_dtype)
981+
assert len(a) == len(expected)
979982
assert a.dtype == expected.dtype
980983
assert_equal(a, expected)
981984

982985
# file-obj path
983986
fd, fname = mkstemp()
984987
os.close(fd)
985988
with open(fname, "w") as fh:
986-
fh.write("\n".join(data))
989+
fh.write("\n".join(data)+"\n")
990+
# loading the full file...
987991
a = np.loadtxt(fname, dtype=unitless_dtype)
988-
os.remove(fname)
992+
assert len(a) == len(expected)
989993
assert a.dtype == expected.dtype
990994
assert_equal(a, expected)
995+
# loading half of the file...
996+
a = np.loadtxt(fname, dtype=unitless_dtype, max_rows=int(nrows/2))
997+
os.remove(fname)
998+
assert len(a) == int(nrows/2)
999+
assert_equal(a, expected[:int(nrows/2)])
9911000

9921001

9931002
def test_str_dtype_unit_discovery_with_converter():
@@ -1041,5 +1050,26 @@ def test_field_growing_cases():
10411050
assert len(res) == 0
10421051

10431052
for i in range(1, 1024):
1044-
res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
1053+
res = np.loadtxt(["," * i], delimiter=",", dtype=bytes, max_rows=10)
10451054
assert len(res) == i+1
1055+
1056+
@pytest.mark.parametrize("nmax", (10000, 50000, 55000, 60000))
1057+
def test_maxrows_exceeding_chunksize(nmax):
1058+
# tries to read all of the file,
1059+
# or less, equal, greater than _loadtxt_chunksize
1060+
file_length = 60000
1061+
1062+
# file-like path
1063+
data = ["a 0.5 1"]*file_length
1064+
txt = StringIO("\n".join(data))
1065+
res = np.loadtxt(txt, dtype=str, delimiter=" ", max_rows=nmax)
1066+
assert len(res) == nmax
1067+
1068+
# file-obj path
1069+
fd, fname = mkstemp()
1070+
os.close(fd)
1071+
with open(fname, "w") as fh:
1072+
fh.write("\n".join(data))
1073+
res = np.loadtxt(fname, dtype=str, delimiter=" ", max_rows=nmax)
1074+
os.remove(fname)
1075+
assert len(res) == nmax

0 commit comments

Comments
 (0)