Skip to content

Commit 26a2e6c

Browse files
l09rinseberg
and authored
BUG: fix max_rows and chunked string/datetime reading in loadtxt (numpy#26762)
* fixed bug at line 1058 in file numpy/lib/_npyio_impl.py; in function _read(), called by loadtxt() method, when files are read in chunks to reduce memory overhead, max_rows lines were always loaded every time, also in the case max_rows>_loadtxt_chunksize, in which case it loaded chunks with the wrong size. A test has been added in numpy/lib/tests/test_loadtxt.py, to check for the array size loaded for different max_rows, less and greater than _loadtxt_chunksize. * changed numpy/lib/tests/test_loadtxt.py; added further tests in functions at lines test_maxrows_exceeding_chunksize() and test_parametric_unit_discovery() to check if loadtxt() method loads correctly files as a whole and in chunks. It seems that the function _load_from_filelike() works well with file-like streams, but not with file objects. * changed value of filelike variable in file numpy/lib/_npyio_impl.py at line 1045; file was converted to iterable, but not accounted for, then _load_from_filelike() was not able to read the stream properly until the end. * I forgot to add the new version of test_loadtxt.py with the updated test functions for reading files in chunks... * within file numpy/lib/tests/test_loadtxt.py I reduced the size of the arrays within function test_maxrows_exceeding_chunksize() * add max_rows=10 in the call of loadtxt() within function test_field_growing_cases() to avoid memory allocation issues when the line grows too much. * Update numpy/lib/tests/test_loadtxt.py --------- Co-authored-by: Sebastian Berg <[email protected]>
1 parent 387830a commit 26a2e6c

File tree

2 files changed

+36
-5
lines changed

2 files changed

+36
-5
lines changed

numpy/lib/_npyio_impl.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
10401040
# Due to chunking, certain error reports are less clear, currently.
10411041
if filelike:
10421042
data = iter(data) # cannot chunk when reading from file
1043+
filelike = False
10431044

10441045
c_byte_converters = False
10451046
if read_dtype_via_object_chunks == "S":
@@ -1055,7 +1056,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
10551056
next_arr = _load_from_filelike(
10561057
data, delimiter=delimiter, comment=comment, quote=quote,
10571058
imaginary_unit=imaginary_unit,
1058-
usecols=usecols, skiplines=skiplines, max_rows=max_rows,
1059+
usecols=usecols, skiplines=skiplines, max_rows=chunk_size,
10591060
converters=converters, dtype=dtype,
10601061
encoding=encoding, filelike=filelike,
10611062
byte_converters=byte_converters,

numpy/lib/tests/test_loadtxt.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -970,24 +970,33 @@ def test_parametric_unit_discovery(
970970
"""Check that the correct unit (e.g. month, day, second) is discovered from
971971
the data when a user specifies a unitless datetime."""
972972
# Unit should be "D" (days) due to last entry
973-
data = [generic_data] * 50000 + [long_datum]
973+
data = [generic_data] * nrows + [long_datum]
974974
expected = np.array(data, dtype=expected_dtype)
975+
assert len(data) == nrows+1
976+
assert len(data) == len(expected)
975977

976978
# file-like path
977979
txt = StringIO("\n".join(data))
978980
a = np.loadtxt(txt, dtype=unitless_dtype)
981+
assert len(a) == len(expected)
979982
assert a.dtype == expected.dtype
980983
assert_equal(a, expected)
981984

982985
# file-obj path
983986
fd, fname = mkstemp()
984987
os.close(fd)
985988
with open(fname, "w") as fh:
986-
fh.write("\n".join(data))
989+
fh.write("\n".join(data)+"\n")
990+
# loading the full file...
987991
a = np.loadtxt(fname, dtype=unitless_dtype)
988-
os.remove(fname)
992+
assert len(a) == len(expected)
989993
assert a.dtype == expected.dtype
990994
assert_equal(a, expected)
995+
# loading half of the file...
996+
a = np.loadtxt(fname, dtype=unitless_dtype, max_rows=int(nrows/2))
997+
os.remove(fname)
998+
assert len(a) == int(nrows/2)
999+
assert_equal(a, expected[:int(nrows/2)])
9911000

9921001

9931002
def test_str_dtype_unit_discovery_with_converter():
@@ -1041,5 +1050,26 @@ def test_field_growing_cases():
10411050
assert len(res) == 0
10421051

10431052
for i in range(1, 1024):
1044-
res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
1053+
res = np.loadtxt(["," * i], delimiter=",", dtype=bytes, max_rows=10)
10451054
assert len(res) == i+1
1055+
1056+
@pytest.mark.parametrize("nmax", (10000, 50000, 55000, 60000))
1057+
def test_maxrows_exceeding_chunksize(nmax):
1058+
# tries to read all of the file,
1059+
# or less, equal, greater than _loadtxt_chunksize
1060+
file_length = 60000
1061+
1062+
# file-like path
1063+
data = ["a 0.5 1"]*file_length
1064+
txt = StringIO("\n".join(data))
1065+
res = np.loadtxt(txt, dtype=str, delimiter=" ", max_rows=nmax)
1066+
assert len(res) == nmax
1067+
1068+
# file-obj path
1069+
fd, fname = mkstemp()
1070+
os.close(fd)
1071+
with open(fname, "w") as fh:
1072+
fh.write("\n".join(data))
1073+
res = np.loadtxt(fname, dtype=str, delimiter=" ", max_rows=nmax)
1074+
os.remove(fname)
1075+
assert len(res) == nmax

0 commit comments

Comments
 (0)