Merge pull request #419 from Blosc/fixConditions

lshaw8317 · web-flow · commit 7eabaf018fdb · 2025-05-30T13:56:56.000+02:00
Make behaviour of compute consistent for slicing
diff --git a/examples/ndarray/lazyexpr_where_indexing.py b/examples/ndarray/lazyexpr_where_indexing.py
@@ -0,0 +1,42 @@
+# Imports
+
+import numpy as np
+
+import blosc2
+
+N = 1000
+it = ((-x + 1, x - 2, 0.1 * x) for x in range(N))
+sa = blosc2.fromiter(
+    it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w"
+)
+expr = sa["(A < B)"]
+A = sa["A"][:]
+B = sa["B"][:]
+C = sa["C"][:]
+temp = sa[:]
+indices = A < B
+idx = np.argmax(indices)
+
+# One might think that expr[:10] gives the first 10 elements of the evaluated expression, but this is not the case.
+# It actually computes the expression on the first 10 elements of the operands; since for some elements the condition
+# is False, the result will be shorter than 10 elements.
+# Returns less than 10 elements in general
+sliced = expr.compute(slice(0, 10))
+gotitem = expr[:10]
+np.testing.assert_array_equal(sliced[:], gotitem)
+np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]])  # Equivalent syntax
+# Actually this makes sense since one can understand this as a request to compute on a portion of operands.
+# If one desires a portion of the result, one should compute the whole expression and then slice it.
+
+# Get first element for which condition is true
+sliced = expr.compute(idx)
+gotitem = expr[idx]
+# Arrays of one element
+np.testing.assert_array_equal(sliced[()], gotitem)
+np.testing.assert_array_equal(gotitem, temp[idx])
+
+# Should return void arrays here.
+sliced = expr.compute(0)
+gotitem = expr[0]
+np.testing.assert_array_equal(sliced[()], gotitem)
+np.testing.assert_array_equal(gotitem, temp[0])
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
@@ -278,8 +278,9 @@ def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blo
         Parameters
         ----------
         item: slice, list of slices, optional
-            If not None, only the chunks that intersect with the slices
-            in items will be evaluated.
+            If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of
+            the evaluated result. This difference between slicing operands and slicing the final expression
+            is important when reductions or a where clause are used in the expression.
 
         kwargs: Any, optional
             Keyword arguments that are supported by the :func:`empty` constructor.
@@ -328,7 +329,9 @@ def __getitem__(self, item: int | slice | Sequence[slice]) -> blosc2.NDArray:
         Parameters
         ----------
         item: int, slice or sequence of slices
-            The slice(s) to be retrieved. Note that step parameter is not yet honored.
+            If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of
+            the evaluated result. This difference between slicing operands and slicing the final expression
+            is important when reductions or a where clause are used in the expression.
 
         Returns
         -------
@@ -1378,7 +1381,8 @@ def slices_eval(  # noqa: C901
             for i, (c, s) in enumerate(zip(coords, chunks, strict=True))
         )
         # Check whether current slice_ intersects with _slice
-        if _slice is not None and _slice != ():
+        checker = _slice.item() if hasattr(_slice, "item") else _slice  # can't use != when _slice is np.int
+        if checker is not None and checker != ():
             # Ensure that _slice is of type slice
             key = ndindex.ndindex(_slice).expand(shape).raw
             _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key)
@@ -1508,19 +1512,7 @@ def slices_eval(  # noqa: C901
         else:
             raise ValueError("The where condition must be a tuple with one or two elements")
 
-    if orig_slice is not None:
-        if isinstance(out, np.ndarray):
-            out = out[orig_slice]
-            if _order is not None:
-                indices_ = indices_[orig_slice]
-        elif isinstance(out, blosc2.NDArray):
-            # It *seems* better to choose an automatic chunks and blocks for the output array
-            # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks)
-            out = out.slice(orig_slice)
-        else:
-            raise ValueError("The output array is not a NumPy array or a NDArray")
-
-    if where is not None and len(where) < 2:
+    if where is not None and len(where) < 2:  # Don't need to take orig_slice since filled up from 0 index
         if _order is not None:
             # argsort the result following _order
             new_order = np.argsort(out[:lenout])
@@ -1532,6 +1524,19 @@ def slices_eval(  # noqa: C901
         else:
             out.resize((lenout,))
 
+    else:  # Need to take orig_slice since filled up array according to slice_ for each chunk
+        if orig_slice is not None:
+            if isinstance(out, np.ndarray):
+                out = out[orig_slice]
+                if _order is not None:
+                    indices_ = indices_[orig_slice]
+            elif isinstance(out, blosc2.NDArray):
+                # It *seems* better to choose an automatic chunks and blocks for the output array
+                # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks)
+                out = out.slice(orig_slice)
+            else:
+                raise ValueError("The output array is not a NumPy array or a NDArray")
+
     return out
 
 
@@ -1827,7 +1832,8 @@ def chunked_eval(  # noqa: C901
     operands: dict
         A dictionary containing the operands for the expression.
     item: int, slice or sequence of slices, optional
-        The slice(s) to be retrieved. Note that step parameter is not honored yet.
+        The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet.
+        Item is used to slice the operands PRIOR to computation.
     kwargs: Any, optional
         Additional keyword arguments supported by the :func:`empty` constructor.  In addition,
         the following keyword arguments are supported:
diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py
@@ -279,17 +279,19 @@ def test_where_one_param(array_fixture):
         res = np.sort(res)
         nres = np.sort(nres)
     np.testing.assert_allclose(res[:], nres)
+
     # Test with getitem
     sl = slice(100)
     res = expr.where(a1)[sl]
+    nres = na1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]]
     if len(a1.shape) == 1 or a1.chunks == a1.shape:
         # TODO: fix this, as it seems that is not working well for numexpr?
         if blosc2.IS_WASM:
             return
-        np.testing.assert_allclose(res, nres[sl])
+        np.testing.assert_allclose(res, nres)
     else:
         # In this case, we cannot compare results, only the length
-        assert len(res) == len(nres[sl])
+        assert len(res) == len(nres)
 
 
 # Test where indirectly via a condition in getitem in a NDArray
@@ -330,25 +332,26 @@ def test_where_getitem(array_fixture):
     # Test with partial slice
     sl = slice(100)
     res = sa1[a1**2 + a2**2 > 2 * a1 * a2 + 1][sl]
+    nres = nsa1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]]
     if len(a1.shape) == 1 or a1.chunks == a1.shape:
         # TODO: fix this, as it seems that is not working well for numexpr?
         if blosc2.IS_WASM:
             return
-        np.testing.assert_allclose(res["a"], nres[sl]["a"])
-        np.testing.assert_allclose(res["b"], nres[sl]["b"])
+        np.testing.assert_allclose(res["a"], nres["a"])
+        np.testing.assert_allclose(res["b"], nres["b"])
     else:
         # In this case, we cannot compare results, only the length
-        assert len(res["a"]) == len(nres[sl]["a"])
-        assert len(res["b"]) == len(nres[sl]["b"])
+        assert len(res["a"]) == len(nres["a"])
+        assert len(res["b"]) == len(nres["b"])
     # string version
     res = sa1["a**2 + b**2 > 2 * a * b + 1"][sl]
     if len(a1.shape) == 1 or a1.chunks == a1.shape:
-        np.testing.assert_allclose(res["a"], nres[sl]["a"])
-        np.testing.assert_allclose(res["b"], nres[sl]["b"])
+        np.testing.assert_allclose(res["a"], nres["a"])
+        np.testing.assert_allclose(res["b"], nres["b"])
     else:
         # We cannot compare the results here, other than the length
-        assert len(res["a"]) == len(nres[sl]["a"])
-        assert len(res["b"]) == len(nres[sl]["b"])
+        assert len(res["a"]) == len(nres["a"])
+        assert len(res["b"]) == len(nres["b"])
 
 
 # Test where indirectly via a condition in getitem in a NDField
@@ -631,3 +634,40 @@ def test_col_reduction(reduce_op):
     ns = nreduc(nC[nC > 0])
     np.testing.assert_allclose(s, ns)
     np.testing.assert_allclose(s2, ns)
+
+
+def test_fields_indexing():
+    N = 1000
+    it = ((-x + 1, x - 2, 0.1 * x) for x in range(N))
+    sa = blosc2.fromiter(
+        it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w"
+    )
+    expr = sa["(A < B)"]
+    A = sa["A"][:]
+    B = sa["B"][:]
+    C = sa["C"][:]
+    temp = sa[:]
+    indices = A < B
+    idx = np.argmax(indices)
+
+    # Returns less than 10 elements in general
+    sliced = expr.compute(slice(0, 10))
+    gotitem = expr[:10]
+    np.testing.assert_array_equal(sliced[:], gotitem)
+    np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]])
+    # Actually this makes sense since one can understand this as a request to compute on a portion of operands.
+    # If one desires a portion of the result, one should compute the whole expression and then slice it.
+    # For a general slice it is quite difficult to simply stop when the desired slice has been obtained. Or
+    # to try to optimise chunk computation order.
+
+    # Get first true element
+    sliced = expr.compute(idx)
+    gotitem = expr[idx]
+    np.testing.assert_array_equal(sliced[()], gotitem)
+    np.testing.assert_array_equal(gotitem, temp[idx])
+
+    # Should return void arrays here.
+    sliced = expr.compute(0)  # typically gives array of zeros
+    gotitem = expr[0]  # gives an error
+    np.testing.assert_array_equal(sliced[()], gotitem)
+    np.testing.assert_array_equal(gotitem, temp[0])