Support for col selects and reductions in one go

FrancescAlted · FrancescAlted · commit e4d6c47f8cb2 · 2024-11-30T12:36:51.000+01:00
diff --git a/doc/reference/lazyarray.rst b/doc/reference/lazyarray.rst
@@ -3,15 +3,15 @@
 LazyArray
 =========
 
-This is an API interface for evaluating an expression or a Python user defined function.
+This is an API for computing an expression or a Python user-defined function.
 
 You can get an object following the LazyArray API with any of the following ways:
 
 * Any expression that involves one or more NDArray objects. e.g. ``a + b``, where ``a`` and ``b`` are NDArray objects (see  `this tutorial <../getting_started/tutorials/03.lazyarray-expressions.html>`_).
 * Using the ``lazyexpr`` constructor.
 * Using the ``lazyudf`` constructor (see `a tutorial <../getting_started/tutorials/03.lazyarray-udf.html>`_).
 
-The LazyArray object is a thin wrapper around the expression or user-defined function that allows for lazy evaluation. This means that the expression is not evaluated until the ``compute`` or ``__getitem__`` methods are called. The ``compute`` method will return a new NDArray object with the result of the expression evaluation. The ``__getitem__`` method will return an NumPy object instead.
+The LazyArray object is a thin wrapper around the expression or user-defined function that allows for lazy computation. This means that the expression is not computed until the ``compute`` or ``__getitem__`` methods are called. The ``compute`` method will return a new NDArray object with the result of the expression evaluation. The ``__getitem__`` method will return a NumPy object instead.
 
 See the `LazyExpr`_ and `LazyUDF`_ sections for more information.
 
@@ -38,7 +38,7 @@ LazyExpr
 
 An expression like ``a + sum(b)``, where there is at least one NDArray object in operands ``a`` and ``b``, `returns a LazyExpr object <../getting_started/tutorials/03.lazyarray-expressions.html>`_. You can also get a LazyExpr object using the ``lazyexpr`` constructor (see below).
 
-This object follows the `LazyArray`_ API for evaluation and storage.
+This object follows the `LazyArray`_ API for computation and storage.
 
 .. currentmodule:: blosc2
 
@@ -67,7 +67,7 @@ LazyUDF
 
 For getting a LazyUDF object (which is LazyArray-compliant) from a user-defined Python function, you can use the lazyudf constructor below. See  `a tutorial on how this works <../getting_started/tutorials/03.lazyarray-udf.html>`_.
 
-This object follows the `LazyArray`_ API for evaluation, although storage is not supported yet.
+This object follows the `LazyArray`_ API for computation, although storage is not supported yet.
 
 .. autosummary::
     :toctree: autofiles/lazyarray
diff --git a/src/blosc2/core.py b/src/blosc2/core.py
@@ -1285,7 +1285,7 @@ def compute_chunks_blocks(  # noqa: C901
     blocks: tuple | list | None = None,
     dtype: np.dtype = np.uint8,
     **kwargs: dict,
-) -> tuple[(int, int)]:
+) -> tuple:
     """
     Compute educated guesses for chunks and blocks of a :ref:`NDArray`.
 
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
@@ -1442,16 +1442,15 @@ def reduce_slices(  # noqa: C901
         else:
             # Apply the where condition (in result)
             if len(where) == 2:
-                # x = chunk_operands["_where_x"]
-                # y = chunk_operands["_where_y"]
-                # result = np.where(result, x, y)
-                # numexpr is a bit faster than np.where, and we can fuse operations in this case
                 new_expr = f"where({expression}, _where_x, _where_y)"
                 result = ne.evaluate(new_expr, chunk_operands)
+            elif len(where) == 1:
+                result = ne.evaluate(expression, chunk_operands)
+                x = chunk_operands["_where_x"]
+                result = x[result]
             else:
-                raise ValueError(
-                    "A where condition with less than 2 params in combination with reductions"
-                    " is not supported yet"
+                raise NotImplementedError(
+                    "A where condition with no params in combination with reductions is not supported yet"
                 )
 
         # Reduce the result
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py
@@ -384,7 +384,7 @@ def sum(
         Axis or axes along which a sum is performed. By default, axis=None,
         sums all the elements of the input array. If axis is negative,
         it counts from the last to the first axis.
-    dtype: np.dtype or str, optional
+    dtype: np.dtype or list str, optional
         The type of the returned array and of the accumulator in which the
         elements are summed. The dtype of :paramref:`ndarr` is used by default unless it has
         an integer dtype of less precision than the default platform integer.
@@ -477,7 +477,7 @@ def std(
     axis: int or tuple of ints, optional
         Axis or axes along which the standard deviation is computed. By default, `axis=None`
         computes the standard deviation of the flattened array.
-    dtype: np.dtype or str, optional
+    dtype: np.dtype or list str, optional
         Type to use in computing the standard deviation. For integer inputs, the
         default is float32; for floating point inputs, it is the same as the input dtype.
     ddof: int, optional
@@ -1666,7 +1666,7 @@ def copy(self, dtype: np.dtype | str = None, **kwargs: Any) -> NDArray:
 
         Parameters
         ----------
-        dtype: np.dtype or str
+        dtype: np.dtype or list str
             The new array dtype. Default is `self.dtype`.
 
         Other Parameters
@@ -2793,7 +2793,7 @@ def empty(shape: int | tuple | list, dtype: np.dtype | str | None = np.float64,
     ----------
     shape: int, tuple or list
         The shape for the final array.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The data type of the array elements in NumPy format. Default is `np.uint8`.
         This will override the `typesize`
         in the compression parameters if they are provided.
@@ -2965,7 +2965,7 @@ def full(
         Default value to use for uninitialized portions of the array.
         Its size will override the `typesize`
         in the cparams if they are passed.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The ndarray dtype in NumPy format. By default, this will
         be taken from the :paramref:`fill_value`.
         This will override the `typesize`
@@ -3059,7 +3059,7 @@ def arange(
         The end value of the sequence.
     step: int, float, complex or np.number
         Spacing between values.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The data type of the array elements in NumPy format. Default is `np.uint8`.
         This will override the `typesize`
         in the compression parameters if they are provided.
@@ -3152,7 +3152,7 @@ def linspace(start, stop, num=50, endpoint=True, dtype=np.float64, shape=None, c
         Number of samples to generate.
     endpoint: bool
         If True, `stop` is the last sample. Otherwise, it is not included.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The data type of the array elements in NumPy format. Default is `np.float64`.
     shape: int, tuple or list
         The shape of the final array. If None, the shape will be guessed from `num`.
@@ -3218,7 +3218,7 @@ def eye(N, M=None, k=0, dtype=np.float64, **kwargs: Any):
         Index of the diagonal: 0 (the default) refers to the main diagonal,
         a positive value refers to an upper diagonal, and a negative value
         to a lower diagonal.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The data type of the array elements in NumPy format. Default is `np.float64`.
 
     Returns
@@ -3263,7 +3263,7 @@ def fromiter(iterable, shape, dtype, c_order=True, **kwargs) -> NDArray:
         An iterable object providing data for the array.
     shape: int, tuple or list
         The shape of the final array.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The data type of the array elements in NumPy format.
     c_order: bool
         Whether to store the array in C order (row-major) or insertion order.
@@ -3329,7 +3329,7 @@ def frombuffer(
         The buffer of the data to populate the container.
     shape: int, tuple or list
         The shape for the final container.
-    dtype: np.dtype or str
+    dtype: np.dtype or list str
         The ndarray dtype in NumPy format. Default is `np.uint8`.
         This will override the `typesize`
         in the cparams if they are passed.
@@ -3373,8 +3373,8 @@ def copy(array: NDArray, dtype: np.dtype | str = None, **kwargs: Any) -> NDArray
     --------
     >>> import numpy as np
     >>> import blosc2
-    >>> # Create an instance of MyNDArray with some data
-    >>> original_array = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
+    >>> # Create an instance of NDArray with some data
+    >>> original_array = blosc2.asarray(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]))
     >>> # Create a copy of the array without changing dtype
     >>> copied_array = blosc2.copy(original_array)
     >>> print("Copied array (default dtype):")
@@ -3727,7 +3727,14 @@ def __getitem__(self, key: int | slice | Sequence[slice]) -> np.ndarray:
             return key.where(self)
 
         if isinstance(key, str):
-            raise TypeError("This array is a NDField; use a structured NDArray for bool expressions")
+            # Try to compute the key as a boolean expression
+            # Operands will be a dict with all the fields in the NDArray
+            operands = {field: NDField(self.ndarr, field) for field in self.ndarr.dtype.names}
+            expr = blosc2.lazyexpr(key, operands)
+            if expr.dtype != np.bool_:
+                raise TypeError("The expression should return a boolean array")
+            return expr.where(self)
+            # raise TypeError("This array is a NDField; use a structured NDArray for bool expressions")
 
         # Check if the key is in the last read cache
         inmutable_key = make_key_hashable(key)
diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py
@@ -60,7 +60,7 @@ class CParams:
     splitmode: :class:`SplitMode`
         The split mode for the blocks.
         The default value is :py:obj:`SplitMode.AUTO_SPLIT <SplitMode>`.
-    filters: :class:`Filter` or int list
+    filters: :class:`Filter` or int list or None
         The sequence of filters. Default: [:py:obj:`Filter.NOFILTER <Filter>`,
         :py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.NOFILTER <Filter>`,
         :py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.SHUFFLE <Filter>`].
@@ -99,7 +99,8 @@ def __post_init__(self):
             raise ValueError("Number of filters exceeds 6")
         if len(self.filters) < len(self.filters_meta):
             self.filters_meta = self.filters_meta[: len(self.filters)]
-            warnings.warn("Changed `filters_meta` length to match `filters` length")
+            # There is no need to raise a warning here
+            # warnings.warn("Changed `filters_meta` length to match `filters` length")
         if len(self.filters) > len(self.filters_meta):
             raise ValueError("Number of filters cannot exceed number of filters meta")
 
diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py
@@ -500,3 +500,24 @@ def test_iter(shape, chunks, blocks):
         np.testing.assert_equal(a, b)
         assert a.dtype == b.dtype
     assert _i == shape[0] - 1
+
+
+def test_col_reduction():
+    N = 1000
+    rng = np.random.default_rng()
+    it = ((-x + 1, x - 2, rng.normal()) for x in range(N))
+    sa = blosc2.fromiter(
+        it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w"
+    )
+
+    # Sum the positive values of column C, selected via a boolean mask and via a string expression
+    C = sa.fields["C"]
+    s = blosc2.sum(C[C > 0])
+    s2 = blosc2.sum(C["C > 0"])
+
+    # Check both sums against the same selection and reduction done with NumPy
+    nsa = sa[:]
+    nC = nsa["C"]
+    ns = np.sum(nC[nC > 0])
+    np.testing.assert_allclose(s, ns)
+    np.testing.assert_allclose(s2, ns)