Skip to content

Commit e4d6c47

Browse files
committed
Support for col selects and reductions in one go
1 parent 6231691 commit e4d6c47

File tree

6 files changed

+55
-27
lines changed

6 files changed

+55
-27
lines changed

doc/reference/lazyarray.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
LazyArray
44
=========
55

6-
This is an API interface for evaluating an expression or a Python user defined function.
6+
This is an API for computing an expression or a Python user-defined function.
77

88
You can get an object following the LazyArray API with any of the following ways:
99

1010
* Any expression that involves one or more NDArray objects. e.g. ``a + b``, where ``a`` and ``b`` are NDArray objects (see `this tutorial <../getting_started/tutorials/03.lazyarray-expressions.html>`_).
1111
* Using the ``lazyexpr`` constructor.
1212
* Using the ``lazyudf`` constructor (see `a tutorial <../getting_started/tutorials/03.lazyarray-udf.html>`_).
1313

14-
The LazyArray object is a thin wrapper around the expression or user-defined function that allows for lazy evaluation. This means that the expression is not evaluated until the ``compute`` or ``__getitem__`` methods are called. The ``compute`` method will return a new NDArray object with the result of the expression evaluation. The ``__getitem__`` method will return an NumPy object instead.
14+
The LazyArray object is a thin wrapper around the expression or user-defined function that allows for lazy computation. This means that the expression is not computed until the ``compute`` or ``__getitem__`` methods are called. The ``compute`` method will return a new NDArray object with the result of the expression evaluation. The ``__getitem__`` method will return a NumPy object instead.
1515

1616
See the `LazyExpr`_ and `LazyUDF`_ sections for more information.
1717

@@ -38,7 +38,7 @@ LazyExpr
3838

3939
An expression like ``a + sum(b)``, where there is at least one NDArray object in operands ``a`` and ``b``, `returns a LazyExpr object <../getting_started/tutorials/03.lazyarray-expressions.html>`_. You can also get a LazyExpr object using the ``lazyexpr`` constructor (see below).
4040

41-
This object follows the `LazyArray`_ API for evaluation and storage.
41+
This object follows the `LazyArray`_ API for computation and storage.
4242

4343
.. currentmodule:: blosc2
4444

@@ -67,7 +67,7 @@ LazyUDF
6767

6868
For getting a LazyUDF object (which is LazyArray-compliant) from a user-defined Python function, you can use the lazyudf constructor below. See `a tutorial on how this works <../getting_started/tutorials/03.lazyarray-udf.html>`_.
6969

70-
This object follows the `LazyArray`_ API for evaluation, although storage is not supported yet.
70+
This object follows the `LazyArray`_ API for computation, although storage is not supported yet.
7171

7272
.. autosummary::
7373
:toctree: autofiles/lazyarray

src/blosc2/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1285,7 +1285,7 @@ def compute_chunks_blocks( # noqa: C901
12851285
blocks: tuple | list | None = None,
12861286
dtype: np.dtype = np.uint8,
12871287
**kwargs: dict,
1288-
) -> tuple[(int, int)]:
1288+
) -> tuple:
12891289
"""
12901290
Compute educated guesses for chunks and blocks of a :ref:`NDArray`.
12911291

src/blosc2/lazyexpr.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1442,16 +1442,15 @@ def reduce_slices( # noqa: C901
14421442
else:
14431443
# Apply the where condition (in result)
14441444
if len(where) == 2:
1445-
# x = chunk_operands["_where_x"]
1446-
# y = chunk_operands["_where_y"]
1447-
# result = np.where(result, x, y)
1448-
# numexpr is a bit faster than np.where, and we can fuse operations in this case
14491445
new_expr = f"where({expression}, _where_x, _where_y)"
14501446
result = ne.evaluate(new_expr, chunk_operands)
1447+
elif len(where) == 1:
1448+
result = ne.evaluate(expression, chunk_operands)
1449+
x = chunk_operands["_where_x"]
1450+
result = x[result]
14511451
else:
1452-
raise ValueError(
1453-
"A where condition with less than 2 params in combination with reductions"
1454-
" is not supported yet"
1452+
raise NotImplementedError(
1453+
"A where condition with no params in combination with reductions is not supported yet"
14551454
)
14561455

14571456
# Reduce the result

src/blosc2/ndarray.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ def sum(
384384
Axis or axes along which a sum is performed. By default, axis=None,
385385
sums all the elements of the input array. If axis is negative,
386386
it counts from the last to the first axis.
387-
dtype: np.dtype or str, optional
387+
dtype: np.dtype or list str, optional
388388
The type of the returned array and of the accumulator in which the
389389
elements are summed. The dtype of :paramref:`ndarr` is used by default unless it has
390390
an integer dtype of less precision than the default platform integer.
@@ -477,7 +477,7 @@ def std(
477477
axis: int or tuple of ints, optional
478478
Axis or axes along which the standard deviation is computed. By default, `axis=None`
479479
computes the standard deviation of the flattened array.
480-
dtype: np.dtype or str, optional
480+
dtype: np.dtype or list str, optional
481481
Type to use in computing the standard deviation. For integer inputs, the
482482
default is float32; for floating point inputs, it is the same as the input dtype.
483483
ddof: int, optional
@@ -1666,7 +1666,7 @@ def copy(self, dtype: np.dtype | str = None, **kwargs: Any) -> NDArray:
16661666
16671667
Parameters
16681668
----------
1669-
dtype: np.dtype or str
1669+
dtype: np.dtype or list str
16701670
The new array dtype. Default is `self.dtype`.
16711671
16721672
Other Parameters
@@ -2793,7 +2793,7 @@ def empty(shape: int | tuple | list, dtype: np.dtype | str | None = np.float64,
27932793
----------
27942794
shape: int, tuple or list
27952795
The shape for the final array.
2796-
dtype: np.dtype or str
2796+
dtype: np.dtype or list str
27972797
The data type of the array elements in NumPy format. Default is `np.float64`.
27982798
This will override the `typesize`
27992799
in the compression parameters if they are provided.
@@ -2965,7 +2965,7 @@ def full(
29652965
Default value to use for uninitialized portions of the array.
29662966
Its size will override the `typesize`
29672967
in the cparams if they are passed.
2968-
dtype: np.dtype or str
2968+
dtype: np.dtype or list str
29692969
The ndarray dtype in NumPy format. By default, this will
29702970
be taken from the :paramref:`fill_value`.
29712971
This will override the `typesize`
@@ -3059,7 +3059,7 @@ def arange(
30593059
The end value of the sequence.
30603060
step: int, float, complex or np.number
30613061
Spacing between values.
3062-
dtype: np.dtype or str
3062+
dtype: np.dtype or list str
30633063
The data type of the array elements in NumPy format. Default is `np.uint8`.
30643064
This will override the `typesize`
30653065
in the compression parameters if they are provided.
@@ -3152,7 +3152,7 @@ def linspace(start, stop, num=50, endpoint=True, dtype=np.float64, shape=None, c
31523152
Number of samples to generate.
31533153
endpoint: bool
31543154
If True, `stop` is the last sample. Otherwise, it is not included.
3155-
dtype: np.dtype or str
3155+
dtype: np.dtype or list str
31563156
The data type of the array elements in NumPy format. Default is `np.float64`.
31573157
shape: int, tuple or list
31583158
The shape of the final array. If None, the shape will be guessed from `num`.
@@ -3218,7 +3218,7 @@ def eye(N, M=None, k=0, dtype=np.float64, **kwargs: Any):
32183218
Index of the diagonal: 0 (the default) refers to the main diagonal,
32193219
a positive value refers to an upper diagonal, and a negative value
32203220
to a lower diagonal.
3221-
dtype: np.dtype or str
3221+
dtype: np.dtype or list str
32223222
The data type of the array elements in NumPy format. Default is `np.float64`.
32233223
32243224
Returns
@@ -3263,7 +3263,7 @@ def fromiter(iterable, shape, dtype, c_order=True, **kwargs) -> NDArray:
32633263
An iterable object providing data for the array.
32643264
shape: int, tuple or list
32653265
The shape of the final array.
3266-
dtype: np.dtype or str
3266+
dtype: np.dtype or list str
32673267
The data type of the array elements in NumPy format.
32683268
c_order: bool
32693269
Whether to store the array in C order (row-major) or insertion order.
@@ -3329,7 +3329,7 @@ def frombuffer(
33293329
The buffer of the data to populate the container.
33303330
shape: int, tuple or list
33313331
The shape for the final container.
3332-
dtype: np.dtype or str
3332+
dtype: np.dtype or list str
33333333
The ndarray dtype in NumPy format. Default is `np.uint8`.
33343334
This will override the `typesize`
33353335
in the cparams if they are passed.
@@ -3373,8 +3373,8 @@ def copy(array: NDArray, dtype: np.dtype | str = None, **kwargs: Any) -> NDArray
33733373
--------
33743374
>>> import numpy as np
33753375
>>> import blosc2
3376-
>>> # Create an instance of MyNDArray with some data
3377-
>>> original_array = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
3376+
>>> # Create an instance of NDArray with some data
3377+
>>> original_array = blosc2.asarray(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]))
33783378
>>> # Create a copy of the array without changing dtype
33793379
>>> copied_array = blosc2.copy(original_array)
33803380
>>> print("Copied array (default dtype):")
@@ -3727,7 +3727,14 @@ def __getitem__(self, key: int | slice | Sequence[slice]) -> np.ndarray:
37273727
return key.where(self)
37283728

37293729
if isinstance(key, str):
3730-
raise TypeError("This array is a NDField; use a structured NDArray for bool expressions")
3730+
# Try to compute the key as a boolean expression
3731+
# Operands will be a dict with all the fields in the NDArray
3732+
operands = {field: NDField(self.ndarr, field) for field in self.ndarr.dtype.names}
3733+
expr = blosc2.lazyexpr(key, operands)
3734+
if expr.dtype != np.bool_:
3735+
raise TypeError("The expression should return a boolean array")
3736+
return expr.where(self)
3737+
# raise TypeError("This array is a NDField; use a structured NDArray for bool expressions")
37313738

37323739
# Check if the key is in the last read cache
37333740
inmutable_key = make_key_hashable(key)

src/blosc2/storage.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ class CParams:
6060
splitmode: :class:`SplitMode`
6161
The split mode for the blocks.
6262
The default value is :py:obj:`SplitMode.AUTO_SPLIT <SplitMode>`.
63-
filters: :class:`Filter` or int list
63+
filters: :class:`Filter` or int list or None
6464
The sequence of filters. Default: [:py:obj:`Filter.NOFILTER <Filter>`,
6565
:py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.NOFILTER <Filter>`,
6666
:py:obj:`Filter.NOFILTER <Filter>`, :py:obj:`Filter.SHUFFLE <Filter>`].
@@ -99,7 +99,8 @@ def __post_init__(self):
9999
raise ValueError("Number of filters exceeds 6")
100100
if len(self.filters) < len(self.filters_meta):
101101
self.filters_meta = self.filters_meta[: len(self.filters)]
102-
warnings.warn("Changed `filters_meta` length to match `filters` length")
102+
# There is no need to raise a warning here
103+
# warnings.warn("Changed `filters_meta` length to match `filters` length")
103104
if len(self.filters) > len(self.filters_meta):
104105
raise ValueError("Number of filters cannot exceed number of filters meta")
105106

tests/ndarray/test_lazyexpr_fields.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,3 +500,24 @@ def test_iter(shape, chunks, blocks):
500500
np.testing.assert_equal(a, b)
501501
assert a.dtype == b.dtype
502502
assert _i == shape[0] - 1
503+
504+
505+
def test_col_reduction():
506+
N = 1000
507+
rng = np.random.default_rng()
508+
it = ((-x + 1, x - 2, rng.normal()) for x in range(N))
509+
sa = blosc2.fromiter(
510+
it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w"
511+
)
512+
513+
# The operations
514+
C = sa.fields["C"]
515+
s = blosc2.sum(C[C > 0])
516+
s2 = blosc2.sum(C["C > 0"])
517+
518+
# Check
519+
nsa = sa[:]
520+
nC = nsa["C"]
521+
ns = np.sum(nC[nC > 0])
522+
np.testing.assert_allclose(s, ns)
523+
np.testing.assert_allclose(s2, ns)

0 commit comments

Comments
 (0)