Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/blosc2/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,3 +676,50 @@ def wrapper(*args, **func_kwargs):
return decorator
else:
return decorator(func)


class PandasUdfEngine:
    """Engine object exposing ``map`` and ``apply`` entry points for JIT-ed UDFs.

    Both entry points convert the incoming data to a NumPy array, wrap the
    user function with the supplied ``decorator`` and call it once with the
    whole array.
    """

    @staticmethod
    def _ensure_numpy_data(data):
        """Return ``data`` as a NumPy array.

        Parameters
        ----------
        data
            Either a ``np.ndarray`` (returned unchanged) or an object
            exposing a ``.values`` attribute, whose value is returned.

        Raises
        ------
        ValueError
            If ``data`` is not a NumPy array and has no ``.values`` attribute.
        """
        if isinstance(data, np.ndarray):
            return data
        try:
            return data.values
        except AttributeError as err:
            # Report the *type* name: instances generally have no ``__name__``,
            # and the message must be an f-string for the placeholder to expand.
            raise ValueError(
                f"blosc2.jit received an object of type {type(data).__name__}, "
                "which is not supported. "
                "Try casting your Series or DataFrame to a NumPy dtype."
            ) from err

    @classmethod
    def map(cls, data, func, args, kwargs, decorator, skip_na):
        """
        JIT a NumPy array element-wise. In the case of Blosc2, functions are
        expected to be vectorized NumPy operations, so the function is called
        with the NumPy array as the function parameter, instead of calling the
        function once for each element.

        Parameters
        ----------
        data
            NumPy array, or object with a ``.values`` attribute.
        func
            Callable receiving the array followed by ``*args`` / ``**kwargs``.
        args, kwargs
            Extra positional and keyword arguments forwarded to ``func``.
        decorator
            Callable applied to ``func`` before it is invoked.
        skip_na
            Accepted as part of the signature; not used by this implementation.

        Returns
        -------
        Whatever the decorated ``func`` returns.
        """
        data = cls._ensure_numpy_data(data)
        func = decorator(func)
        return func(data, *args, **kwargs)

    @classmethod
    def apply(cls, data, func, args, kwargs, decorator, axis):
        """
        JIT a NumPy array by column or row. In the case of Blosc2, functions are
        expected to be vectorized NumPy operations, so the function is called
        with the NumPy array as the function parameter, instead of calling the
        function once for each column or row.

        Parameters
        ----------
        data
            NumPy array, or object with a ``.values`` attribute.
        func
            Callable receiving the array followed by ``*args`` / ``**kwargs``.
        args, kwargs
            Extra positional and keyword arguments forwarded to ``func``.
        decorator
            Callable applied to ``func`` before it is invoked.
        axis
            Accepted as part of the signature; not used by this implementation.

        Returns
        -------
        Whatever the decorated ``func`` returns.

        Raises
        ------
        NotImplementedError
            If the array does not have exactly 1 or 2 dimensions.
        """
        data = cls._ensure_numpy_data(data)
        func = decorator(func)
        # NOTE(review): a reviewer pointed out that Blosc2 itself should handle
        # arrays of up to 8 dimensions, and the author replied that pandas is
        # not expected to send data with more than 2 dimensions, so this guard
        # is purely defensive and was reported as removed in a later revision.
        if data.ndim not in (1, 2):
            raise NotImplementedError(
                "The blosc2 engine only supports data with 1 or 2 dimensions. "
                f"A NumPy array with {data.ndim} dimensions has been received."
            )
        return func(data, *args, **kwargs)


# Attach the engine class to ``jit`` under the ``__pandas_udf__`` attribute so
# it is reachable as ``blosc2.jit.__pandas_udf__`` (presumably the hook pandas
# looks up when ``jit`` is passed as a UDF engine -- confirm against pandas).
jit.__pandas_udf__ = PandasUdfEngine
125 changes: 125 additions & 0 deletions tests/test_pandas_udf_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <[email protected]>
# All rights reserved.
#
# This source code is licensed under a BSD-style license (found in the
# LICENSE file in the root directory of this source tree)
#######################################################################

import numpy as np

import blosc2


class TestPandasUDF:
    """Checks for the engine exposed as ``blosc2.jit.__pandas_udf__``."""

    def test_map_1d(self):
        """``map`` on a 1-D array with no extra arguments."""

        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        result = engine.map(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, skip_na=False)

        assert result.shape == (2,)
        for position, expected in enumerate((2, 3)):
            assert result[position] == expected

    def test_map_1d_with_args(self):
        """``map`` forwards both positional and keyword extras to the function."""

        def add_numbers(x, num1, num2):
            return x + num1 + num2

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        result = engine.map(
            values, add_numbers, args=(10,), kwargs={"num2": 100}, decorator=blosc2.jit, skip_na=False
        )

        assert result.shape == (2,)
        for position, expected in enumerate((111, 112)):
            assert result[position] == expected

    def test_map_2d(self):
        """``map`` on a 2-D array keeps the shape and adds one everywhere."""

        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([[1, 2], [3, 4]])

        result = engine.map(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, skip_na=False)

        assert result.shape == (2, 2)
        expected_cells = {(0, 0): 2, (0, 1): 3, (1, 0): 4, (1, 1): 5}
        for (row, col), expected in expected_cells.items():
            assert result[row, col] == expected

    def test_apply_1d(self):
        """``apply`` on a 1-D array with no extra arguments."""

        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        result = engine.apply(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, axis=0)

        assert result.shape == (2,)
        for position, expected in enumerate((2, 3)):
            assert result[position] == expected

    def test_apply_1d_with_args(self):
        """``apply`` forwards both positional and keyword extras to the function."""

        def add_numbers(x, num1, num2):
            return x + num1 + num2

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        result = engine.apply(
            values, add_numbers, args=(10,), kwargs={"num2": 100}, decorator=blosc2.jit, axis=0
        )

        assert result.shape == (2,)
        for position, expected in enumerate((111, 112)):
            assert result[position] == expected

    def test_apply_2d(self):
        """``apply`` on a 2-D array keeps the shape and adds one everywhere."""

        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([[1, 2], [3, 4]])

        result = engine.apply(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, axis=0)

        assert result.shape == (2, 2)
        expected_cells = {(0, 0): 2, (0, 1): 3, (1, 0): 4, (1, 1): 5}
        for (row, col), expected in expected_cells.items():
            assert result[row, col] == expected