Skip to content

Commit bc153c5

Browse files
Add support for new pandas UDF engine
1 parent fe75347 commit bc153c5

File tree

2 files changed

+172
-0
lines changed

2 files changed

+172
-0
lines changed

src/blosc2/proxy.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,3 +676,50 @@ def wrapper(*args, **func_kwargs):
676676
return decorator
677677
else:
678678
return decorator(func)
679+
680+
681+
class PandasUdfEngine:
    """
    Entry points (``map`` and ``apply``) for running a user function on
    NumPy-backed data through a JIT decorator.

    Both entry points hand the *whole* array to the decorated function in a
    single call; they never iterate over elements, rows or columns.
    """

    @staticmethod
    def _ensure_numpy_data(data):
        """
        Return ``data`` as a NumPy array.

        NumPy arrays are returned unchanged. For any other object, its
        ``.values`` attribute is returned as-is (no further type check is
        performed on it).

        Raises
        ------
        ValueError
            If ``data`` is not a NumPy array and has no ``.values`` attribute.
        """
        if isinstance(data, np.ndarray):
            return data
        try:
            return data.values
        except AttributeError as err:
            # Report the type of the offending *instance*; ``data.__name__``
            # does not exist on instances and the message must be an f-string
            # for the placeholder to be interpolated.
            raise ValueError(
                f"blosc2.jit received an object of type {type(data).__name__}, which is not supported. "
                "Try casting your Series or DataFrame to a NumPy dtype."
            ) from err

    @classmethod
    def map(cls, data, func, args, kwargs, decorator, skip_na):
        """
        JIT a NumPy array element-wise. In the case of Blosc2, functions are
        expected to be vectorized NumPy operations, so the function is called
        with the NumPy array as the function parameter, instead of calling the
        function once for each element.

        Parameters
        ----------
        data : np.ndarray or object exposing ``.values``
            Input data; converted with ``_ensure_numpy_data``.
        func : callable
            Function to decorate and call as ``func(data, *args, **kwargs)``.
        args : tuple
            Extra positional arguments forwarded to ``func``.
        kwargs : dict
            Extra keyword arguments forwarded to ``func``.
        decorator : callable
            Decorator applied to ``func`` before it is called.
        skip_na
            Accepted as part of the signature but not used by this engine.

        Returns
        -------
        Whatever the decorated function returns.
        """
        array = cls._ensure_numpy_data(data)
        jitted = decorator(func)
        return jitted(array, *args, **kwargs)

    @classmethod
    def apply(cls, data, func, args, kwargs, decorator, axis):
        """
        JIT a NumPy array by column or row. In the case of Blosc2, functions are
        expected to be vectorized NumPy operations, so the function is called
        with the NumPy array as the function parameter, instead of calling the
        function once for each column or row.

        Parameters
        ----------
        data : np.ndarray or object exposing ``.values``
            Input data; converted with ``_ensure_numpy_data``.
        func : callable
            Function to decorate and call as ``func(data, *args, **kwargs)``.
        args : tuple
            Extra positional arguments forwarded to ``func``.
        kwargs : dict
            Extra keyword arguments forwarded to ``func``.
        decorator : callable
            Decorator applied to ``func`` before it is called.
        axis
            Accepted as part of the signature but not used by this engine.

        Returns
        -------
        Whatever the decorated function returns.

        Raises
        ------
        NotImplementedError
            If the data does not have exactly 1 or 2 dimensions.
        """
        array = cls._ensure_numpy_data(data)
        # Validate dimensionality before doing any decoration work.
        if array.ndim not in (1, 2):
            raise NotImplementedError(
                "The blosc2 engine only supports data with 1 or 2 dimensions. "
                f"A NumPy array with {array.ndim} dimensions has been received."
            )
        jitted = decorator(func)
        return jitted(array, *args, **kwargs)
723+
724+
725+
# Attach the engine class to the ``jit`` decorator as its ``__pandas_udf__`` attribute.
# NOTE(review): presumably pandas discovers the engine through this attribute when
# ``jit`` is passed as an ``engine`` argument — confirm against the pandas docs.
jit.__pandas_udf__ = PandasUdfEngine

tests/test_pandas_udf_engine.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <[email protected]>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
9+
import numpy as np
10+
11+
import blosc2
12+
13+
14+
class TestPandasUDF:
    """Checks for the ``map`` and ``apply`` hooks exposed on ``blosc2.jit.__pandas_udf__``."""

    @staticmethod
    def _assert_matches(result, expected):
        """Assert ``result`` has the shape of ``expected`` and equals it element by element."""
        wanted = np.array(expected)
        assert result.shape == wanted.shape
        for position in np.ndindex(wanted.shape):
            assert result[position] == wanted[position]

    def test_map_1d(self):
        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        outcome = engine.map(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, skip_na=False)

        self._assert_matches(outcome, [2, 3])

    def test_map_1d_with_args(self):
        def add_numbers(x, num1, num2):
            return x + num1 + num2

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        outcome = engine.map(
            values, add_numbers, args=(10,), kwargs={"num2": 100}, decorator=blosc2.jit, skip_na=False
        )

        self._assert_matches(outcome, [111, 112])

    def test_map_2d(self):
        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([[1, 2], [3, 4]])

        outcome = engine.map(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, skip_na=False)

        self._assert_matches(outcome, [[2, 3], [4, 5]])

    def test_apply_1d(self):
        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        outcome = engine.apply(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, axis=0)

        self._assert_matches(outcome, [2, 3])

    def test_apply_1d_with_args(self):
        def add_numbers(x, num1, num2):
            return x + num1 + num2

        engine = blosc2.jit.__pandas_udf__
        values = np.array([1, 2])

        outcome = engine.apply(
            values, add_numbers, args=(10,), kwargs={"num2": 100}, decorator=blosc2.jit, axis=0
        )

        self._assert_matches(outcome, [111, 112])

    def test_apply_2d(self):
        def add_one(x):
            return x + 1

        engine = blosc2.jit.__pandas_udf__
        values = np.array([[1, 2], [3, 4]])

        outcome = engine.apply(values, add_one, args=(), kwargs={}, decorator=blosc2.jit, axis=0)

        self._assert_matches(outcome, [[2, 3], [4, 5]])

0 commit comments

Comments
 (0)