Skip to content

Commit aa0cc04

Browse files
authored
Merge pull request numpy#26280 from pijyoi/speedup-float-clip
ENH: Speedup clip for floating point
2 parents 5b7b91c + 9c8ed4e commit aa0cc04

File tree

2 files changed

+115
-28
lines changed

2 files changed

+115
-28
lines changed

benchmarks/benchmarks/bench_clip.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from .common import Benchmark
2+
3+
import numpy as np
4+
5+
6+
class ClipFloat(Benchmark):
7+
param_names = ["dtype", "size"]
8+
params = [
9+
[np.float32, np.float64, np.longdouble],
10+
[100, 100_000]
11+
]
12+
13+
def setup(self, dtype, size):
14+
rng = np.random.default_rng()
15+
self.array = rng.random(size=size).astype(dtype)
16+
self.dataout = np.full_like(self.array, 0.5)
17+
18+
def time_clip(self, dtype, size):
19+
np.clip(self.array, 0.125, 0.875, self.dataout)
20+
21+
22+
class ClipInteger(Benchmark):
23+
param_names = ["dtype", "size"]
24+
params = [
25+
[np.int32, np.int64],
26+
[100, 100_000]
27+
]
28+
29+
def setup(self, dtype, size):
30+
rng = np.random.default_rng()
31+
self.array = rng.integers(256, size=size, dtype=dtype)
32+
self.dataout = np.full_like(self.array, 128)
33+
34+
def time_clip(self, dtype, size):
35+
np.clip(self.array, 32, 224, self.dataout)

numpy/_core/src/umath/clip.cpp

Lines changed: 80 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
/**
22
* This module provides the inner loops for the clip ufunc
33
*/
4+
#include <type_traits>
5+
46
#define _UMATHMODULE
57
#define _MULTIARRAYMODULE
68
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
@@ -150,50 +152,100 @@ _NPY_CLIP(T x, T min, T max)
150152
return _NPY_MIN<Tag>(_NPY_MAX<Tag>((x), (min)), (max));
151153
}
152154

153-
template <class Tag, class T = typename Tag::type>
154-
static void
155-
_npy_clip_(T **args, npy_intp const *dimensions, npy_intp const *steps)
156-
{
157-
npy_intp n = dimensions[0];
158-
if (steps[1] == 0 && steps[2] == 0) {
159-
/* min and max are constant throughout the loop, the most common case
160-
*/
161-
/* NOTE: it may be possible to optimize these checks for nan */
162-
T min_val = *args[1];
163-
T max_val = *args[2];
155+
template <class Tag, class T>
156+
static inline void
157+
_npy_clip_const_minmax_(
158+
char *ip, npy_intp is, char *op, npy_intp os, npy_intp n, T min_val, T max_val,
159+
std::false_type /* non-floating point */
160+
)
161+
{
162+
/* contiguous, branch to let the compiler optimize */
163+
if (is == sizeof(T) && os == sizeof(T)) {
164+
for (npy_intp i = 0; i < n; i++, ip += sizeof(T), op += sizeof(T)) {
165+
*(T *)op = _NPY_CLIP<Tag>(*(T *)ip, min_val, max_val);
166+
}
167+
}
168+
else {
169+
for (npy_intp i = 0; i < n; i++, ip += is, op += os) {
170+
*(T *)op = _NPY_CLIP<Tag>(*(T *)ip, min_val, max_val);
171+
}
172+
}
173+
}
164174

165-
T *ip1 = args[0], *op1 = args[3];
166-
npy_intp is1 = steps[0] / sizeof(T), os1 = steps[3] / sizeof(T);
175+
template <class Tag, class T>
176+
static inline void
177+
_npy_clip_const_minmax_(
178+
char *ip, npy_intp is, char *op, npy_intp os, npy_intp n, T min_val, T max_val,
179+
std::true_type /* floating point */
180+
)
181+
{
182+
if (!npy_isnan(min_val) && !npy_isnan(max_val)) {
183+
/*
184+
* The min/max_val are not NaN so the comparison below will
185+
* propagate NaNs in the input without further NaN checks.
186+
*/
167187

168188
/* contiguous, branch to let the compiler optimize */
169-
if (is1 == 1 && os1 == 1) {
170-
for (npy_intp i = 0; i < n; i++, ip1++, op1++) {
171-
*op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val);
189+
if (is == sizeof(T) && os == sizeof(T)) {
190+
for (npy_intp i = 0; i < n; i++, ip += sizeof(T), op += sizeof(T)) {
191+
T x = *(T *)ip;
192+
if (x < min_val) {
193+
x = min_val;
194+
}
195+
if (x > max_val) {
196+
x = max_val;
197+
}
198+
*(T *)op = x;
172199
}
173200
}
174201
else {
175-
for (npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
176-
*op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val);
202+
for (npy_intp i = 0; i < n; i++, ip += is, op += os) {
203+
T x = *(T *)ip;
204+
if (x < min_val) {
205+
x = min_val;
206+
}
207+
if (x > max_val) {
208+
x = max_val;
209+
}
210+
*(T *)op = x;
177211
}
178212
}
179213
}
180214
else {
181-
T *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];
182-
npy_intp is1 = steps[0] / sizeof(T), is2 = steps[1] / sizeof(T),
183-
is3 = steps[2] / sizeof(T), os1 = steps[3] / sizeof(T);
184-
for (npy_intp i = 0; i < n;
185-
i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
186-
*op1 = _NPY_CLIP<Tag>(*ip1, *ip2, *ip3);
215+
/* min_val and/or max_val are nans */
216+
T x = npy_isnan(min_val) ? min_val : max_val;
217+
for (npy_intp i = 0; i < n; i++, op += os) {
218+
*(T *)op = x;
219+
}
187220
}
188-
npy_clear_floatstatus_barrier((char *)dimensions);
189221
}
190222

191-
template <class Tag>
223+
template <class Tag, class T = typename Tag::type>
192224
static void
193225
_npy_clip(char **args, npy_intp const *dimensions, npy_intp const *steps)
194226
{
195-
using T = typename Tag::type;
196-
return _npy_clip_<Tag>((T **)args, dimensions, steps);
227+
npy_intp n = dimensions[0];
228+
if (steps[1] == 0 && steps[2] == 0) {
229+
/* min and max are constant throughout the loop, the most common case */
230+
T min_val = *(T *)args[1];
231+
T max_val = *(T *)args[2];
232+
233+
_npy_clip_const_minmax_<Tag, T>(
234+
args[0], steps[0], args[3], steps[3], n, min_val, max_val,
235+
std::is_base_of<npy::floating_point_tag, Tag>{}
236+
);
237+
}
238+
else {
239+
char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];
240+
npy_intp is1 = steps[0], is2 = steps[1],
241+
is3 = steps[2], os1 = steps[3];
242+
for (npy_intp i = 0; i < n;
243+
i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
244+
{
245+
*(T *)op1 = _NPY_CLIP<Tag>(*(T *)ip1, *(T *)ip2, *(T *)ip3);
246+
}
247+
}
248+
npy_clear_floatstatus_barrier((char *)dimensions);
197249
}
198250

199251
extern "C" {

0 commit comments

Comments
 (0)