|
1 | 1 | /**
|
2 | 2 | * This module provides the inner loops for the clip ufunc
|
3 | 3 | */
|
| 4 | +#include <type_traits> |
| 5 | + |
4 | 6 | #define _UMATHMODULE
|
5 | 7 | #define _MULTIARRAYMODULE
|
6 | 8 | #define NPY_NO_DEPRECATED_API NPY_API_VERSION
|
@@ -150,50 +152,100 @@ _NPY_CLIP(T x, T min, T max)
|
150 | 152 | return _NPY_MIN<Tag>(_NPY_MAX<Tag>((x), (min)), (max));
|
151 | 153 | }
|
152 | 154 |
|
153 |
| -template <class Tag, class T = typename Tag::type> |
154 |
| -static void |
155 |
| -_npy_clip_(T **args, npy_intp const *dimensions, npy_intp const *steps) |
156 |
| -{ |
157 |
| - npy_intp n = dimensions[0]; |
158 |
| - if (steps[1] == 0 && steps[2] == 0) { |
159 |
| - /* min and max are constant throughout the loop, the most common case |
160 |
| - */ |
161 |
| - /* NOTE: it may be possible to optimize these checks for nan */ |
162 |
| - T min_val = *args[1]; |
163 |
| - T max_val = *args[2]; |
| 155 | +template <class Tag, class T> |
| 156 | +static inline void |
| 157 | +_npy_clip_const_minmax_( |
| 158 | + char *ip, npy_intp is, char *op, npy_intp os, npy_intp n, T min_val, T max_val, |
| 159 | + std::false_type /* non-floating point */ |
| 160 | +) |
| 161 | +{ |
| 162 | + /* contiguous, branch to let the compiler optimize */ |
| 163 | + if (is == sizeof(T) && os == sizeof(T)) { |
| 164 | + for (npy_intp i = 0; i < n; i++, ip += sizeof(T), op += sizeof(T)) { |
| 165 | + *(T *)op = _NPY_CLIP<Tag>(*(T *)ip, min_val, max_val); |
| 166 | + } |
| 167 | + } |
| 168 | + else { |
| 169 | + for (npy_intp i = 0; i < n; i++, ip += is, op += os) { |
| 170 | + *(T *)op = _NPY_CLIP<Tag>(*(T *)ip, min_val, max_val); |
| 171 | + } |
| 172 | + } |
| 173 | +} |
164 | 174 |
|
165 |
| - T *ip1 = args[0], *op1 = args[3]; |
166 |
| - npy_intp is1 = steps[0] / sizeof(T), os1 = steps[3] / sizeof(T); |
| 175 | +template <class Tag, class T> |
| 176 | +static inline void |
| 177 | +_npy_clip_const_minmax_( |
| 178 | + char *ip, npy_intp is, char *op, npy_intp os, npy_intp n, T min_val, T max_val, |
| 179 | + std::true_type /* floating point */ |
| 180 | +) |
| 181 | +{ |
| 182 | + if (!npy_isnan(min_val) && !npy_isnan(max_val)) { |
| 183 | + /* |
| 184 | + * The min/max_val are not NaN so the comparison below will |
| 185 | + * propagate NaNs in the input without further NaN checks. |
| 186 | + */ |
167 | 187 |
|
168 | 188 | /* contiguous, branch to let the compiler optimize */
|
169 |
| - if (is1 == 1 && os1 == 1) { |
170 |
| - for (npy_intp i = 0; i < n; i++, ip1++, op1++) { |
171 |
| - *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val); |
| 189 | + if (is == sizeof(T) && os == sizeof(T)) { |
| 190 | + for (npy_intp i = 0; i < n; i++, ip += sizeof(T), op += sizeof(T)) { |
| 191 | + T x = *(T *)ip; |
| 192 | + if (x < min_val) { |
| 193 | + x = min_val; |
| 194 | + } |
| 195 | + if (x > max_val) { |
| 196 | + x = max_val; |
| 197 | + } |
| 198 | + *(T *)op = x; |
172 | 199 | }
|
173 | 200 | }
|
174 | 201 | else {
|
175 |
| - for (npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) { |
176 |
| - *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val); |
| 202 | + for (npy_intp i = 0; i < n; i++, ip += is, op += os) { |
| 203 | + T x = *(T *)ip; |
| 204 | + if (x < min_val) { |
| 205 | + x = min_val; |
| 206 | + } |
| 207 | + if (x > max_val) { |
| 208 | + x = max_val; |
| 209 | + } |
| 210 | + *(T *)op = x; |
177 | 211 | }
|
178 | 212 | }
|
179 | 213 | }
|
180 | 214 | else {
|
181 |
| - T *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3]; |
182 |
| - npy_intp is1 = steps[0] / sizeof(T), is2 = steps[1] / sizeof(T), |
183 |
| - is3 = steps[2] / sizeof(T), os1 = steps[3] / sizeof(T); |
184 |
| - for (npy_intp i = 0; i < n; |
185 |
| - i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) |
186 |
| - *op1 = _NPY_CLIP<Tag>(*ip1, *ip2, *ip3); |
| 215 | + /* min_val and/or max_val are nans */ |
| 216 | + T x = npy_isnan(min_val) ? min_val : max_val; |
| 217 | + for (npy_intp i = 0; i < n; i++, op += os) { |
| 218 | + *(T *)op = x; |
| 219 | + } |
187 | 220 | }
|
188 |
| - npy_clear_floatstatus_barrier((char *)dimensions); |
189 | 221 | }
|
190 | 222 |
|
191 |
| -template <class Tag> |
| 223 | +template <class Tag, class T = typename Tag::type> |
192 | 224 | static void
|
193 | 225 | _npy_clip(char **args, npy_intp const *dimensions, npy_intp const *steps)
|
194 | 226 | {
|
195 |
| - using T = typename Tag::type; |
196 |
| - return _npy_clip_<Tag>((T **)args, dimensions, steps); |
| 227 | + npy_intp n = dimensions[0]; |
| 228 | + if (steps[1] == 0 && steps[2] == 0) { |
| 229 | + /* min and max are constant throughout the loop, the most common case */ |
| 230 | + T min_val = *(T *)args[1]; |
| 231 | + T max_val = *(T *)args[2]; |
| 232 | + |
| 233 | + _npy_clip_const_minmax_<Tag, T>( |
| 234 | + args[0], steps[0], args[3], steps[3], n, min_val, max_val, |
| 235 | + std::is_base_of<npy::floating_point_tag, Tag>{} |
| 236 | + ); |
| 237 | + } |
| 238 | + else { |
| 239 | + char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3]; |
| 240 | + npy_intp is1 = steps[0], is2 = steps[1], |
| 241 | + is3 = steps[2], os1 = steps[3]; |
| 242 | + for (npy_intp i = 0; i < n; |
| 243 | + i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) |
| 244 | + { |
| 245 | + *(T *)op1 = _NPY_CLIP<Tag>(*(T *)ip1, *(T *)ip2, *(T *)ip3); |
| 246 | + } |
| 247 | + } |
| 248 | + npy_clear_floatstatus_barrier((char *)dimensions); |
197 | 249 | }
|
198 | 250 |
|
199 | 251 | extern "C" {
|
|
0 commit comments