1
- #include " numpy/npy_math.h"
2
1
#include " simd/simd.h"
3
2
#include " loops_utils.h"
4
3
#include " loops.h"
@@ -31,7 +30,7 @@ namespace hn = hwy::HWY_NAMESPACE;
31
30
* elements or when there's no native FUSED support instead of fallback to libc
32
31
*/
33
32
34
- #if NPY_SIMD_FMA3 // native support
33
+ #if HWY_NATIVE_FMA // native support
35
34
typedef enum
36
35
{
37
36
SIMD_COMPUTE_SIN,
@@ -44,7 +43,7 @@ using vec_f32 = hn::Vec<decltype(f32)>;
44
43
using vec_s32 = hn::Vec<decltype (s32)>;
45
44
using opmask_t = hn::Mask<decltype (f32 )>;
46
45
47
- NPY_FINLINE HWY_ATTR vec_f32
46
+ HWY_INLINE HWY_ATTR vec_f32
48
47
simd_range_reduction_f32 (vec_f32& x, vec_f32& y, const vec_f32& c1, const vec_f32& c2, const vec_f32& c3)
49
48
{
50
49
vec_f32 reduced_x = hn::MulAdd (y, c1, x);
@@ -53,7 +52,7 @@ simd_range_reduction_f32(vec_f32& x, vec_f32& y, const vec_f32& c1, const vec_f3
53
52
return reduced_x;
54
53
}
55
54
56
- NPY_FINLINE HWY_ATTR vec_f32
55
+ HWY_INLINE HWY_ATTR vec_f32
57
56
simd_cosine_poly_f32 (vec_f32& x2)
58
57
{
59
58
const vec_f32 invf8 = hn::Set (f32 , 0x1 .98e616p-16f );
@@ -73,7 +72,7 @@ simd_cosine_poly_f32(vec_f32& x2)
73
72
* Maximum ULP across all 32-bit floats = 0.647
74
73
* Polynomial approximation based on unpublished work by T. Myklebust
75
74
*/
76
- NPY_FINLINE HWY_ATTR vec_f32
75
+ HWY_INLINE HWY_ATTR vec_f32
77
76
simd_sine_poly_f32 (vec_f32& x, vec_f32& x2)
78
77
{
79
78
const vec_f32 invf9 = hn::Set (f32 , 0x1 .7d3bbcp-19f );
@@ -89,26 +88,6 @@ simd_sine_poly_f32(vec_f32& x, vec_f32& x2)
89
88
return r;
90
89
}
91
90
92
- NPY_FINLINE HWY_ATTR vec_f32
93
- GatherIndexN (const float * src, npy_intp ssrc, npy_intp len)
94
- {
95
- float temp[hn::Lanes (f32 )] = { 0 .0f };
96
- for (auto ii = 0 ; ii < std::min (len, (npy_intp)hn::Lanes (f32 )); ++ii) {
97
- temp[ii] = src[ii * ssrc];
98
- }
99
- return hn::LoadU (f32 , temp);
100
- }
101
-
102
- NPY_FINLINE HWY_ATTR void
103
- ScatterIndexN (vec_f32 vec, float * dst, npy_intp sdst, npy_intp len)
104
- {
105
- float temp[hn::Lanes (f32 )];
106
- hn::StoreU (vec, f32 , temp);
107
- for (auto ii = 0 ; ii < std::min (len, (npy_intp)hn::Lanes (f32 )); ++ii) {
108
- dst[ii * sdst] = temp[ii];
109
- }
110
- }
111
-
112
91
static void HWY_ATTR SIMD_MSVC_NOINLINE
113
92
simd_sincos_f32 (const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
114
93
npy_intp len, SIMD_TRIG_OP trig_op)
@@ -130,23 +109,15 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
130
109
const vec_f32 max_cody = hn::Set (f32 , max_codi);
131
110
132
111
const int lanes = hn::Lanes (f32 );
133
- // npy_intp load_index[lanes/2];
134
- // for (auto i = 0; i < lanes; ++i) {
135
- // load_index[i] = i * ssrc;
136
- // }
137
- // vec_s32 vec_lindex = hn::LoadU(s32, load_index);
138
- // npy_intp store_index[lanes/2];
139
- // for (auto i = 0; i < lanes; ++i) {
140
- // store_index[i] = i * sdst;
141
- // }
142
- // vec_s32 vec_sindex = hn::LoadU(s32, store_index);
112
+ const vec_s32 src_index = hn::Mul (hn::Iota (s32, 0 ), hn::Set (s32, ssrc));
113
+ const vec_s32 dst_index = hn::Mul (hn::Iota (s32, 0 ), hn::Set (s32, sdst));
143
114
144
115
for (; len > 0 ; len -= lanes, src += ssrc*lanes, dst += sdst*lanes) {
145
116
vec_f32 x_in;
146
117
if (ssrc == 1 ) {
147
118
x_in = hn::LoadN (f32 , src, len);
148
119
} else {
149
- x_in = GatherIndexN (src, ssrc , len);
120
+ x_in = hn:: GatherIndexN (f32 , src, src_index , len);
150
121
}
151
122
opmask_t nnan_mask = hn::Not (hn::IsNaN (x_in));
152
123
// Eliminate NaN to avoid FP invalid exception
@@ -191,7 +162,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
191
162
if (sdst == 1 ) {
192
163
hn::StoreN (cos, f32 , dst, len);
193
164
} else {
194
- ScatterIndexN (cos, dst, sdst , len);
165
+ hn:: ScatterIndexN (cos, f32 , dst, dst_index , len);
195
166
}
196
167
}
197
168
if (!hn::AllTrue (f32 , simd_mask)) {
@@ -221,7 +192,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
221
192
npyv_cleanup ();
222
193
}
223
194
}
224
- #endif // NPY_SIMD_FMA3
195
+ #endif // HWY_NATIVE_FMA
225
196
226
197
/* Disable SIMD code sin/cos f64 and revert to libm: see
227
198
* https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
@@ -242,7 +213,7 @@ DISPATCH_DOUBLE_FUNC(cos)
242
213
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
243
214
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (data))
244
215
{
245
- #if NPY_SIMD_F32 && NPY_SIMD_FMA3
216
+ #if HWY_NATIVE_FMA
246
217
const npy_float *src = (npy_float*)args[0 ];
247
218
npy_float *dst = (npy_float*)args[1 ];
248
219
@@ -271,7 +242,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
271
242
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX (FLOAT_cos)
272
243
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (data))
273
244
{
274
- #if NPY_SIMD_F32 && NPY_SIMD_FMA3
245
+ #if HWY_NATIVE_FMA
275
246
const npy_float *src = (npy_float*)args[0 ];
276
247
npy_float *dst = (npy_float*)args[1 ];
277
248
0 commit comments