Skip to content

Commit 6ccad06

Browse files
howjmayseiko2plus
authored and committed
ENH: Implement SIMD for ceil
1 parent 4149a37 commit 6ccad06

File tree

7 files changed

+88
-2
lines changed

7 files changed

+88
-2
lines changed

numpy/core/src/_simd/_simd.dispatch.c.src

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
381381
***************************/
382382
#if @fp_only@
383383
/**begin repeat1
384-
* #intrin = sqrt, recip, abs, square#
384+
* #intrin = sqrt, recip, abs, square, ceil#
385385
*/
386386
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
387387
/**end repeat1**/
@@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@)
615615
***************************/
616616
#if @fp_only@
617617
/**begin repeat1
618-
* #intrin = sqrt, recip, abs, square#
618+
* #intrin = sqrt, recip, abs, square, ceil#
619619
*/
620620
SIMD_INTRIN_DEF(@intrin@_@sfx@)
621621
/**end repeat1**/

numpy/core/src/common/simd/avx2/math.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
105105
return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b));
106106
}
107107

108+
// ceil: round every lane toward +infinity.
// Both names are plain aliases of the 256-bit AVX ceil intrinsics, so they
// take and return the same vector types as those intrinsics.
109+
#define npyv_ceil_f32 _mm256_ceil_ps
110+
#define npyv_ceil_f64 _mm256_ceil_pd
111+
108112
#endif // _NPY_SIMD_AVX2_MATH_H

numpy/core/src/common/simd/avx512/math.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
112112
#define npyv_min_u64 _mm512_min_epu64
113113
#define npyv_min_s64 _mm512_min_epi64
114114

115+
// ceil: round every lane toward +infinity.
// Implemented with the 512-bit roundscale intrinsics, passing
// _MM_FROUND_TO_POS_INF as the rounding-control immediate.
116+
#define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF)
117+
#define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF)
118+
115119
#endif // _NPY_SIMD_AVX512_MATH_H

numpy/core/src/common/simd/neon/math.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,4 +153,20 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
153153
return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
154154
}
155155

156+
// ceil: round every lane toward +infinity
#ifdef NPY_HAVE_ASIMD
    // native round-toward-plus-infinity instruction
    #define npyv_ceil_f32 vrndpq_f32
#else
    /*
     * Emulated single-precision ceil for targets without ASIMD.
     *
     * The input is truncated toward zero through an int32 round trip and
     * bumped by one in the lanes where truncation landed below the input.
     *
     * Lanes that cannot (or need not) take that path are returned unchanged:
     *  - |a| >= 2^23: such floats are already integral, and this also keeps
     *    values outside the int32 range away from the saturating conversion;
     *  - NaN and +/-inf: the "|a| < 2^23" comparison is false for them.
     *
     * The sign bit of the input is copied onto the result so that inputs in
     * (-1, 0) yield -0.0 instead of +0.0.
     */
    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
    {
        const npyv_u32 sign_mask    = vdupq_n_u32(0x80000000u);
        const npyv_f32 one          = vdupq_n_f32(1.0f);
        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0f);

        // truncate toward zero
        npyv_f32 conv_trunc = vcvtq_f32_s32(vcvtq_s32_f32(a));
        npyv_f32 conv_trunc_add_one = npyv_add_f32(conv_trunc, one);
        // lanes where truncation rounded down, i.e. positive non-integers
        npyv_u32 below = vcltq_f32(conv_trunc, a);
        // vbslq picks its 2nd operand where the mask is set, the 3rd elsewhere
        npyv_f32 ceil = vbslq_f32(below, conv_trunc_add_one, conv_trunc);
        // respect signed zero, e.g. -0.5 -> -0.0
        npyv_f32 rzero = vreinterpretq_f32_u32(vorrq_u32(
            vreinterpretq_u32_f32(ceil),
            vandq_u32(vreinterpretq_u32_f32(a), sign_mask)
        ));
        // only trust the emulation where the int32 round trip is exact;
        // NaN compares false here and therefore falls through to `a`
        npyv_u32 in_range = vcltq_f32(vabsq_f32(a), two_power_23);
        return vbslq_f32(in_range, rzero, a);
    }
#endif
#if NPY_SIMD_F64
    // double precision is only provided when NPY_SIMD_F64 is set
    #define npyv_ceil_f64 vrndpq_f64
#endif // NPY_SIMD_F64
171+
156172
#endif // _NPY_SIMD_NEON_MATH_H

numpy/core/src/common/simd/sse/math.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,4 +143,35 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
143143
return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
144144
}
145145

146+
// ceil
147+
#ifdef NPY_HAVE_SSE41
148+
#define npyv_ceil_f32 _mm_ceil_ps
149+
#define npyv_ceil_f64 _mm_ceil_pd
150+
#else
151+
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
152+
{
153+
const npyv_f32 szero = _mm_set1_ps(-0.0f);
154+
const npyv_f32 one = _mm_set1_ps(1.0f);
155+
npyv_s32 roundi = _mm_cvttps_epi32(a);
156+
npyv_f32 round = _mm_cvtepi32_ps(roundi);
157+
npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
158+
// respect signed zero, e.g. -0.5 -> -0.0
159+
npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
160+
// if overflow return a
161+
return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
162+
}
163+
NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
164+
{
165+
const npyv_f64 szero = _mm_set1_pd(-0.0);
166+
const npyv_f64 one = _mm_set1_pd(1.0);
167+
const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
168+
npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
169+
// round by add magic number 2^52
170+
npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
171+
npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
172+
// respect signed zero, e.g. -0.5 -> -0.0
173+
return _mm_or_pd(ceil, _mm_and_pd(a, szero));
174+
}
175+
#endif
176+
146177
#endif // _NPY_SIMD_SSE_MATH_H

numpy/core/src/common/simd/vsx/math.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
6969
#define npyv_min_u64 vec_min
7070
#define npyv_min_s64 vec_min
7171

72+
// ceil: round every lane toward +infinity.
// The same vec_ceil builtin backs both the f32 and the f64 alias.
73+
#define npyv_ceil_f32 vec_ceil
74+
#define npyv_ceil_f64 vec_ceil
75+
7276
#endif // _NPY_SIMD_VSX_MATH_H

numpy/core/tests/test_simd.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,33 @@ def test_square(self):
330330
square = self.square(vdata)
331331
assert square == data_square
332332

333+
@pytest.mark.parametrize("intrin, func", [("ceil", math.ceil)])
def test_rounding(self, intrin, func):
    """
    Test intrinsics:
        npyv_ceil_##SFX

    ``intrin`` is the attribute name of the intrinsic under test on
    ``self``; ``func`` is the scalar reference it is compared against.
    """
    # Resolve the intrinsic by attribute name; no need to eval() source text.
    intrin = getattr(self, intrin)
    pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
    # special cases: NaN and infinities must pass through unchanged
    round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf))
    for case, desired in round_cases:
        data_round = [desired]*self.nlanes
        _round = intrin(self.setall(case))
        assert _round == pytest.approx(data_round, nan_ok=True)
    # regular values, positive and negative, with fractional parts
    for x in range(0, 2**20, 256**2):
        for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15):
            data = [x*w+a for a in range(self.nlanes)]
            vdata = self.load(data)
            data_round = [func(value) for value in data]
            _round = intrin(vdata)
            assert _round == data_round
    # signed zero: compare through _to_unsigned so that -0.0 and +0.0,
    # which are equal as floats, are told apart
    for w in (-0.25, -0.30, -0.45):
        _round = self._to_unsigned(intrin(self.setall(w)))
        data_round = self._to_unsigned(self.setall(-0.0))
        assert _round == data_round
333360
def test_max(self):
334361
"""
335362
Test intrinsics:

0 commit comments

Comments
 (0)