@@ -143,4 +143,35 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
143143 return npyv_select_s64 (npyv_cmplt_s64 (a , b ), a , b );
144144}
145145
146+ // ceil
147+ #ifdef NPY_HAVE_SSE41
148+ #define npyv_ceil_f32 _mm_ceil_ps
149+ #define npyv_ceil_f64 _mm_ceil_pd
150+ #else
151+ NPY_FINLINE npyv_f32 npyv_ceil_f32 (npyv_f32 a )
152+ {
153+ const npyv_f32 szero = _mm_set1_ps (-0.0f );
154+ const npyv_f32 one = _mm_set1_ps (1.0f );
155+ npyv_s32 roundi = _mm_cvttps_epi32 (a );
156+ npyv_f32 round = _mm_cvtepi32_ps (roundi );
157+ npyv_f32 ceil = _mm_add_ps (round , _mm_and_ps (_mm_cmplt_ps (round , a ), one ));
158+ // respect signed zero, e.g. -0.5 -> -0.0
159+ npyv_f32 rzero = _mm_or_ps (ceil , _mm_and_ps (a , szero ));
160+ // if overflow return a
161+ return npyv_select_f32 (_mm_cmpeq_epi32 (roundi , _mm_castps_si128 (szero )), a , rzero );
162+ }
163+ NPY_FINLINE npyv_f64 npyv_ceil_f64 (npyv_f64 a )
164+ {
165+ const npyv_f64 szero = _mm_set1_pd (-0.0 );
166+ const npyv_f64 one = _mm_set1_pd (1.0 );
167+ const npyv_f64 two_power_52 = _mm_set1_pd (0x10000000000000 );
168+ npyv_f64 sign_two52 = _mm_or_pd (two_power_52 , _mm_and_pd (a , szero ));
169+ // round by add magic number 2^52
170+ npyv_f64 round = _mm_sub_pd (_mm_add_pd (a , sign_two52 ), sign_two52 );
171+ npyv_f64 ceil = _mm_add_pd (round , _mm_and_pd (_mm_cmplt_pd (round , a ), one ));
172+ // respect signed zero, e.g. -0.5 -> -0.0
173+ return _mm_or_pd (ceil , _mm_and_pd (a , szero ));
174+ }
175+ #endif
176+
146177#endif // _NPY_SIMD_SSE_MATH_H
0 commit comments