Skip to content

Commit c46a513

Browse files
authored
Merge pull request numpy#25781 from r-devulap/sincos-hwy
ENH: Convert fp32 sin/cos from C universal intrinsics to C++ using Highway
2 parents 1ea9ef6 + e7e6574 commit c46a513

File tree

7 files changed

+288
-468
lines changed

7 files changed

+288
-468
lines changed

meson_cpu/x86/meson.build

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,14 @@ FMA3 = mod_features.new(
5959
'FMA3', 24, implies: F16C, args: '-mfma',
6060
test_code: files(source_root + '/numpy/distutils/checks/cpu_fma3.c')[0]
6161
)
62+
# Keep this feature set in sync with Highway's HWY_AVX2 target
6263
AVX2 = mod_features.new(
63-
'AVX2', 25, implies: F16C, args: '-mavx2',
64+
'AVX2', 25, implies: FMA3, args: ['-mavx2', '-maes', '-mpclmul', '-mbmi', '-mbmi2'],
6465
test_code: files(source_root + '/numpy/distutils/checks/cpu_avx2.c')[0]
6566
)
6667
# 25-40 left as margin for any extra features
6768
AVX512F = mod_features.new(
68-
'AVX512F', 40, implies: [FMA3, AVX2],
69+
'AVX512F', 40, implies: [AVX2],
6970
# Disables mmx because of stack corruption that may happen during mask
7071
# conversions.
7172
# TODO (seiko2plus): provide more clarification

numpy/_core/meson.build

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -944,12 +944,10 @@ foreach gen_mtargets : [
944944
],
945945
[
946946
'loops_trigonometric.dispatch.h',
947-
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
947+
'src/umath/loops_trigonometric.dispatch.cpp',
948948
[
949-
AVX512F, [AVX2, FMA3],
950-
VSX4, VSX3, VSX2,
949+
AVX512_SKX, [AVX2, FMA3],
951950
NEON_VFPV4,
952-
VXE2, VXE
953951
]
954952
],
955953
[
@@ -1020,7 +1018,8 @@ foreach gen_mtargets : [
10201018
'src/common',
10211019
'src/multiarray',
10221020
'src/npymath',
1023-
'src/umath'
1021+
'src/umath',
1022+
'src/highway',
10241023
]
10251024
)
10261025
if not is_variable('multiarray_umath_mtargets')

numpy/_core/src/common/npy_cpu_features.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,8 @@ npy__cpu_init_features(void)
474474
// third call to the cpuid to get extended AVX2 & AVX512 feature bits
475475
npy__cpu_cpuid(reg, 7);
476476
npy__cpu_have[NPY_CPU_FEATURE_AVX2] = (reg[1] & (1 << 5)) != 0;
477+
npy__cpu_have[NPY_CPU_FEATURE_AVX2] = npy__cpu_have[NPY_CPU_FEATURE_AVX2] &&
478+
npy__cpu_have[NPY_CPU_FEATURE_FMA3];
477479
if (!npy__cpu_have[NPY_CPU_FEATURE_AVX2])
478480
return;
479481
// detect AVX2 & FMA3
@@ -641,7 +643,7 @@ static void
641643
npy__cpu_init_features(void)
642644
{
643645
memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
644-
646+
645647
unsigned int hwcap = getauxval(AT_HWCAP);
646648
if ((hwcap & HWCAP_S390_VX) == 0) {
647649
return;
@@ -653,7 +655,7 @@ npy__cpu_init_features(void)
653655
npy__cpu_have[NPY_CPU_FEATURE_VXE2] = 1;
654656
return;
655657
}
656-
658+
657659
npy__cpu_have[NPY_CPU_FEATURE_VXE] = (hwcap & HWCAP_S390_VXE) != 0;
658660

659661
npy__cpu_have[NPY_CPU_FEATURE_VX] = 1;

numpy/_core/src/highway

Submodule highway updated 55 files

numpy/_core/src/umath/loops.h.src

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
1111
#endif
1212

13+
#ifdef __cplusplus
14+
extern "C" {
15+
#endif
16+
1317
/*
1418
*****************************************************************************
1519
** BOOLEAN LOOPS **
@@ -875,5 +879,7 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
875879
** END LOOPS **
876880
*****************************************************************************
877881
*/
878-
882+
#ifdef __cplusplus
883+
}
884+
#endif
879885
#endif

0 commit comments

Comments
 (0)