Skip to content

Commit bc29d96

Browse files
Fix dispatching mechanism
Traverse the required architectures in the order provided by the user instead of trying to guess the best one. It is actually impossible to define the notion of a best architecture, as Intel instruction sets have a tree structure rather than a linear one: there are multiple leaves and none of them can be considered the "best".
1 parent a48ab43 commit bc29d96

File tree

6 files changed

+59
-106
lines changed

6 files changed

+59
-106
lines changed

include/xsimd/config/xsimd_arch.hpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,6 @@ namespace xsimd
187187
};
188188
} // namespace detail
189189

190-
struct unsupported
191-
{
192-
};
193190
using all_x86_architectures = arch_list<
194191
avx512vnni<avx512vbmi>, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
195192
avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>,
@@ -221,7 +218,7 @@ namespace xsimd
221218
class dispatcher
222219
{
223220

224-
const unsigned best_arch_found;
221+
const decltype(available_architectures()) availables_archs;
225222
F functor;
226223

227224
template <class Arch, class... Tys>
@@ -234,15 +231,15 @@ namespace xsimd
234231
template <class Arch, class ArchNext, class... Archs, class... Tys>
235232
inline auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
236233
{
237-
if (Arch::version() <= best_arch_found)
234+
if (availables_archs.has(Arch {}))
238235
return functor(Arch {}, std::forward<Tys>(args)...);
239236
else
240237
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
241238
}
242239

243240
public:
244241
inline dispatcher(F f) noexcept
245-
: best_arch_found(available_architectures().best)
242+
: availables_archs(available_architectures())
246243
, functor(f)
247244
{
248245
}

include/xsimd/config/xsimd_cpuid.hpp

Lines changed: 48 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -33,65 +33,67 @@ namespace xsimd
3333
{
3434
struct supported_arch
3535
{
36-
unsigned sse2 : 1;
37-
unsigned sse3 : 1;
38-
unsigned ssse3 : 1;
39-
unsigned sse4_1 : 1;
40-
unsigned sse4_2 : 1;
41-
unsigned sse4a : 1;
42-
unsigned fma3_sse : 1;
43-
unsigned fma4 : 1;
44-
unsigned xop : 1;
45-
unsigned avx : 1;
46-
unsigned fma3_avx : 1;
47-
unsigned avx2 : 1;
48-
unsigned avxvnni : 1;
49-
unsigned fma3_avx2 : 1;
50-
unsigned avx512f : 1;
51-
unsigned avx512cd : 1;
52-
unsigned avx512dq : 1;
53-
unsigned avx512bw : 1;
54-
unsigned avx512er : 1;
55-
unsigned avx512pf : 1;
56-
unsigned avx512ifma : 1;
57-
unsigned avx512vbmi : 1;
58-
unsigned avx512vnni_bw : 1;
59-
unsigned avx512vnni_vbmi : 1;
60-
unsigned neon : 1;
61-
unsigned neon64 : 1;
62-
unsigned sve : 1;
63-
unsigned rvv : 1;
64-
65-
// version number of the best arch available
66-
unsigned best;
36+
37+
#define ARCH_FIELD_EX(arch, field_name) \
38+
unsigned field_name; \
39+
inline bool has(::xsimd::arch) const { return this->field_name; }
40+
#define ARCH_FIELD(name) ARCH_FIELD_EX(name, name)
41+
42+
ARCH_FIELD(sse2)
43+
ARCH_FIELD(sse3)
44+
45+
ARCH_FIELD(ssse3)
46+
ARCH_FIELD(sse4_1)
47+
ARCH_FIELD(sse4_2)
48+
// ARCH_FIELD(sse4a)
49+
ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42)
50+
ARCH_FIELD(fma4)
51+
// ARCH_FIELD(xop)
52+
ARCH_FIELD(avx)
53+
ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx)
54+
ARCH_FIELD(avx2)
55+
ARCH_FIELD(avxvnni)
56+
ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
57+
ARCH_FIELD(avx512f)
58+
ARCH_FIELD(avx512cd)
59+
ARCH_FIELD(avx512dq)
60+
ARCH_FIELD(avx512bw)
61+
ARCH_FIELD(avx512er)
62+
ARCH_FIELD(avx512pf)
63+
ARCH_FIELD(avx512ifma)
64+
ARCH_FIELD(avx512vbmi)
65+
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw)
66+
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi)
67+
ARCH_FIELD(neon)
68+
ARCH_FIELD(neon64)
69+
ARCH_FIELD(sve)
70+
ARCH_FIELD(rvv)
71+
ARCH_FIELD(wasm)
72+
73+
#undef ARCH_FIELD
6774

6875
inline supported_arch() noexcept
6976
{
7077
memset(this, 0, sizeof(supported_arch));
7178

79+
#if XSIMD_WITH_WASM
80+
wasm = 1;
81+
#endif
82+
7283
#if defined(__aarch64__) || defined(_M_ARM64)
7384
neon = 1;
7485
neon64 = 1;
75-
best = neon64::version();
7686
#elif defined(__ARM_NEON) || defined(_M_ARM)
7787

7888
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
7989
neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
80-
#else
81-
// that's very conservative :-/
82-
neon = 0;
8390
#endif
84-
neon64 = 0;
85-
best = neon::version() * neon;
8691

8792
#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
8893

8994
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
9095
sve = bool(getauxval(AT_HWCAP) & HWCAP_SVE);
91-
#else
92-
sve = 0;
9396
#endif
94-
best = sve::version() * sve;
9597

9698
#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0
9799

@@ -100,11 +102,8 @@ namespace xsimd
100102
#define HWCAP_V (1 << ('V' - 'A'))
101103
#endif
102104
rvv = bool(getauxval(AT_HWCAP) & HWCAP_V);
103-
#else
104-
rvv = 0;
105105
#endif
106106

107-
best = ::xsimd::rvv::version() * rvv;
108107
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
109108
auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
110109
{
@@ -122,14 +121,12 @@ namespace xsimd
122121
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
123122
"cpuid\n\t"
124123
"xchg{l}\t{%%}ebx, %1\n\t"
125-
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
126-
"=d"(reg[3])
124+
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
127125
: "0"(level), "2"(count));
128126

129127
#else
130128
__asm__("cpuid\n\t"
131-
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
132-
"=d"(reg[3])
129+
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
133130
: "0"(level), "2"(count));
134131
#endif
135132

@@ -143,87 +140,47 @@ namespace xsimd
143140
get_cpuid(regs1, 0x1);
144141

145142
sse2 = regs1[3] >> 26 & 1;
146-
best = std::max(best, sse2::version() * sse2);
147-
148143
sse3 = regs1[2] >> 0 & 1;
149-
best = std::max(best, sse3::version() * sse3);
150-
151144
ssse3 = regs1[2] >> 9 & 1;
152-
best = std::max(best, ssse3::version() * ssse3);
153-
154145
sse4_1 = regs1[2] >> 19 & 1;
155-
best = std::max(best, sse4_1::version() * sse4_1);
156-
157146
sse4_2 = regs1[2] >> 20 & 1;
158-
best = std::max(best, sse4_2::version() * sse4_2);
159-
160-
fma3_sse = regs1[2] >> 12 & 1;
161-
if (sse4_2)
162-
best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
147+
fma3_sse42 = regs1[2] >> 12 & 1;
163148

164149
avx = regs1[2] >> 28 & 1;
165-
best = std::max(best, avx::version() * avx);
166-
167-
fma3_avx = avx && fma3_sse;
168-
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
150+
fma3_avx = avx && fma3_sse42;
169151

170152
int regs8[4];
171153
get_cpuid(regs8, 0x80000001);
172154
fma4 = regs8[2] >> 16 & 1;
173-
best = std::max(best, fma4::version() * fma4);
174155

175156
// sse4a = regs[2] >> 6 & 1;
176-
// best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);
177157

178158
// xop = regs[2] >> 11 & 1;
179-
// best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);
180159

181160
int regs7[4];
182161
get_cpuid(regs7, 0x7);
183162
avx2 = regs7[1] >> 5 & 1;
184-
best = std::max(best, avx2::version() * avx2);
185163

186164
int regs7a[4];
187165
get_cpuid(regs7a, 0x7, 0x1);
188166
avxvnni = regs7a[0] >> 4 & 1;
189-
best = std::max(best, avxvnni::version() * avxvnni * avx2);
190167

191-
fma3_avx2 = avx2 && fma3_sse;
192-
best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
168+
fma3_avx2 = avx2 && fma3_sse42;
193169

194170
avx512f = regs7[1] >> 16 & 1;
195-
best = std::max(best, avx512f::version() * avx512f);
196-
197171
avx512cd = regs7[1] >> 28 & 1;
198-
best = std::max(best, avx512cd::version() * avx512cd * avx512f);
199-
200172
avx512dq = regs7[1] >> 17 & 1;
201-
best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);
202-
203173
avx512bw = regs7[1] >> 30 & 1;
204-
best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);
205-
206174
avx512er = regs7[1] >> 27 & 1;
207-
best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f);
208-
209175
avx512pf = regs7[1] >> 26 & 1;
210-
best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f);
211-
212176
avx512ifma = regs7[1] >> 21 & 1;
213-
best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);
214-
215177
avx512vbmi = regs7[2] >> 1 & 1;
216-
best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f);
217-
218178
avx512vnni_bw = regs7[2] >> 11 & 1;
219-
best = std::max(best, avx512vnni<xsimd::avx512bw>::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f);
220-
221179
avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
222-
best = std::max(best, avx512vnni<xsimd::avx512vbmi>::version() * avx512vnni_vbmi);
223180
#endif
224181
}
225182
};
226-
}
183+
} // namespace detail
227184

228185
inline detail::supported_arch available_architectures() noexcept
229186
{

include/xsimd/types/xsimd_generic_arch.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ namespace xsimd
4343
protected:
4444
static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; }
4545
};
46+
47+
struct unsupported
48+
{
49+
};
4650
}
4751

4852
#endif

include/xsimd/types/xsimd_rvv_register.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,8 @@ namespace xsimd
411411
using type = detail::rvv_bool_simd_register<T>;
412412
};
413413
} // namespace types
414+
#else
415+
using rvv = detail::rvv<0xFFFFFFFF>;
414416
#endif
415417
} // namespace xsimd
416418

include/xsimd/types/xsimd_sve_register.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ namespace xsimd
149149
using type = detail::sve_bool_simd_register;
150150
};
151151
} // namespace types
152+
#else
153+
using sve = detail::sve<0xFFFFFFFF>;
152154
#endif
153155
} // namespace xsimd
154156

test/test_arch.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -124,15 +124,6 @@ TEST_CASE("[multi arch support]")
124124
float res = dispatched(data, 17);
125125
CHECK_EQ(ref, res);
126126
}
127-
128-
// check that we pick the most appropriate version
129-
{
130-
auto dispatched = xsimd::dispatch<xsimd::arch_list<xsimd::sse3, xsimd::sse2, xsimd::generic>>(get_arch_version {});
131-
unsigned expected = xsimd::available_architectures().best >= xsimd::sse3::version()
132-
? xsimd::sse3::version()
133-
: xsimd::sse2::version();
134-
CHECK_EQ(expected, dispatched());
135-
}
136127
#endif
137128
}
138129

0 commit comments

Comments
 (0)