|
1 | | -/** |
2 | | - * @file simd_utils.hpp |
3 | | - * @brief SIMD utility functions and feature detection |
4 | | - * |
5 | | - * This header provides common utilities for SIMD programming including |
6 | | - * feature detection, alignment helpers, and basic SIMD operations. |
7 | | - * |
8 | | - * All functionality is header-only for ease of integration. |
9 | | - * |
10 | | - * Validates: |
11 | | - * - Requirement 4.1: Automatic Vectorization Patterns |
12 | | - * - Requirement 4.2: SIMD Intrinsics Introduction |
13 | | - * - Requirement 4.3: SIMD Abstraction Wrappers |
14 | | - * - Requirement 4.4: CPU Capability Detection |
15 | | - * - Requirement 4.5: Scalar vs Vectorized Benchmark |
16 | | - * - Requirement 4.6: Vectorization Reports |
17 | | - */ |
18 | | - |
19 | 1 | #pragma once |
20 | | - |
21 | | -#include <cstddef> |
22 | | -#include <cstdint> |
23 | | -#include <cstdlib> |
24 | | -#include <limits> |
25 | | -#include <memory> |
26 | | -#include <new> |
27 | | -#include <vector> |
28 | | - |
29 | | -// Intel SIMD intrinsics - always include on x86 for target attribute dispatch |
30 | | -// The target attribute controls which instructions are actually used |
31 | | -#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) |
32 | | -#include <immintrin.h> |
33 | | -#endif |
34 | | - |
35 | | -// Feature detection macros for compile-time checks |
36 | | -#ifdef __SSE2__ |
37 | | -#define HPC_HAS_SSE2 1 |
38 | | -#endif |
39 | | - |
40 | | -#ifdef __AVX__ |
41 | | -#define HPC_HAS_AVX 1 |
42 | | -#endif |
43 | | - |
44 | | -#ifdef __AVX2__ |
45 | | -#define HPC_HAS_AVX2 1 |
46 | | -#endif |
47 | | - |
48 | | -#ifdef __AVX512F__ |
49 | | -#define HPC_HAS_AVX512 1 |
50 | | -#endif |
51 | | - |
52 | | -namespace hpc::simd { |
53 | | - |
54 | | -/** |
55 | | - * @brief Check if a pointer is aligned to the specified boundary |
56 | | - */ |
57 | | -inline bool is_aligned(const void* ptr, size_t alignment) { |
58 | | - return reinterpret_cast<uintptr_t>(ptr) % alignment == 0; |
59 | | -} |
60 | | - |
61 | | -/** |
62 | | - * @brief Align a size up to the next multiple of alignment |
63 | | - */ |
64 | | -inline size_t align_up(size_t size, size_t alignment) { |
65 | | - return (size + alignment - 1) & ~(alignment - 1); |
66 | | -} |
67 | | - |
68 | | -/** |
69 | | - * @brief Get the optimal SIMD alignment for the current platform |
70 | | - */ |
71 | | -inline size_t get_simd_alignment() { |
72 | | -#ifdef HPC_HAS_AVX512 |
73 | | - return 64; // AVX-512 uses 64-byte alignment |
74 | | -#elif defined(HPC_HAS_AVX) || defined(HPC_HAS_AVX2) |
75 | | - return 32; // AVX/AVX2 uses 32-byte alignment |
76 | | -#elif defined(HPC_HAS_SSE2) |
77 | | - return 16; // SSE uses 16-byte alignment |
78 | | -#else |
79 | | - return sizeof(void*); // Fallback to pointer alignment |
80 | | -#endif |
81 | | -} |
82 | | - |
83 | | -/** |
84 | | - * @brief SIMD-width aligned allocator for SIMD operations |
85 | | - * |
86 | | - * Uses runtime CPU feature detection to pick the optimal alignment |
87 | | - * (16/32/64 bytes) for the current platform's SIMD width. |
88 | | - * |
89 | | - * See CONTEXT.md: SIMD-width allocator for the domain rationale. |
90 | | - * For cache-line alignment, see hpc::memory::AlignedAllocator in memory_utils.hpp. |
91 | | - */ |
92 | | -template <typename T> |
93 | | -class AlignedAllocator { |
94 | | -public: |
95 | | - using value_type = T; |
96 | | - using size_type = std::size_t; |
97 | | - using difference_type = std::ptrdiff_t; |
98 | | - |
99 | | - template <typename U> |
100 | | - struct rebind { |
101 | | - using other = AlignedAllocator<U>; |
102 | | - }; |
103 | | - |
104 | | - AlignedAllocator() = default; |
105 | | - |
106 | | - template <typename U> |
107 | | - AlignedAllocator(const AlignedAllocator<U>&) {} |
108 | | - |
109 | | - T* allocate(size_type n) { |
110 | | - // Overflow protection |
111 | | - if (n > std::numeric_limits<size_type>::max() / sizeof(T)) { |
112 | | - throw std::bad_alloc(); |
113 | | - } |
114 | | - |
115 | | - if (n == 0) { |
116 | | - return nullptr; |
117 | | - } |
118 | | - |
119 | | - const size_t alignment = get_simd_alignment(); |
120 | | - const size_t size = n * sizeof(T); |
121 | | - |
122 | | - void* ptr = nullptr; |
123 | | -#if defined(_MSC_VER) |
124 | | - ptr = _aligned_malloc(size, alignment); |
125 | | -#else |
126 | | - if (posix_memalign(&ptr, alignment, size) != 0) { |
127 | | - ptr = nullptr; |
128 | | - } |
129 | | -#endif |
130 | | - if (!ptr) { |
131 | | - throw std::bad_alloc(); |
132 | | - } |
133 | | - return static_cast<T*>(ptr); |
134 | | - } |
135 | | - |
136 | | - void deallocate(T* p, size_type) { |
137 | | - if (p == nullptr) { |
138 | | - return; |
139 | | - } |
140 | | -#if defined(_MSC_VER) |
141 | | - _aligned_free(p); |
142 | | -#else |
143 | | - free(p); |
144 | | -#endif |
145 | | - } |
146 | | - |
147 | | - template <typename U> |
148 | | - bool operator==(const AlignedAllocator<U>&) const { |
149 | | - return true; |
150 | | - } |
151 | | - |
152 | | - template <typename U> |
153 | | - bool operator!=(const AlignedAllocator<U>&) const { |
154 | | - return false; |
155 | | - } |
156 | | -}; |
157 | | - |
158 | | -/** |
159 | | - * @brief Backward-compatible alias for AlignedAllocator |
160 | | - * @deprecated Use AlignedAllocator<T> directly |
161 | | - */ |
162 | | -template <typename T> |
163 | | -using aligned_allocator [[deprecated("Use AlignedAllocator<T> directly")]] = AlignedAllocator<T>; |
164 | | - |
165 | | -/** |
166 | | - * @brief Alias for AlignedAllocator with SIMD-specific naming |
167 | | - */ |
168 | | -template <typename T> |
169 | | -using simd_allocator = AlignedAllocator<T>; |
170 | | - |
171 | | -/** |
172 | | - * @brief Aligned vector type for SIMD operations |
173 | | - */ |
174 | | -template <typename T> |
175 | | -using aligned_vector = std::vector<T, AlignedAllocator<T>>; |
176 | | - |
177 | | -/** |
178 | | - * @brief Aligned buffer type alias for compatibility |
179 | | - */ |
180 | | -template <typename T> |
181 | | -using AlignedBuffer = aligned_vector<T>; |
182 | | - |
183 | | -/** |
184 | | - * @brief Create an aligned vector with the specified size |
185 | | - */ |
186 | | -template <typename T> |
187 | | -aligned_vector<T> make_aligned_vector(size_t size) { |
188 | | - return aligned_vector<T>(size); |
189 | | -} |
190 | | - |
191 | | -/** |
192 | | - * @brief Create an aligned vector with the specified size and initial value |
193 | | - */ |
194 | | -template <typename T> |
195 | | -aligned_vector<T> make_aligned_vector(size_t size, const T& value) { |
196 | | - return aligned_vector<T>(size, value); |
197 | | -} |
198 | | - |
199 | | -/** |
200 | | - * @brief SIMD capability levels |
201 | | - */ |
202 | | -enum class SIMDLevel { Scalar, SSE2, AVX, AVX2, AVX512 }; |
203 | | - |
204 | | -/** |
205 | | - * @brief Detect the highest available SIMD level |
206 | | - */ |
207 | | -inline SIMDLevel detect_simd_level() { |
208 | | -#ifdef HPC_HAS_AVX512 |
209 | | - return SIMDLevel::AVX512; |
210 | | -#elif defined(HPC_HAS_AVX2) |
211 | | - return SIMDLevel::AVX2; |
212 | | -#elif defined(HPC_HAS_AVX) |
213 | | - return SIMDLevel::AVX; |
214 | | -#elif defined(HPC_HAS_SSE2) |
215 | | - return SIMDLevel::SSE2; |
216 | | -#else |
217 | | - return SIMDLevel::Scalar; |
218 | | -#endif |
219 | | -} |
220 | | - |
221 | | -/** |
222 | | - * @brief Get the name of a SIMD level |
223 | | - */ |
224 | | -inline const char* simd_level_name(SIMDLevel level) { |
225 | | - switch (level) { |
226 | | - case SIMDLevel::AVX512: |
227 | | - return "AVX-512"; |
228 | | - case SIMDLevel::AVX2: |
229 | | - return "AVX2"; |
230 | | - case SIMDLevel::AVX: |
231 | | - return "AVX"; |
232 | | - case SIMDLevel::SSE2: |
233 | | - return "SSE2"; |
234 | | - case SIMDLevel::Scalar: |
235 | | - return "Scalar"; |
236 | | - default: |
237 | | - return "Unknown"; |
238 | | - } |
239 | | -} |
240 | | - |
241 | | -/** |
242 | | - * @brief Get the vector width in bytes for a SIMD level |
243 | | - */ |
244 | | -inline size_t simd_vector_width(SIMDLevel level) { |
245 | | - switch (level) { |
246 | | - case SIMDLevel::AVX512: |
247 | | - return 64; |
248 | | - case SIMDLevel::AVX2: |
249 | | - return 32; |
250 | | - case SIMDLevel::AVX: |
251 | | - return 32; |
252 | | - case SIMDLevel::SSE2: |
253 | | - return 16; |
254 | | - case SIMDLevel::Scalar: |
255 | | - return sizeof(float); |
256 | | - default: |
257 | | - return sizeof(float); |
258 | | - } |
259 | | -} |
260 | | - |
261 | | -//------------------------------------------------------------------------------ |
262 | | -// Runtime SIMD Dispatch |
263 | | -//------------------------------------------------------------------------------ |
264 | | - |
265 | | -/** |
266 | | - * @brief Generic CPU capability resolver for multi-version functions. |
267 | | - * |
268 | | - * Given scalar, SSE2, AVX2 and AVX-512 function pointers, returns the |
269 | | - * best available one based on runtime CPU feature detection. |
270 | | - * |
271 | | - * @tparam Func Function pointer type (must be identical for all arguments) |
272 | | - * @return Best available implementation pointer |
273 | | - */ |
274 | | -template <typename Func> |
275 | | -Func resolve_best(Func scalar, Func sse2, Func avx2, Func avx512) { |
276 | | -#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) |
277 | | - __builtin_cpu_init(); |
278 | | - if (avx512 && __builtin_cpu_supports("avx512f")) |
279 | | - return avx512; |
280 | | - if (avx2 && __builtin_cpu_supports("avx2")) |
281 | | - return avx2; |
282 | | - if (sse2 && __builtin_cpu_supports("sse2")) |
283 | | - return sse2; |
284 | | -#else |
285 | | - (void)sse2; |
286 | | - (void)avx2; |
287 | | - (void)avx512; |
288 | | -#endif |
289 | | - return scalar; |
290 | | -} |
291 | | - |
292 | | -namespace detail { |
293 | | - |
294 | | -using AddArraysFn = void (*)(const float* a, const float* b, float* c, size_t n); |
295 | | - |
296 | | -inline void add_arrays_scalar(const float* a, const float* b, float* c, size_t n) { |
297 | | - for (size_t i = 0; i < n; ++i) { |
298 | | - c[i] = a[i] + b[i]; |
299 | | - } |
300 | | -} |
301 | | - |
302 | | -#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) |
303 | | - |
304 | | -__attribute__((target("sse2"))) inline void add_arrays_sse2(const float* a, const float* b, |
305 | | - float* c, size_t n) { |
306 | | - size_t i = 0; |
307 | | - for (; i + 4 <= n; i += 4) { |
308 | | - const __m128 va = _mm_loadu_ps(&a[i]); |
309 | | - const __m128 vb = _mm_loadu_ps(&b[i]); |
310 | | - const __m128 vc = _mm_add_ps(va, vb); |
311 | | - _mm_storeu_ps(&c[i], vc); |
312 | | - } |
313 | | - for (; i < n; ++i) { |
314 | | - c[i] = a[i] + b[i]; |
315 | | - } |
316 | | -} |
317 | | - |
318 | | -__attribute__((target("avx2,avx"))) inline void add_arrays_avx2(const float* a, const float* b, |
319 | | - float* c, size_t n) { |
320 | | - size_t i = 0; |
321 | | - for (; i + 8 <= n; i += 8) { |
322 | | - const __m256 va = _mm256_loadu_ps(&a[i]); |
323 | | - const __m256 vb = _mm256_loadu_ps(&b[i]); |
324 | | - const __m256 vc = _mm256_add_ps(va, vb); |
325 | | - _mm256_storeu_ps(&c[i], vc); |
326 | | - } |
327 | | - add_arrays_sse2(a + i, b + i, c + i, n - i); |
328 | | -} |
329 | | - |
330 | | -#endif |
331 | | - |
332 | | -} // namespace detail |
333 | | - |
334 | | -/** |
335 | | - * @brief Add two arrays using the best available SIMD path at runtime. |
336 | | - * |
337 | | - * Automatically selects AVX2, SSE2, or scalar implementation based on |
338 | | - * CPU capabilities. The resolved function pointer is cached in a |
339 | | - * static local for thread-safe, single-shot initialization. |
340 | | - */ |
341 | | -inline void dispatch_add_arrays(const float* a, const float* b, float* c, size_t n) { |
342 | | - using Fn = detail::AddArraysFn; |
343 | | - static const Fn dispatch = resolve_best<Fn>(&detail::add_arrays_scalar, |
344 | | -#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__)) |
345 | | - &detail::add_arrays_sse2, &detail::add_arrays_avx2, |
346 | | -#else |
347 | | - nullptr, nullptr, |
348 | | -#endif |
349 | | - nullptr); |
350 | | - dispatch(a, b, c, n); |
351 | | -} |
352 | | - |
353 | | -} // namespace hpc::simd |
| 2 | +#include <hpc/simd.hpp> |
0 commit comments