Skip to content

Commit b3630f0

Browse files
authored
Merge pull request #3 from LessUp/runtime-simd-deepening
feat(simd): add canonical hpc::simd module with runtime dispatch
2 parents d6fe2a1 + 3f2d843 commit b3630f0

17 files changed

Lines changed: 1096 additions & 856 deletions

File tree

examples/04-simd-vectorization/bench/simd_bench.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
#include <random>
1515
#include <vector>
1616

17-
#include "simd_utils.hpp"
18-
#include "simd_wrapper.hpp"
17+
#include <hpc/simd.hpp>
1918

2019
namespace {
2120

Lines changed: 1 addition & 352 deletions
Original file line numberDiff line numberDiff line change
@@ -1,353 +1,2 @@
1-
/**
2-
* @file simd_utils.hpp
3-
* @brief SIMD utility functions and feature detection
4-
*
5-
* This header provides common utilities for SIMD programming including
6-
* feature detection, alignment helpers, and basic SIMD operations.
7-
*
8-
* All functionality is header-only for ease of integration.
9-
*
10-
* Validates:
11-
* - Requirement 4.1: Automatic Vectorization Patterns
12-
* - Requirement 4.2: SIMD Intrinsics Introduction
13-
* - Requirement 4.3: SIMD Abstraction Wrappers
14-
* - Requirement 4.4: CPU Capability Detection
15-
* - Requirement 4.5: Scalar vs Vectorized Benchmark
16-
* - Requirement 4.6: Vectorization Reports
17-
*/
18-
191
#pragma once
20-
21-
#include <cstddef>
22-
#include <cstdint>
23-
#include <cstdlib>
24-
#include <limits>
25-
#include <memory>
26-
#include <new>
27-
#include <vector>
28-
29-
// Intel SIMD intrinsics - always include on x86 for target attribute dispatch
30-
// The target attribute controls which instructions are actually used
31-
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
32-
#include <immintrin.h>
33-
#endif
34-
35-
// Feature detection macros for compile-time checks
36-
#ifdef __SSE2__
37-
#define HPC_HAS_SSE2 1
38-
#endif
39-
40-
#ifdef __AVX__
41-
#define HPC_HAS_AVX 1
42-
#endif
43-
44-
#ifdef __AVX2__
45-
#define HPC_HAS_AVX2 1
46-
#endif
47-
48-
#ifdef __AVX512F__
49-
#define HPC_HAS_AVX512 1
50-
#endif
51-
52-
namespace hpc::simd {
53-
54-
/**
55-
* @brief Check if a pointer is aligned to the specified boundary
56-
*/
57-
inline bool is_aligned(const void* ptr, size_t alignment) {
58-
return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
59-
}
60-
61-
/**
62-
* @brief Align a size up to the next multiple of alignment
63-
*/
64-
inline size_t align_up(size_t size, size_t alignment) {
65-
return (size + alignment - 1) & ~(alignment - 1);
66-
}
67-
68-
/**
69-
* @brief Get the optimal SIMD alignment for the current platform
70-
*/
71-
inline size_t get_simd_alignment() {
72-
#ifdef HPC_HAS_AVX512
73-
return 64; // AVX-512 uses 64-byte alignment
74-
#elif defined(HPC_HAS_AVX) || defined(HPC_HAS_AVX2)
75-
return 32; // AVX/AVX2 uses 32-byte alignment
76-
#elif defined(HPC_HAS_SSE2)
77-
return 16; // SSE uses 16-byte alignment
78-
#else
79-
return sizeof(void*); // Fallback to pointer alignment
80-
#endif
81-
}
82-
83-
/**
84-
* @brief SIMD-width aligned allocator for SIMD operations
85-
*
86-
* Uses runtime CPU feature detection to pick the optimal alignment
87-
* (16/32/64 bytes) for the current platform's SIMD width.
88-
*
89-
* See CONTEXT.md: SIMD-width allocator for the domain rationale.
90-
* For cache-line alignment, see hpc::memory::AlignedAllocator in memory_utils.hpp.
91-
*/
92-
template <typename T>
93-
class AlignedAllocator {
94-
public:
95-
using value_type = T;
96-
using size_type = std::size_t;
97-
using difference_type = std::ptrdiff_t;
98-
99-
template <typename U>
100-
struct rebind {
101-
using other = AlignedAllocator<U>;
102-
};
103-
104-
AlignedAllocator() = default;
105-
106-
template <typename U>
107-
AlignedAllocator(const AlignedAllocator<U>&) {}
108-
109-
T* allocate(size_type n) {
110-
// Overflow protection
111-
if (n > std::numeric_limits<size_type>::max() / sizeof(T)) {
112-
throw std::bad_alloc();
113-
}
114-
115-
if (n == 0) {
116-
return nullptr;
117-
}
118-
119-
const size_t alignment = get_simd_alignment();
120-
const size_t size = n * sizeof(T);
121-
122-
void* ptr = nullptr;
123-
#if defined(_MSC_VER)
124-
ptr = _aligned_malloc(size, alignment);
125-
#else
126-
if (posix_memalign(&ptr, alignment, size) != 0) {
127-
ptr = nullptr;
128-
}
129-
#endif
130-
if (!ptr) {
131-
throw std::bad_alloc();
132-
}
133-
return static_cast<T*>(ptr);
134-
}
135-
136-
void deallocate(T* p, size_type) {
137-
if (p == nullptr) {
138-
return;
139-
}
140-
#if defined(_MSC_VER)
141-
_aligned_free(p);
142-
#else
143-
free(p);
144-
#endif
145-
}
146-
147-
template <typename U>
148-
bool operator==(const AlignedAllocator<U>&) const {
149-
return true;
150-
}
151-
152-
template <typename U>
153-
bool operator!=(const AlignedAllocator<U>&) const {
154-
return false;
155-
}
156-
};
157-
158-
/**
159-
* @brief Backward-compatible alias for AlignedAllocator
160-
* @deprecated Use AlignedAllocator<T> directly
161-
*/
162-
template <typename T>
163-
using aligned_allocator [[deprecated("Use AlignedAllocator<T> directly")]] = AlignedAllocator<T>;
164-
165-
/**
166-
* @brief Alias for AlignedAllocator with SIMD-specific naming
167-
*/
168-
template <typename T>
169-
using simd_allocator = AlignedAllocator<T>;
170-
171-
/**
172-
* @brief Aligned vector type for SIMD operations
173-
*/
174-
template <typename T>
175-
using aligned_vector = std::vector<T, AlignedAllocator<T>>;
176-
177-
/**
178-
* @brief Aligned buffer type alias for compatibility
179-
*/
180-
template <typename T>
181-
using AlignedBuffer = aligned_vector<T>;
182-
183-
/**
184-
* @brief Create an aligned vector with the specified size
185-
*/
186-
template <typename T>
187-
aligned_vector<T> make_aligned_vector(size_t size) {
188-
return aligned_vector<T>(size);
189-
}
190-
191-
/**
192-
* @brief Create an aligned vector with the specified size and initial value
193-
*/
194-
template <typename T>
195-
aligned_vector<T> make_aligned_vector(size_t size, const T& value) {
196-
return aligned_vector<T>(size, value);
197-
}
198-
199-
/**
200-
* @brief SIMD capability levels
201-
*/
202-
enum class SIMDLevel { Scalar, SSE2, AVX, AVX2, AVX512 };
203-
204-
/**
205-
* @brief Detect the highest available SIMD level
206-
*/
207-
inline SIMDLevel detect_simd_level() {
208-
#ifdef HPC_HAS_AVX512
209-
return SIMDLevel::AVX512;
210-
#elif defined(HPC_HAS_AVX2)
211-
return SIMDLevel::AVX2;
212-
#elif defined(HPC_HAS_AVX)
213-
return SIMDLevel::AVX;
214-
#elif defined(HPC_HAS_SSE2)
215-
return SIMDLevel::SSE2;
216-
#else
217-
return SIMDLevel::Scalar;
218-
#endif
219-
}
220-
221-
/**
222-
* @brief Get the name of a SIMD level
223-
*/
224-
inline const char* simd_level_name(SIMDLevel level) {
225-
switch (level) {
226-
case SIMDLevel::AVX512:
227-
return "AVX-512";
228-
case SIMDLevel::AVX2:
229-
return "AVX2";
230-
case SIMDLevel::AVX:
231-
return "AVX";
232-
case SIMDLevel::SSE2:
233-
return "SSE2";
234-
case SIMDLevel::Scalar:
235-
return "Scalar";
236-
default:
237-
return "Unknown";
238-
}
239-
}
240-
241-
/**
242-
* @brief Get the vector width in bytes for a SIMD level
243-
*/
244-
inline size_t simd_vector_width(SIMDLevel level) {
245-
switch (level) {
246-
case SIMDLevel::AVX512:
247-
return 64;
248-
case SIMDLevel::AVX2:
249-
return 32;
250-
case SIMDLevel::AVX:
251-
return 32;
252-
case SIMDLevel::SSE2:
253-
return 16;
254-
case SIMDLevel::Scalar:
255-
return sizeof(float);
256-
default:
257-
return sizeof(float);
258-
}
259-
}
260-
261-
//------------------------------------------------------------------------------
262-
// Runtime SIMD Dispatch
263-
//------------------------------------------------------------------------------
264-
265-
/**
266-
* @brief Generic CPU capability resolver for multi-version functions.
267-
*
268-
* Given scalar, SSE2, AVX2 and AVX-512 function pointers, returns the
269-
* best available one based on runtime CPU feature detection.
270-
*
271-
* @tparam Func Function pointer type (must be identical for all arguments)
272-
* @return Best available implementation pointer
273-
*/
274-
template <typename Func>
275-
Func resolve_best(Func scalar, Func sse2, Func avx2, Func avx512) {
276-
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
277-
__builtin_cpu_init();
278-
if (avx512 && __builtin_cpu_supports("avx512f"))
279-
return avx512;
280-
if (avx2 && __builtin_cpu_supports("avx2"))
281-
return avx2;
282-
if (sse2 && __builtin_cpu_supports("sse2"))
283-
return sse2;
284-
#else
285-
(void)sse2;
286-
(void)avx2;
287-
(void)avx512;
288-
#endif
289-
return scalar;
290-
}
291-
292-
namespace detail {
293-
294-
using AddArraysFn = void (*)(const float* a, const float* b, float* c, size_t n);
295-
296-
inline void add_arrays_scalar(const float* a, const float* b, float* c, size_t n) {
297-
for (size_t i = 0; i < n; ++i) {
298-
c[i] = a[i] + b[i];
299-
}
300-
}
301-
302-
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
303-
304-
__attribute__((target("sse2"))) inline void add_arrays_sse2(const float* a, const float* b,
305-
float* c, size_t n) {
306-
size_t i = 0;
307-
for (; i + 4 <= n; i += 4) {
308-
const __m128 va = _mm_loadu_ps(&a[i]);
309-
const __m128 vb = _mm_loadu_ps(&b[i]);
310-
const __m128 vc = _mm_add_ps(va, vb);
311-
_mm_storeu_ps(&c[i], vc);
312-
}
313-
for (; i < n; ++i) {
314-
c[i] = a[i] + b[i];
315-
}
316-
}
317-
318-
__attribute__((target("avx2,avx"))) inline void add_arrays_avx2(const float* a, const float* b,
319-
float* c, size_t n) {
320-
size_t i = 0;
321-
for (; i + 8 <= n; i += 8) {
322-
const __m256 va = _mm256_loadu_ps(&a[i]);
323-
const __m256 vb = _mm256_loadu_ps(&b[i]);
324-
const __m256 vc = _mm256_add_ps(va, vb);
325-
_mm256_storeu_ps(&c[i], vc);
326-
}
327-
add_arrays_sse2(a + i, b + i, c + i, n - i);
328-
}
329-
330-
#endif
331-
332-
} // namespace detail
333-
334-
/**
335-
* @brief Add two arrays using the best available SIMD path at runtime.
336-
*
337-
* Automatically selects AVX2, SSE2, or scalar implementation based on
338-
* CPU capabilities. The resolved function pointer is cached in a
339-
* static local for thread-safe, single-shot initialization.
340-
*/
341-
inline void dispatch_add_arrays(const float* a, const float* b, float* c, size_t n) {
342-
using Fn = detail::AddArraysFn;
343-
static const Fn dispatch = resolve_best<Fn>(&detail::add_arrays_scalar,
344-
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
345-
&detail::add_arrays_sse2, &detail::add_arrays_avx2,
346-
#else
347-
nullptr, nullptr,
348-
#endif
349-
nullptr);
350-
dispatch(a, b, c, n);
351-
}
352-
353-
} // namespace hpc::simd
2+
#include <hpc/simd.hpp>

0 commit comments

Comments
 (0)