Skip to content

Commit 9687d9f

Browse files
committed
Add: Unified hardware detection + dispatch layer (Phase 5)
Implements the missing Phase 5 infrastructure for multi-backend hardware acceleration: - hardware_detect.h/.cpp: Runtime CPUID probing for AVX-512/AVX2/SSE4.2, OpenMP thread count, CUDA device properties, Metal availability, and Eigen presence. Results cached via static initialization. - hardware_dispatch.h/.cpp: Unified backend selection (CUDA > Metal > AVX-512 > AVX2 > OpenMP > Scalar) with accelerated Boltzmann weight computation and numerically stable log-sum-exp reduction. Dispatches to Eigen vectorization, AVX-512 8-wide double SIMD, OpenMP parallel reductions, or scalar fallback at runtime. - simd_distance.h: Added AVX-512 16-wide float primitives (distance2_1x16, sum_sq_distances_avx512, lj_wall_16x, dot3_batch_avx512, boltzmann_batch_8d) alongside existing AVX2 8-wide implementations. RMSD dispatch auto-selects best available path at compile time. - test_hardware_detect_dispatch.cpp: 30 GoogleTest cases covering hardware detection caching, backend selection priority, Boltzmann batch correctness (single/multi-state, large energy ranges, degeneracy), log-sum-exp numerical stability, dispatch reporting, and SIMD primitive validation. All 30 new tests pass. No regressions in existing test suites. https://claude.ai/code/session_015C6xcNTq31AKYfrsrEQAE7
1 parent af69aee commit 9687d9f

File tree

7 files changed

+1236
-6
lines changed

7 files changed

+1236
-6
lines changed

CMakeLists.txt

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,8 @@ target_sources(FlexAID PRIVATE
241241
LIB/statmech.cpp
242242
LIB/encom.cpp # ← NEW: ENCoM vibrational entropy
243243
# ── v1.5 thermodynamic + hardware acceleration modules ──
244+
LIB/hardware_detect.cpp
245+
LIB/hardware_dispatch.cpp
244246
LIB/ShannonThermoStack/ShannonThermoStack.cpp
245247
LIB/LigandRingFlex/RingConformerLibrary.cpp
246248
LIB/LigandRingFlex/SugarPucker.cpp
@@ -507,6 +509,40 @@ if(BUILD_TESTING)
507509
endif()
508510
add_test(NAME HardwareDispatchTests COMMAND test_hardware_dispatch)
509511

512+
# test_hardware_detect_dispatch — unified dispatch layer + hardware detection tests
#
# The detection/dispatch sources are compiled straight into the test binary
# (they are listed below as sources rather than pulled in by linking a library
# target), so every compile definition those sources key off has to be repeated
# on this target.
513+
add_executable(test_hardware_detect_dispatch
514+
tests/test_hardware_detect_dispatch.cpp
515+
LIB/hardware_detect.cpp
516+
LIB/hardware_dispatch.cpp
517+
LIB/statmech.cpp)
518+
target_include_directories(test_hardware_detect_dispatch PRIVATE
519+
${CMAKE_CURRENT_SOURCE_DIR}/LIB)
520+
target_link_libraries(test_hardware_detect_dispatch PRIVATE GTest::gtest GTest::gtest_main)
# Optimised build for the tests on both compiler families.
521+
if(MSVC)
522+
target_compile_options(test_hardware_detect_dispatch PRIVATE /O2)
523+
else()
524+
target_compile_options(test_hardware_detect_dispatch PRIVATE -O2)
525+
endif()
# NOTE(review): flexaids_configure_simd() is defined outside this hunk;
# presumably it applies the project's SIMD compile flags — confirm.
526+
flexaids_configure_simd(test_hardware_detect_dispatch)
527+
if(FLEXAIDS_USE_OPENMP AND OpenMP_CXX_FOUND)
528+
target_link_libraries(test_hardware_detect_dispatch PRIVATE OpenMP::OpenMP_CXX)
529+
endif()
# Eigen: prefer the imported Eigen3::Eigen target; otherwise fall back to the
# EIGEN3_INCLUDE_DIRS variable. Either way FLEXAIDS_HAS_EIGEN is defined, which
# is the macro hardware_detect.cpp tests to report Eigen as present.
530+
if(FLEXAIDS_USE_EIGEN AND Eigen3_FOUND)
531+
target_link_libraries(test_hardware_detect_dispatch PRIVATE Eigen3::Eigen)
532+
target_compile_definitions(test_hardware_detect_dispatch PRIVATE FLEXAIDS_HAS_EIGEN)
533+
elseif(FLEXAIDS_USE_EIGEN AND EIGEN3_FOUND)
534+
target_include_directories(test_hardware_detect_dispatch PRIVATE ${EIGEN3_INCLUDE_DIRS})
535+
target_compile_definitions(test_hardware_detect_dispatch PRIVATE FLEXAIDS_HAS_EIGEN)
536+
endif()
# FLEXAIDS_USE_CUDA gates the cuda_runtime.h include and the device query in
# hardware_detect.cpp, hence the extra link against the CUDA runtime.
537+
if(FLEXAIDS_USE_CUDA)
538+
target_compile_definitions(test_hardware_detect_dispatch PRIVATE FLEXAIDS_USE_CUDA)
539+
target_link_libraries(test_hardware_detect_dispatch PRIVATE CUDA::cudart)
540+
endif()
# FLEXAIDS_USE_METAL only toggles a compile-time flag in hardware_detect.cpp;
# nothing is linked for it on this target.
541+
if(FLEXAIDS_USE_METAL)
542+
target_compile_definitions(test_hardware_detect_dispatch PRIVATE FLEXAIDS_USE_METAL)
543+
endif()
544+
add_test(NAME HardwareDetectDispatchTests COMMAND test_hardware_detect_dispatch)
545+
510546
# test_tencom_diff — tENCoM differential engine tests
511547
add_executable(test_tencom_diff
512548
tests/test_tencom_diff.cpp
@@ -532,5 +568,5 @@ if(BUILD_TESTING)
532568
endif()
533569
add_test(NAME TENCoMDiffTests COMMAND test_tencom_diff)
534570

535-
message(STATUS "Unit tests enabled — build targets: test_statmech, test_hardware_dispatch, test_tencom_diff")
571+
message(STATUS "Unit tests enabled — build targets: test_statmech, test_hardware_dispatch, test_hardware_detect_dispatch, test_tencom_diff")
536572
endif()

LIB/hardware_detect.cpp

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
// hardware_detect.cpp — Runtime hardware capability detection
2+
//
3+
// Probes CPU SIMD features via CPUID (x86-64), queries CUDA device
4+
// properties via the CUDA runtime API, and flags Metal when built with FLEXAIDS_USE_METAL.
5+
//
6+
// Apache-2.0 © 2026 Le Bonhomme Pharma
7+
#include "hardware_detect.h"
8+
9+
#include <sstream>
10+
#include <mutex>
11+
12+
// ── x86 CPUID ────────────────────────────────────────────────────────────────
// Compiler-neutral CPUID wrappers. Both fill regs[0..3] with EAX, EBX, ECX and
// EDX for the requested leaf; flexaids_cpuid() always queries sub-leaf 0.
// FLEXAIDS_X86 is 1 on x86/x86-64 builds and 0 elsewhere, where the wrappers
// are not defined at all.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#  define FLEXAIDS_X86 1
#  ifdef _MSC_VER
#    include <intrin.h>
static void flexaids_cpuidex(int regs[4], int leaf, int sub) {
    __cpuidex(regs, leaf, sub);
}
static void flexaids_cpuid(int regs[4], int leaf) {
    __cpuid(regs, leaf);
}
#  else
#    include <cpuid.h>
static void flexaids_cpuidex(int regs[4], int leaf, int sub) {
    // __cpuid_count writes into unsigned lvalues; collect the four registers
    // in temporaries and store their bit patterns into the caller's int array.
    unsigned eax = 0;
    unsigned ebx = 0;
    unsigned ecx = 0;
    unsigned edx = 0;
    __cpuid_count(leaf, sub, eax, ebx, ecx, edx);
    regs[0] = static_cast<int>(eax);
    regs[1] = static_cast<int>(ebx);
    regs[2] = static_cast<int>(ecx);
    regs[3] = static_cast<int>(edx);
}
static void flexaids_cpuid(int regs[4], int leaf) {
    flexaids_cpuidex(regs, leaf, 0);
}
#  endif
#else
#  define FLEXAIDS_X86 0
#endif
39+
40+
// ── OpenMP ───────────────────────────────────────────────────────────────────
41+
#ifdef _OPENMP
42+
# include <omp.h>
43+
#endif
44+
45+
// ── CUDA (compile-time gated) ────────────────────────────────────────────────
46+
#ifdef FLEXAIDS_USE_CUDA
47+
# include <cuda_runtime.h>
48+
#endif
49+
50+
namespace flexaids {
51+
52+
#if FLEXAIDS_X86
// Returns XCR0, the extended control register listing which register state the
// operating system has enabled for XSAVE/XRSTOR. XGETBV faults unless
// CPUID.1:ECX.OSXSAVE (bit 27) is set, so callers must check that bit first.
static unsigned long long flexaids_read_xcr0() {
#  ifdef _MSC_VER
    return _xgetbv(0);
#  else
    unsigned lo = 0;
    unsigned hi = 0;
    __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
    return (static_cast<unsigned long long>(hi) << 32) | lo;
#  endif
}
#endif

// Fills the x86 SIMD flags of `hw` from CPUID. On non-x86 builds `hw` is left
// untouched.
//
// A feature is reported only when it is actually usable:
//   * leaf 7 is read only if leaf 0 says it exists (a CPUID query above the
//     maximum basic leaf returns data for a different leaf);
//   * AVX-family features (FMA, AVX2, AVX-512*) additionally require that the
//     OS saves the wider registers on context switch (OSXSAVE + XCR0), since
//     executing them otherwise raises an invalid-opcode fault.
static void detect_x86_simd(HardwareCapabilities& hw) {
#if FLEXAIDS_X86
    int regs[4] = {};

    // Leaf 0: EAX holds the highest supported basic leaf.
    flexaids_cpuid(regs, 0);
    const unsigned max_leaf = static_cast<unsigned>(regs[0]);

    // Leaf 1: ECX feature bits.
    unsigned ecx1 = 0;
    if (max_leaf >= 1) {
        flexaids_cpuid(regs, 1);
        ecx1 = static_cast<unsigned>(regs[2]);
    }

    // Leaf 7, sub-leaf 0: EBX/ECX extended feature bits.
    unsigned ebx7 = 0;
    unsigned ecx7 = 0;
    if (max_leaf >= 7) {
        flexaids_cpuidex(regs, 7, 0);
        ebx7 = static_cast<unsigned>(regs[1]);
        ecx7 = static_cast<unsigned>(regs[2]);
    }

    const auto bit = [](unsigned reg, unsigned n) { return ((reg >> n) & 1u) != 0; };

    // OS support: XCR0[2:1] = SSE+AVX (XMM/YMM) state, XCR0[7:5] = opmask and
    // ZMM state.
    const bool osxsave = bit(ecx1, 27);
    const bool cpu_avx = bit(ecx1, 28);
    const unsigned long long xcr0 = osxsave ? flexaids_read_xcr0() : 0ULL;
    const bool os_avx = cpu_avx && (xcr0 & 0x06ULL) == 0x06ULL;
#  if defined(__APPLE__)
    // NOTE(review): macOS is understood to enable AVX-512 state per thread on
    // first use, so XCR0 may not advertise it up front; CPUID is trusted
    // there once AVX state is confirmed — verify against Darwin documentation.
    const bool os_avx512 = os_avx;
#  else
    const bool os_avx512 = os_avx && (xcr0 & 0xE0ULL) == 0xE0ULL;
#  endif

    hw.has_sse42      = bit(ecx1, 20);                // leaf 1 ECX bit 20
    hw.has_fma        = os_avx    && bit(ecx1, 12);   // leaf 1 ECX bit 12
    hw.has_avx2       = os_avx    && bit(ebx7, 5);    // leaf 7 EBX bit 5
    hw.has_avx512f    = os_avx512 && bit(ebx7, 16);   // leaf 7 EBX bit 16
    hw.has_avx512dq   = os_avx512 && bit(ebx7, 17);   // leaf 7 EBX bit 17
    hw.has_avx512bw   = os_avx512 && bit(ebx7, 30);   // leaf 7 EBX bit 30
    hw.has_avx512vnni = os_avx512 && bit(ecx7, 11);   // leaf 7 ECX bit 11

    // Composite flag used for backend selection; VNNI is not required.
    hw.has_avx512 = hw.has_avx512f && hw.has_avx512dq && hw.has_avx512bw;
#else
    (void)hw;
#endif
}
74+
75+
// Records whether this translation unit was compiled with OpenMP (the
// compiler-defined _OPENMP macro) and, if so, the value reported by
// omp_get_max_threads() at detection time. Without OpenMP the thread count
// is reported as 1.
static void detect_openmp(HardwareCapabilities& hw) {
#ifdef _OPENMP
    const int thread_limit = omp_get_max_threads();
    hw.openmp_max_threads = thread_limit;
    hw.has_openmp = true;
#else
    hw.openmp_max_threads = 1;
    hw.has_openmp = false;
#endif
}
84+
85+
// Eigen presence is decided purely at compile time: the flag mirrors whether
// the build defined FLEXAIDS_HAS_EIGEN.
static void detect_eigen(HardwareCapabilities& hw) {
#if defined(FLEXAIDS_HAS_EIGEN)
    constexpr bool kEigenCompiledIn = true;
#else
    constexpr bool kEigenCompiledIn = false;
#endif
    hw.has_eigen = kEigenCompiledIn;
}
92+
93+
// Probes the CUDA runtime (FLEXAIDS_USE_CUDA builds only).
//
// `hw.has_cuda` and `hw.cuda_device_count` are set when the runtime reports at
// least one device. The name / compute capability / memory fields describe
// device 0 and are filled only if its properties can be read. In builds
// without CUDA, or when the probe fails, `hw` is left unchanged.
//
// A failed runtime call is remembered by the CUDA runtime as the "last error".
// It is cleared here so that an unrelated cudaGetLastError() check later in
// the program does not pick up an error that merely means "no usable GPU".
static void detect_cuda(HardwareCapabilities& hw) {
#ifdef FLEXAIDS_USE_CUDA
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        (void)cudaGetLastError();  // reset the recorded error
        return;
    }
    if (count <= 0) return;

    hw.has_cuda = true;
    hw.cuda_device_count = count;

    // Query properties of device 0 (primary)
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        (void)cudaGetLastError();  // reset the recorded error
        return;
    }

    hw.cuda_device_name = prop.name;
    hw.cuda_sm_major    = prop.major;
    hw.cuda_sm_minor    = prop.minor;
    hw.cuda_global_mem  = prop.totalGlobalMem;
    hw.cuda_arch        = "sm_" + std::to_string(prop.major * 10 + prop.minor);
#else
    (void)hw;
#endif
}
115+
116+
// Metal is not probed at runtime: a build compiled with FLEXAIDS_USE_METAL
// reports Metal as available and stores a generic placeholder GPU name.
// Builds without the flag leave `hw` untouched.
static void detect_metal(HardwareCapabilities& hw) {
#ifndef FLEXAIDS_USE_METAL
    (void)hw;
#else
    hw.metal_gpu_name = "Apple GPU (Metal-capable)";
    hw.has_metal = true;
#endif
}
126+
127+
std::string HardwareCapabilities::summary() const {
128+
std::ostringstream os;
129+
os << "[HW] Hardware Capabilities:\n";
130+
131+
if (has_cuda)
132+
os << "[HW] CUDA: " << cuda_device_name
133+
<< " (" << cuda_arch << ", "
134+
<< (cuda_global_mem >> 20) << " MB)\n";
135+
else
136+
os << "[HW] CUDA: not available\n";
137+
138+
if (has_metal)
139+
os << "[HW] Metal: " << metal_gpu_name << "\n";
140+
else
141+
os << "[HW] Metal: not available\n";
142+
143+
if (has_avx512)
144+
os << "[HW] AVX-512: F+DQ+BW"
145+
<< (has_avx512vnni ? "+VNNI" : "") << "\n";
146+
else if (has_avx2)
147+
os << "[HW] AVX2+FMA: yes\n";
148+
else if (has_sse42)
149+
os << "[HW] SSE4.2: yes (no AVX)\n";
150+
else
151+
os << "[HW] SIMD: baseline only\n";
152+
153+
os << "[HW] OpenMP: "
154+
<< (has_openmp ? std::to_string(openmp_max_threads) + " threads" : "disabled")
155+
<< "\n";
156+
157+
os << "[HW] Eigen: " << (has_eigen ? "yes" : "no") << "\n";
158+
159+
return os.str();
160+
}
161+
162+
// Returns the process-wide capability snapshot. The probes run once, inside
// the initialiser of a function-local static, in the order x86 SIMD, OpenMP,
// Eigen, CUDA, Metal; every later call returns the same object.
const HardwareCapabilities& detect_hardware() {
    using Probe = void (*)(HardwareCapabilities&);
    static const Probe kProbes[] = {
        detect_x86_simd,
        detect_openmp,
        detect_eigen,
        detect_cuda,
        detect_metal,
    };

    static HardwareCapabilities cached = [] {
        HardwareCapabilities result;
        for (const Probe probe : kProbes) {
            probe(result);
        }
        return result;
    }();
    return cached;
}
174+
175+
} // namespace flexaids

LIB/hardware_detect.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// hardware_detect.h — Runtime hardware capability detection for FlexAIDdS
2+
//
3+
// Probes the execution environment at runtime to determine available
4+
// acceleration backends (CUDA, Metal, AVX-512, AVX2, OpenMP) and their
5+
// properties. The result is cached after the first call.
6+
//
7+
// Usage:
8+
// const auto& hw = flexaids::detect_hardware();
9+
// if (hw.has_avx512) { /* use 512-bit SIMD path */ }
10+
//
11+
// Apache-2.0 © 2026 Le Bonhomme Pharma
12+
#pragma once
13+
14+
#include <string>
15+
#include <cstdint>
16+
17+
namespace flexaids {
18+
19+
// Snapshot of the acceleration backends and CPU features found by
// detect_hardware(). Every flag defaults to "absent", so a backend that is
// compiled out, or whose probe fails, simply stays false.
20+
struct HardwareCapabilities {
21+
// ── GPU: CUDA ── (populated only in builds that define FLEXAIDS_USE_CUDA)
22+
bool has_cuda = false; // true when the CUDA runtime reports at least one device
23+
int cuda_device_count = 0;
24+
std::string cuda_device_name; // device 0 only, e.g. "NVIDIA GeForce RTX 4090"
25+
std::string cuda_arch; // device 0, "sm_" + (major*10 + minor), e.g. "sm_89"
26+
int cuda_sm_major = 0;
27+
int cuda_sm_minor = 0;
28+
std::size_t cuda_global_mem = 0; // bytes (device 0 total global memory)
29+
30+
// ── GPU: Metal (Apple) ── (set from the FLEXAIDS_USE_METAL build flag, not probed)
31+
bool has_metal = false;
32+
std::string metal_gpu_name; // currently the fixed placeholder "Apple GPU (Metal-capable)"
33+
34+
// ── SIMD: x86-64 ── (filled from CPUID; all remain false on non-x86 builds)
35+
bool has_sse42 = false;
36+
bool has_avx2 = false;
37+
bool has_fma = false;
38+
bool has_avx512f = false; // foundation
39+
bool has_avx512dq = false; // doubleword/quadword
40+
bool has_avx512bw = false; // byte/word
41+
bool has_avx512vnni = false; // Vector Neural Network Instructions
42+
43+
// Convenience composite (VNNI is not part of it)
44+
bool has_avx512 = false; // avx512f && avx512dq && avx512bw
45+
46+
// ── OpenMP ── (compile-time: whether the detection code was built with _OPENMP)
47+
bool has_openmp = false;
48+
int openmp_max_threads = 1; // omp_get_max_threads() at detection time; 1 without OpenMP
49+
50+
// ── Eigen ── (compile-time: whether FLEXAIDS_HAS_EIGEN was defined)
51+
bool has_eigen = false;
52+
53+
// Human-readable multi-line summary, each line prefixed with "[HW]"
54+
std::string summary() const;
55+
};
56+
57+
// Detect hardware capabilities (cached after first call).
58+
const HardwareCapabilities& detect_hardware();
59+
60+
} // namespace flexaids

0 commit comments

Comments
 (0)